commit 93c48ff3286c9db082973bdf4a7b10fc8edcc8ee Author: Sebastian Hahn sebastian@torproject.org Date: Wed Nov 18 16:31:30 2015 +0100
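Throw away plain-HTTP (host 0.0.0.0) lines; also discard status-400 lines and skip lines that fail validation instead of exiting --- src/sanitize.py | 14 +++++++++----- src/treat_new_logs.sh | 8 ++++++-- 2 files changed, 15 insertions(+), 7 deletions(-)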
diff --git a/src/sanitize.py b/src/sanitize.py index 0c5de19..d6baf41 100755 --- a/src/sanitize.py +++ b/src/sanitize.py @@ -6,7 +6,7 @@ The following sanitizing steps are performed on data read from stdin: 2. Die if other hosts than '0.0.0.0' or '0.0.0.1' are specified. 3. Discard all lines with other methods than GET. 4. Die if a protocol other than HTTP is used. - 5. Discard all lines with status code 404. + 5. Discard all lines with status code 400 or 404. 6. Override client with '-'. 7. Override user with '-'. 8. Override time with '00:00:00 +0000'. @@ -46,7 +46,7 @@ if matched is None: sys.exit(1) today = dateutil.parser.parse(matched.group(1))
-is_valid_regex = re.compile(r'^0.0.0.([01]) - - [(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 +0000] "([^ ]*) ([^ ?]*[?]?|)[^ ]* HTTP([^"]*)" (-|\d*) (-|\d*) "([^"]|\|")*" "([^"]|")*" .*[^ ]$') +is_valid_regex = re.compile(r'^0.0.0.([01]) - - [(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 +0000] "([^ ]*) ([^ ?]*[?]?|).* HTTP([^"]*)" (-|\d*) (-|\d*) "([^"]|\|")*" "([^"]|")*" .*[^ ]$') sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n' day_before = today - datetime.timedelta(days=1)
@@ -59,18 +59,22 @@ with open(yesterday_fname, 'a') as file_old: matched = is_valid_regex.match(line) if matched is None: print(line, "Last line does not match critera", file=sys.stderr) - sys.exit(1) + continue date = dateutil.parser.parse(matched.group(2)) if today != date and day_before != date: print(line, "Last line does not match date constraints. today:", today, " day before:", day_before, " date:", date, file=sys.stderr) - sys.exit(1) + continue requesttype = matched.group(4) if requesttype != "GET" and requesttype != "HEAD": #print(matched.expand(sanitized_regex), file=sys.stderr, end="") continue
- if matched.group(7) == "404": + if matched.group(7) == "404" or matched.group(7) == "400": + #print(matched.expand(sanitized_regex), file=sys.stderr, end="") + continue + + if matched.group(1) == "0": #print(matched.expand(sanitized_regex), file=sys.stderr, end="") continue
diff --git a/src/treat_new_logs.sh b/src/treat_new_logs.sh index 147ebec..144eec3 100755 --- a/src/treat_new_logs.sh +++ b/src/treat_new_logs.sh @@ -8,6 +8,8 @@ SCRIPTDIR="${BASEDIR}/bin/"
BASEINCOMINGDIR="${BASEDIR}/incoming/"
+INTERESTING_HOSTS="www.torproject.org dist.torproject.org" + cd "${BASEINCOMINGDIR}" for host in *; do INCOMINGDIR="${BASEINCOMINGDIR}/${host}/" @@ -30,12 +32,12 @@ for host in *; do cp "${INCOMINGDIR}/${file}" "${WORKDIR}/${file}" cd "${WORKDIR}" gunzip ${file} - COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}") + COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}" 2>>"${WORKDIR}/errors") COMPLETED_BASE=$(basename $COMPLETED) COMPLETED_BASE=${COMPLETED_BASE%_sanitized} sort "${COMPLETED}" > "${COMPLETED}_sorted" xz -ck9e "${COMPLETED}_sorted" > "${OUTDIR}/${COMPLETED_BASE}.xz" - mv "${COMPLETED}_sorted" "${WORKDIR_AWSTATS}" + mv "${OUTDIR}/${COMPLETED_BASE}.xz" "${WORKDIR_AWSTATS}"
rm "${WORKDIR}/${basefile}" rm "${WORKDIR}/${COMPLETED_BASE}_sanitized" @@ -43,3 +45,5 @@ for host in *; do done done
+# Now that we have all output files, process them with awstats +