[tor-commits] [webstats/master] Throw away http lines

sebastian at torproject.org sebastian at torproject.org
Tue Mar 22 05:12:17 UTC 2016


commit 93c48ff3286c9db082973bdf4a7b10fc8edcc8ee
Author: Sebastian Hahn <sebastian at torproject.org>
Date:   Wed Nov 18 16:31:30 2015 +0100

    Throw away http lines
---
 src/sanitize.py       | 14 +++++++++-----
 src/treat_new_logs.sh |  8 ++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/sanitize.py b/src/sanitize.py
index 0c5de19..d6baf41 100755
--- a/src/sanitize.py
+++ b/src/sanitize.py
@@ -6,7 +6,7 @@ The following sanitizing steps are performed on data read from stdin:
   2. Die if other hosts than '0.0.0.0' or '0.0.0.1' are specified.
   3. Discard all lines with other methods than GET.
   4. Die if a protocol other than HTTP is used.
-  5. Discard all lines with status code 404.
+  5. Discard all lines with status code 400 or 404.
   6. Override client with '-'.
   7. Override user with '-'.
   8. Override time with '00:00:00 +0000'.
@@ -46,7 +46,7 @@ if matched is None:
     sys.exit(1)
 today = dateutil.parser.parse(matched.group(1))
 
-is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|)[^ ]* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
+is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|).* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
 sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n'
 day_before = today - datetime.timedelta(days=1)
 
@@ -59,18 +59,22 @@ with open(yesterday_fname, 'a') as file_old:
             matched = is_valid_regex.match(line)
             if matched is None:
                 print(line, "Last line does not match critera", file=sys.stderr)
-                sys.exit(1)
+                continue
             date = dateutil.parser.parse(matched.group(2))
             if today != date and day_before != date:
                 print(line, "Last line does not match date constraints. today:", today,
                         " day before:", day_before, " date:", date, file=sys.stderr)
-                sys.exit(1)
+                continue
             requesttype = matched.group(4)
             if requesttype != "GET" and requesttype != "HEAD":
                 #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
                 continue
 
-            if matched.group(7) == "404":
+            if matched.group(7) == "404" or matched.group(7) == "400":
+                #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
+                continue
+
+            if matched.group(1) == "0":
                 #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
                 continue
 
diff --git a/src/treat_new_logs.sh b/src/treat_new_logs.sh
index 147ebec..144eec3 100755
--- a/src/treat_new_logs.sh
+++ b/src/treat_new_logs.sh
@@ -8,6 +8,8 @@ SCRIPTDIR="${BASEDIR}/bin/"
 
 BASEINCOMINGDIR="${BASEDIR}/incoming/"
 
+INTERESTING_HOSTS="www.torproject.org dist.torproject.org"
+
 cd "${BASEINCOMINGDIR}"
 for host in *; do
     INCOMINGDIR="${BASEINCOMINGDIR}/${host}/"
@@ -30,12 +32,12 @@ for host in *; do
         cp "${INCOMINGDIR}/${file}" "${WORKDIR}/${file}"
         cd "${WORKDIR}"
         gunzip ${file}
-        COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}")
+        COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}" 2>>"${WORKDIR}/errors")
         COMPLETED_BASE=$(basename $COMPLETED)
         COMPLETED_BASE=${COMPLETED_BASE%_sanitized}
         sort "${COMPLETED}" > "${COMPLETED}_sorted"
         xz -ck9e "${COMPLETED}_sorted" > "${OUTDIR}/${COMPLETED_BASE}.xz"
-        mv "${COMPLETED}_sorted" "${WORKDIR_AWSTATS}"
+        mv "${OUTDIR}/${COMPLETED_BASE}.xz" "${WORKDIR_AWSTATS}"
 
         rm "${WORKDIR}/${basefile}"
         rm "${WORKDIR}/${COMPLETED_BASE}_sanitized"
@@ -43,3 +45,5 @@ for host in *; do
     done
 done
 
+# Now that we have all output files, process them with awstats
+





More information about the tor-commits mailing list