commit d3dd9d3e5678e301ea75e1522bf8a9dcae10b556 Author: Karsten Loesing karsten.loesing@gmx.net Date: Fri Oct 26 17:04:48 2012 -0400
Document what exactly is sanitized and how that is done.
Also remove a variable that Eclipse identifies as not being used. --- src/org/torproject/webstats/Main.java | 14 ++++++++++++-- 1 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java index 3be2c92..92732cd 100644 --- a/src/org/torproject/webstats/Main.java +++ b/src/org/torproject/webstats/Main.java @@ -10,7 +10,18 @@ import org.apache.commons.compress.compressors.gzip.*; /* * Sanitize Apache web logs by removing all potentially sensitive parts. * - * TODO Document what exactly is sanitized and how that is done. + * The following sanitizing steps are performed on input web logs: + * 1. Discard all lines that are not in the Combined Log Format. + * 2. Discard all lines with other hosts than '0.0.0.0' or '0.0.0.1'. + * 3. Discard all lines with other methods than GET. + * 4. Discard all lines with other protocols than HTTP. + * 5. Discard all lines with status code 404. + * 6. Override client with '-'. + * 7. Override user with '-'. + * 8. Override time with '00:00:00 +0000'. + * 9. Override referer (sic!) with '"-"'. + * 10. Override user agent with '"-"'. + * 11. Truncate resource at the first '?' character. * * The main operation is to parse Apache web log files from the in/ * directory and write sanitized web log files to the out/ directory. @@ -141,7 +152,6 @@ public class Main { /* Define file and directory names. */ private static File inDirectory = new File("in"); private static File outDirectory = new File("out"); - private static File tempDirectory = new File("temp"); private static File stateLockFile = new File("state/lock"); private static File stateInHistoryFile = new File("state/in-history"); private static File stateInHistoryNewFile =