commit d3dd9d3e5678e301ea75e1522bf8a9dcae10b556
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Fri Oct 26 17:04:48 2012 -0400
Document what exactly is sanitized and how that is done.
Also remove a variable that Eclipse identifies as not being used.
---
src/org/torproject/webstats/Main.java | 14 ++++++++++++--
1 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java
index 3be2c92..92732cd 100644
--- a/src/org/torproject/webstats/Main.java
+++ b/src/org/torproject/webstats/Main.java
@@ -10,7 +10,18 @@ import org.apache.commons.compress.compressors.gzip.*;
/*
* Sanitize Apache web logs by removing all potentially sensitive parts.
*
- * TODO Document what exactly is sanitized and how that is done.
+ * The following sanitizing steps are performed on input web logs:
+ * 1. Discard all lines that are not in the Combined Log Format.
+ * 2. Discard all lines with other hosts than '0.0.0.0' or '0.0.0.1'.
+ * 3. Discard all lines with other methods than GET.
+ * 4. Discard all lines with other protocols than HTTP.
+ * 5. Discard all lines with status code 404.
+ * 6. Override client with '-'.
+ * 7. Override user with '-'.
+ * 8. Override time with '00:00:00 +0000'.
+ * 9. Override referer (sic!) with '"-"'.
+ * 10. Override user agent with '"-"'.
+ * 11. Truncate resource at the first '?' character.
*
* The main operation is to parse Apache web log files from the in/
* directory and write sanitized web log files to the out/ directory.
@@ -141,7 +152,6 @@ public class Main {
/* Define file and directory names. */
private static File inDirectory = new File("in");
private static File outDirectory = new File("out");
- private static File tempDirectory = new File("temp");
private static File stateLockFile = new File("state/lock");
private static File stateInHistoryFile = new File("state/in-history");
private static File stateInHistoryNewFile =