commit edddf0983d668ecd8c27700a83cb7e4d64255c07
Author: Sebastian Hahn <sebastian(a)torproject.org>
Date: Fri Nov 13 06:41:57 2015 +0100
Start script to extract logs
---
bin/receive-log.sh | 47 ---
bin/send-logs.sh | 73 -----
build.xml | 36 --
src/org/torproject/webstats/Main.java | 595 ----------------------------------
src/sanitize.py | 82 +++++
src/treat_new_logs.sh | 45 +++
6 files changed, 127 insertions(+), 751 deletions(-)
diff --git a/bin/receive-log.sh b/bin/receive-log.sh
deleted file mode 100755
index 6f98eaa..0000000
--- a/bin/receive-log.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-# Copyright 2011 The Tor Project
-# See LICENSE for licensing information
-#
-# Read a binary file from stdin and store it to disk.
-
-set -e
-set -u
-
-# Read remote host name from local command-line argument.
-rhost="$1"
-if [ -z "$rhost" ]; then
- echo "Missing or illegal host name in authorized_keys."
- exit 1
-fi
-
-# Read remote log file name from SSH command-line arguments.
-rfile="${SSH_ORIGINAL_COMMAND:-}"
-if ! [[ "$rfile" =~ ^[a-zA-Z0-9.-]+$ ]]; then
- echo "Missing or illegal file name in SSH command."
- exit 1
-fi
-
-# Create directories for this remote host if it doesn't exist.
-ldir="in/$rhost"
-if ! [ -d "in" ]; then
- mkdir "in"
-fi
-if ! [ -d "$ldir" ]; then
- mkdir "$ldir"
-fi
-
-# Make sure the file doesn't exist yet.
-lfile="$ldir/$rfile"
-if [[ -f "$lfile" ]]; then
- echo "File already exists."
- exit 1
-fi
-
-# Write file from stdin to temporary file.
-tfile="$ldir/.$rfile"
-cat > "$tfile"
-
-# Move file from temp/ to in/ directory.
-mv "$tfile" "$lfile"
-
diff --git a/bin/send-logs.sh b/bin/send-logs.sh
deleted file mode 100755
index 77e7442..0000000
--- a/bin/send-logs.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-
-# Copyright 2011 The Tor Project
-# See LICENSE for licensing information
-#
-# Send new gzip'ed Apache web log files via ssh to a remote server.
-
-set -e
-set -u
-
-usage() {
- echo "Usage: $0 logs-dir ssh-string state-dir"
- exit 1
-}
-
-# Check command-line arguments.
-if [ "$#" != 3 ]; then
- usage
-fi
-ldir="$1"
-sshstring="$2"
-tdir="$3"
-if ! [ -d "$ldir" ]; then
- echo "Directory '$1' does not exist."
- usage
-fi
-
-# Create the local state directory that has empty files for every log file
-# that we sent to the remote server.
-if ! [ -d "$tdir" ]; then
- mkdir "$tdir"
-fi
-
-# Iterate over files with file names containing access.log and ending in
-# .gz and send the ones we haven't sent before to the remote server.
-for i in $(find "$ldir" -maxdepth 1 -type f -name '*access.log*.gz'); do
-
- # Check that the file exists. This works around issues like filenames
- # containing spaces.
- if ! [ -e $i ]; then
- echo "File '$i' does not exist."
- continue
- fi
-
- # Extract the file name part to pass it as SSH parameter.
- fname="$(basename $i)"
-
- # Warn if file names contain illegal characters that the receiver would
- # not accept.
- if ! [[ $fname =~ ^[a-zA-Z0-9.-]+$ ]]; then
- echo "Illegal file name '$fname'."
- continue
- fi
-
- # Copy the file content to the remote server if we haven't sent it
- # before.
- tfile="$tdir/$fname"
- if ! [ -f "$tfile" ]; then
- ssh -o PreferredAuthentications=publickey $sshstring "$fname" < "$i"
- fi
-
- # Add a state file to note that we don't attempt to send this file in
- # the next execution.
- touch "$tfile"
-
-done
-
-# Delete all state files for which there are no log files anymore.
-for i in $(find "$tdir" -type f); do
- fname="$(basename $i)"
- [ -e "$ldir/$fname" ] || rm -f "$tdir/$fname"
-done
-
diff --git a/build.xml b/build.xml
deleted file mode 100644
index 7f0b430..0000000
--- a/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<project default="run" name="webstats" basedir=".">
- <property name="sources" value="src"/>
- <property name="classes" value="classes"/>
- <property name="libs" value="lib"/>
- <path id="classpath">
- <pathelement path="${classes}"/>
- <pathelement location="${libs}/commons-compress-1.0.jar"/>
- </path>
- <target name="init">
- <mkdir dir="${classes}"/>
- </target>
- <target name="compile"
- depends="init">
- <javac destdir="${classes}"
- srcdir="${sources}"
- source="1.5"
- target="1.5"
- debug="true"
- deprecation="true"
- optimize="false"
- failonerror="true"
- includeantruntime="false">
- <classpath>
- <fileset dir="${libs}"/>
- </classpath>
- </javac>
- </target>
- <target name="run" depends="compile">
- <java fork="true"
- maxmemory="512m"
- classname="org.torproject.webstats.Main">
- <classpath refid="classpath"/>
- </java>
- </target>
-</project>
-
diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java
deleted file mode 100644
index 378093e..0000000
--- a/src/org/torproject/webstats/Main.java
+++ /dev/null
@@ -1,595 +0,0 @@
-package org.torproject.webstats;
-
-import java.io.*;
-import java.text.*;
-import java.util.*;
-import java.util.regex.*;
-
-import org.apache.commons.compress.compressors.gzip.*;
-
-/*
- * Sanitize Apache web logs by removing all potentially sensitive parts.
- *
- * The following sanitizing steps are performed on input web logs:
- * 1. Discard all lines that are not in the Combined Log Format.
- * 2. Discard all lines with other hosts than '0.0.0.0' or '0.0.0.1'.
- * 3. Discard all lines with other methods than GET.
- * 4. Discard all lines with other protocols than HTTP.
- * 5. Discard all lines with status code 404.
- * 6. Override client with '-'.
- * 7. Override user with '-'.
- * 8. Override time with '00:00:00 +0000'.
- * 9. Override referer (sic!) with '"-"'.
- * 10. Override user agent with '"-"'.
- * 11. Truncate resource at the first '?' character.
- *
- * The main operation is to parse Apache web log files from the in/
- * directory and write sanitized web log files to the out/ directory.
- * Files in the in/ directory are assumed to never change and may be
- * deleted after processing by this program. Files in the out/ directory
- * are guaranteed to never change and may be deleted by a subsequently
- * running program.
- *
- * This program uses a couple of state files to make sure that files in
- * in/ are not parsed more than once and that files in out/ do not need to
- * be changed:
- * - state/lock prevents concurrent executions of this program.
- * - state/in-history contains file names of previously read and possibly
- * deleted files in the in/ directory.
- * - state/in-history.new is the file written in the current execution
- * that will replace state/in-history during the execution.
- * - state/temp/ contains new or updated output files parsed in the
- * current execution that are moved to out/, state/full/, or state/diff/
- * during the execution.
- * - state/out-history contains file names of previously written and
- * possibly deleted files in the out/ directory.
- * - state/out-history.new is the file written in the current execution
- * that will replace state/out-history at the end of the execution.
- * - state/full/ contains complete output files that may or may not be
- * newer than files in the out/ directory.
- * - state/diff/ contains new parts for files in the out/ directory which
- * have been deleted.
- *
- * The phases and steps taken by this program are as follows:
- *
- * Phase I: Read files from in/, sanitize them, and write them to state/.
- * 1. Check that state/lock does not exists, or exit immediately. Add a
- * new state/lock file.
- * 2. Read the contents from state/in-history and the directory listing
- * of in/ to memory.
- * 3. For each file in in/:
- * a. Append the file name to state/in-history.new if it was not
- * contained in state/in-history. If it was contained, skip the
- * file.
- * b. Parse and sanitize the file in chunks of 250,000 lines to reduce
- * writes.
- * c. When writing sanitized chunks to output files, for each output
- * file, check in the following order if there is already such a
- * file in
- * i. state/temp/,
- * ii. state/full/,
- * iii. out/, or
- * iv. state/diff/.
- * If there's such a file, merge the newly sanitized lines into
- * that file and write the sorted result to state/temp/.
- * 4. Rename state/in-history to state/in-history.old and rename
- * state/in-history.new to state/in-history. Delete
- * state/in-history.old.
- *
- * Phase II: Move files that won't change anymore from state/ to out/.
- * 5. Read the contents from state/out-history to memory.
- * 6. For each file in state/temp/:
- * a. Check if there's a corresponding line in state/out-history. If
- * so, check whether there is a file in state/full/ or out/. If
- * so, move the file to state/full/. Otherwise move the file to
- * state/diff/. Print out a warning that there is a more recent
- * file available.
- * b. If a. does not apply and the sanitized log is less than four (4)
- * days old, move the file to state/full/.
- * c. If b. does not apply, append a line to out-history.new and move
- * the file to out/.
- * 7. For each file in state/full/, check whether the sanitized log is at
- * least four (4) days old and not contained in state/out-history. If
- * so, append a line to out-history.new and move the file to out/.
- * 8. Rename state/out-history to state/out-history.old and rename
- * state/out-history.new to state/out-history. Delete
- * state/out-history.old.
- * 9. Delete state/lock and exit.
- *
- * If the program is interrupted and leaves a lock file in state/lock, it
- * requires an operator to fix the state/ directory and make it work
- * again. IMPORTANT: DO NOT CHANGE ANYTHING IN THE state/ DIRECTORY
- * UNLESS YOU'RE CERTAIN WHAT YOU'RE DOING! The following situations can
- * happen. It may make sense to try a solution in a test environment
- * first:
- * A. The file state/in-history.new does not exist and there are no files
- * in state/temp/. The process died before step 3, that is before
- * actually doing anything of phase I. Delete state/lock and re-run
- * the program.
- * B. The file state/in-history.new exists and there are files in
- * state/temp/. The process died during steps 3 or 4, that is, during
- * phase I. Delete all files in state/temp/. If state/in-history
- * does not exist but state/in-history.old does exist, rename the
- * latter to the former. Delete state/lock and re-run the program.
- * C. The file state/in-history.new does not exist, but there are files
- * in state/temp/. The process died after step 4, that is during
- * phase II. Run the steps 5 to 9 manually. Then re-run the program.
- *
- * Whenever logs are parsed that are 4 days old or older, there may
- * already be output files in out/ that cannot be modified anymore. The
- * operator may decide to manually overwrite files in out/ with the files
- * in state/full/ or state/diff/. IMPORTANT: ONLY OVERWRITE FILES IN out/
- * IF YOU'RE CERTAIN HOW TO FIX THE PROGRAM THAT PARSES THESE FILES.
- * There are two possible situations:
- * A. There is a file in state/full/. This file is newer than the file
- * with the same name in out/ and contains everything from that file,
- * too. It's okay to overwrite the file in out/ with the file in
- * state/full/ and delete the file in state/full/.
- * B. There is a file in state/diff/. The file in out/ didn't exist
- * anymore when parsing more log lines for it. The file that was in
- * out/ should be located and merged with the file in state/diff/.
- * Afterwards, the file in state/diff/ must be deleted.
- */
-public class Main {
-
- /* Run the steps described above. */
- public static void main(String[] args) {
-
- /* Phase I */
- checkAndCreateLockFile(); /* Step 1 */
- readInHistoryFile(); /* Step 2 */
- readInDirectoryListing();
- for (File inFile : inFiles) { /* Step 3 */
- appendToInHistoryIfNotContained(inFile);
- if (!checkFileName(inFile) || checkParsedBefore(inFile)) {
- continue;
- }
- sanitizeInFile(inFile);
- }
- overwriteInHistoryFile(); /* Step 4 */
-
- /* Phase II */
- readOutHistoryFile(); /* Step 5 */
- for (String outputFileName : updatedOutputFiles) { /* Step 6 */
- moveOutputFile(outputFileName);
- }
- moveFullFilesToOut(); /* Step 7 */
- overwriteOutHistoryFile(); /* Step 8 */
- deleteLockFile(); /* Step 9 */
- }
-
- /* Define file and directory names. */
- private static File inDirectory = new File("in");
- private static File outDirectory = new File("out");
- private static File stateLockFile = new File("state/lock");
- private static File stateInHistoryFile = new File("state/in-history");
- private static File stateInHistoryNewFile =
- new File("state/in-history.new");
- private static File stateInHistoryOldFile =
- new File("state/in-history.old");
- private static File stateOutHistoryFile = new File("state/out-history");
- private static File stateOutHistoryNewFile =
- new File("state/out-history.new");
- private static File stateOutHistoryOldFile =
- new File("state/out-history.old");
- private static File stateDiffDirectory = new File("state/diff");
- private static String stateFullDirectoryString = "state/full";
- private static File stateFullDirectory = new File("state/full");
- private static File stateTempDirectory = new File("state/temp");
-
- /* Define data structures and helper classes. */
- private static Set<String> inHistoryFiles;
- private static Set<String> inHistoryNewFiles;
- private static Set<String> outHistoryFiles;
- private static Set<File> inFiles;
- private static Map<String, List<String>> cachedLinesPerOutputFile =
- new HashMap<String, List<String>>();
- private static int cachedLines = 0;
- private static Set<String> updatedOutputFiles = new HashSet<String>();
- private static SimpleDateFormat outputFileFormat =
- new SimpleDateFormat("yyyy/MM/dd/");
- private static SimpleDateFormat logDateFormat =
- new SimpleDateFormat("dd/MMM/yyyy");
- static {
- outputFileFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
- logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
- }
- private static Pattern logLinePattern = Pattern.compile(
- "(0.0.0.[01]) \\S+ \\S+ \\[([\\w/]{11})[\\d:]+\\s[+\\-]\\d{4}\\] "
- + "\"GET (\\S+?) (HTTP/[\\d\\.]+)\" (\\d{3}) ([\\d-]+) \"[^\"]+\" "
- + "\"[^\"]+\"");
- private static long now = System.currentTimeMillis();
-
- /* Implement the substeps. */
- private static void checkAndCreateLockFile() {
- if (stateLockFile.exists()) {
- System.err.println("Lock file '" + stateLockFile.getAbsolutePath()
- + "' exists. This means that a previous run did not exit "
- + "cleanly. Exiting.");
- System.exit(1);
- }
- try {
- stateLockFile.getParentFile().mkdirs();
- BufferedWriter bw = new BufferedWriter(new FileWriter(
- stateLockFile));
- bw.close();
- } catch (IOException e) {
- e.printStackTrace();
- System.err.println("Could not create lock file '"
- + stateLockFile.getAbsolutePath() + "'. Exiting.");
- System.exit(1);
- }
- }
- private static void readInHistoryFile() {
- inHistoryFiles = readAndCopyHistoryFile(stateInHistoryFile,
- stateInHistoryNewFile);
- inHistoryNewFiles = new HashSet<String>(inHistoryFiles);
- }
- private static void readOutHistoryFile() {
- outHistoryFiles = readAndCopyHistoryFile(stateOutHistoryFile,
- stateOutHistoryNewFile);
- }
- private static Set<String> readAndCopyHistoryFile(File historyFile,
- File historyNewFile) {
- Set<String> result = new HashSet<String>();
- try {
- BufferedWriter bw = new BufferedWriter(new FileWriter(
- historyNewFile));
- if (historyFile.exists()) {
- BufferedReader br = new BufferedReader(new FileReader(
- historyFile));
- String line;
- while ((line = br.readLine()) != null) {
- result.add(line);
- bw.write(line + "\n");
- }
- br.close();
- }
- bw.close();
- } catch (IOException e) {
- e.printStackTrace();
- System.err.println("Could not read parse history file '"
- + historyFile.getAbsolutePath() + "'. Exiting.");
- System.exit(1);
- }
- return result;
- }
- private static void readInDirectoryListing() {
- inFiles = readDirectoryListing(inDirectory);
- }
- private static Set<File> readDirectoryListing(File directory) {
- Set<File> result = new HashSet<File>();
- if (directory.exists()) {
- Stack<File> files = new Stack<File>();
- files.add(directory);
- while (!files.isEmpty()) {
- File file = files.pop();
- if (file.isDirectory()) {
- files.addAll(Arrays.asList(file.listFiles()));
- } else {
- result.add(file);
- }
- }
- }
- return result;
- }
- private static void appendToInHistoryIfNotContained(File inFile) {
- if (!inHistoryNewFiles.contains(inFile.getAbsolutePath())) {
- inHistoryNewFiles.add(inFile.getAbsolutePath());
- String line = inFile.getAbsolutePath();
- appendToHistoryFile(stateInHistoryNewFile, line);
- }
- }
- private static void appendToHistoryFile(File historyFile, String line) {
- try {
- BufferedWriter bw = new BufferedWriter(new FileWriter(historyFile,
- true));
- bw.write(line + "\n");
- bw.close();
- } catch (IOException e) {
- e.printStackTrace();
- System.err.println("Could not append line '" + line + "' to parse "
- + "history file '" + historyFile.getAbsolutePath() + "'. "
- + "Exiting.");
- System.exit(1);
- }
- }
- private static boolean checkFileName(File inFile) {
- return inFile.getName().contains("access.log") &&
- inFile.getName().endsWith(".gz");
- }
- private static boolean checkParsedBefore(File inFile) {
- return inHistoryFiles.contains(inFile.getAbsolutePath());
- }
- private static void sanitizeInFile(File inFile) {
- try {
- BufferedReader br = new BufferedReader(new InputStreamReader(
- new GzipCompressorInputStream(new FileInputStream(inFile))));
- String line = null;
- String outputFilenamePart = inFile.getName().substring(0,
- inFile.getName().indexOf("access.log")) + "access.log";
- while (true) {
- line = br.readLine();
- if (line == null || cachedLines > 250000) {
- writeCachedLines();
- }
- if (line == null) {
- break;
- }
- if (!parseLine(line, outputFilenamePart)) {
- break;
- }
- }
- br.close();
- } catch (IOException e) {
- e.printStackTrace();
- System.out.println("Error while parsing log file '"
- + inFile.getAbsolutePath() + "'. Exiting.");
- System.exit(1);
- }
- }
- private static void writeCachedLines() {
- for (Map.Entry<String, List<String>> e :
- cachedLinesPerOutputFile.entrySet()) {
- String outputFilename = e.getKey();
- List<String> cachedLinesList = e.getValue();
- Collections.sort(cachedLinesList);
- storeOutputFile(outputFilename, cachedLinesList);
- }
- cachedLinesPerOutputFile.clear();
- cachedLines = 0;
- }
- private static void storeOutputFile(String outputFileName,
- List<String> cachedLinesList) {
- String outputGzFileName = outputFileName + ".gz";
- File stateTempFile = new File(stateTempDirectory, outputFileName);
- File stateTempGzFile = new File(stateTempDirectory, outputGzFileName);
- File stateFullFile = new File(stateFullDirectory, outputFileName);
- File stateFullGzFile = new File(stateFullDirectory, outputGzFileName);
- File outFile = new File(outDirectory, outputFileName);
- File outGzFile = new File(outDirectory, outputGzFileName);
- File stateDiffFile = new File(stateDiffDirectory, outputFileName);
- File stateDiffGzFile = new File(stateDiffDirectory, outputGzFileName);
- if (stateTempFile.exists()) {
- updatedOutputFiles.add(outputFileName);
- File stateTempOldFile = new File(stateTempDirectory,
- outputFileName + ".old");
- stateTempFile.renameTo(stateTempOldFile);
- mergeOutputFile(stateTempOldFile, cachedLinesList, stateTempFile);
- stateTempOldFile.delete();
- } else if (stateTempGzFile.exists()) {
- updatedOutputFiles.add(outputGzFileName);
- File stateTempGzOldFile = new File(stateTempDirectory,
- outputGzFileName + ".old");
- stateTempGzFile.renameTo(stateTempGzOldFile);
- mergeOutputFile(stateTempGzOldFile, cachedLinesList,
- stateTempGzFile);
- stateTempGzOldFile.delete();
- } else if (stateFullFile.exists()) {
- updatedOutputFiles.add(outputFileName);
- mergeOutputFile(stateFullFile, cachedLinesList, stateTempFile);
- } else if (stateFullGzFile.exists()) {
- updatedOutputFiles.add(outputGzFileName);
- mergeOutputFile(stateFullGzFile, cachedLinesList, stateTempGzFile);
- } else if (outFile.exists()) {
- updatedOutputFiles.add(outputFileName);
- mergeOutputFile(outFile, cachedLinesList, stateTempFile);
- } else if (outGzFile.exists()) {
- updatedOutputFiles.add(outputGzFileName);
- mergeOutputFile(outGzFile, cachedLinesList, stateTempGzFile);
- } else if (stateDiffFile.exists()) {
- updatedOutputFiles.add(outputFileName);
- mergeOutputFile(stateDiffFile, cachedLinesList, stateTempFile);
- } else if (stateDiffGzFile.exists()) {
- updatedOutputFiles.add(outputGzFileName);
- mergeOutputFile(stateDiffGzFile, cachedLinesList, stateTempGzFile);
- } else {
- updatedOutputFiles.add(outputGzFileName);
- writeNewOutputFile(cachedLinesList, stateTempGzFile);
- }
- }
- private static void mergeOutputFile(File oldOutputFile,
- List<String> cachedLinesList, File newOutputFile) {
- try {
- BufferedReader br;
- if (oldOutputFile.getName().endsWith(".gz")) {
- br = new BufferedReader(new InputStreamReader(
- new GzipCompressorInputStream(new FileInputStream(
- oldOutputFile))));
- } else {
- br = new BufferedReader(new FileReader(oldOutputFile));
- }
- String line;
- newOutputFile.getParentFile().mkdirs();
- BufferedWriter bw;
- if (newOutputFile.getName().endsWith(".gz")) {
- bw = new BufferedWriter(new OutputStreamWriter(
- new GzipCompressorOutputStream(new FileOutputStream(
- newOutputFile))));
- } else {
- bw = new BufferedWriter(new FileWriter(newOutputFile));
- }
- int cachedLinesListPosition = 0,
- totalCachedLines = cachedLinesList.size();
- while ((line = br.readLine()) != null) {
- while (cachedLinesListPosition < totalCachedLines &&
- cachedLinesList.get(cachedLinesListPosition).
- compareTo(line) <= 0) {
- bw.write(cachedLinesList.get(
- cachedLinesListPosition) + "\n");
- cachedLinesListPosition++;
- }
- bw.write(line + "\n");
- }
- while (cachedLinesListPosition < totalCachedLines) {
- bw.write(cachedLinesList.get(cachedLinesListPosition++) + "\n");
- }
- br.close();
- bw.close();
- } catch (IOException e) {
- e.printStackTrace();
- System.err.println("Could not merge old output file '"
- + oldOutputFile.getAbsolutePath() + "' with new log lines and "
- + "write it to '" + newOutputFile.getAbsolutePath()
- + "'. Exiting.");
- System.exit(1);
- }
- }
- private static void writeNewOutputFile(List<String> cachedLinesList,
- File outputFile) {
- try {
- outputFile.getParentFile().mkdirs();
- BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
- new GzipCompressorOutputStream(new FileOutputStream(
- outputFile))));
- for (String cachedLine : cachedLinesList) {
- bw.write(cachedLine + "\n");
- }
- bw.close();
- } catch (IOException e) {
- e.printStackTrace();
- System.err.println("Could not write output file '"
- + outputFile.getAbsolutePath() + "'. Exiting.");
- System.exit(1);
- }
- }
- private static boolean parseLine(String line,
- String outputFilenamePart) {
- Matcher matcher = logLinePattern.matcher(line);
- if (!matcher.matches()) {
- return true;
- }
- String statusCode = matcher.group(5);
- if (statusCode.equals("404")) {
- return true;
- }
- String ipAddress = matcher.group(1);
- String date = matcher.group(2);
- String url = matcher.group(3);
- if (url.contains("?")) {
- url = url.substring(0, url.indexOf("?"));
- }
- String httpVersion = matcher.group(4);
- String returnedBytes = matcher.group(6);
- String sanitizedLine = ipAddress + " - - [" + date
- + ":00:00:00 +0000] \"GET " + url + " " + httpVersion
- + "\" " + statusCode + " " + returnedBytes + " \"-\" \"-\"";
- String outputFilename = null;
- try {
- outputFilename = outputFileFormat.format(logDateFormat.parse(
- date).getTime()) + outputFilenamePart;
- } catch (ParseException e) {
- System.out.println("Error parsing date. Aborting to parse "
- + "this file.");
- return false;
- }
- if (!cachedLinesPerOutputFile.containsKey(outputFilename)) {
- cachedLinesPerOutputFile.put(outputFilename,
- new ArrayList<String>());
- }
- cachedLinesPerOutputFile.get(outputFilename).add(
- sanitizedLine);
- cachedLines++;
- return true;
- }
- private static void overwriteInHistoryFile() {
- stateInHistoryFile.renameTo(stateInHistoryOldFile);
- stateInHistoryNewFile.renameTo(stateInHistoryFile);
- stateInHistoryOldFile.delete();
- }
- private static void moveOutputFile(String outputFileName) {
- File outFile = new File(outDirectory, outputFileName);
- File stateTempFile = new File(stateTempDirectory, outputFileName);
- File stateFullFile = new File(stateFullDirectory, outputFileName);
- File stateDiffFile = new File(stateDiffDirectory, outputFileName);
- long ageInDays = -1L;
- try {
- ageInDays = (now
- - outputFileFormat.parse(outputFileName.substring(0,
- outputFileName.lastIndexOf("/") + 1)).getTime())
- / (24L * 60L * 60L * 1000L);
- } catch (ParseException e) {
- e.printStackTrace();
- System.err.println("Could not parse timestamp from '"
- + outputFileName + "'. Exiting.");
- System.exit(1);
- }
- if (outHistoryFiles.contains(outFile.getAbsolutePath())) {
- if (outFile.exists() || stateFullFile.exists()) {
- System.out.println("Could not write to output file '"
- + outFile.getAbsolutePath() + "', because that file was "
- + "written " + (outFile.exists() ? "" : "and deleted ")
- + "before and we're not supposed to change it anymore. The "
- + "updated file that could replace the output file is '"
- + stateFullFile.getAbsolutePath() + "'.");
- stateFullFile.getParentFile().mkdirs();
- stateTempFile.renameTo(stateFullFile);
- } else {
- System.out.println("Could not write to output file '"
- + outFile.getAbsolutePath() + "', because that file was "
- + "written and deleted before and we're not supposed to "
- + "change it anymore (even if we could). The file "
- + "containing the new lines only is '"
- + stateDiffFile.getAbsolutePath() + "'.");
- stateDiffFile.getParentFile().mkdirs();
- stateTempFile.renameTo(stateDiffFile);
- }
- } else if (ageInDays < 4L) {
- stateFullFile.getParentFile().mkdirs();
- stateTempFile.renameTo(stateFullFile);
- } else {
- outFile.getParentFile().mkdirs();
- String line = outFile.getAbsolutePath();
- appendToHistoryFile(stateOutHistoryNewFile, line);
- stateTempFile.renameTo(outFile);
- }
- }
- private static void moveFullFilesToOut() {
- Stack<String> fileNames = new Stack<String>();
- fileNames.add(stateFullDirectoryString);
- while (!fileNames.isEmpty()) {
- String fileName = fileNames.pop();
- File fileOrDirectory = new File(fileName);
- if (!fileOrDirectory.exists()) {
- continue;
- } else if (fileOrDirectory.isDirectory()) {
- for (File file : fileOrDirectory.listFiles()) {
- fileNames.add(fileName + "/" + file.getName());
- }
- } else {
- String outputFileName = fileName.substring(
- (stateFullDirectoryString + "/").length());
- File outFile = new File(outDirectory, outputFileName);
- File stateFullFile = new File(stateFullDirectory, outputFileName);
- long ageInDays = -1L;
- try {
- ageInDays = (now
- - outputFileFormat.parse(outputFileName.substring(0,
- outputFileName.lastIndexOf("/") + 1)).getTime())
- / (24L * 60L * 60L * 1000L);
- } catch (ParseException e) {
- e.printStackTrace();
- System.err.println("Could not parse timestamp from '"
- + outputFileName + "'. Exiting.");
- System.exit(1);
- }
- if (!outHistoryFiles.contains(outFile.getAbsolutePath()) &&
- ageInDays >= 4L) {
- outFile.getParentFile().mkdirs();
- String line = outFile.getAbsolutePath();
- appendToHistoryFile(stateOutHistoryNewFile, line);
- stateFullFile.renameTo(outFile);
- }
- }
- }
- }
- private static void overwriteOutHistoryFile() {
- stateOutHistoryFile.renameTo(stateOutHistoryOldFile);
- stateOutHistoryNewFile.renameTo(stateOutHistoryFile);
- stateOutHistoryOldFile.delete();
- }
- private static void deleteLockFile() {
- stateLockFile.delete();
- }
-}
-
diff --git a/src/sanitize.py b/src/sanitize.py
new file mode 100755
index 0000000..0c5de19
--- /dev/null
+++ b/src/sanitize.py
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+""" Sanitize Apache web logs by removing all potentially sensitive parts.
+
+The following sanitizing steps are performed on data read from INPUT_FILE:
+  1. Die if a line is not in the Apache2 Combined Log Format.
+  2. Die if other hosts than '0.0.0.0' or '0.0.0.1' are specified.
+  3. Discard all lines with other methods than GET or HEAD.
+  4. Die if a protocol other than HTTP is used.
+  5. Discard all lines with status code 404.
+  6. Override client with '-'.
+  7. Override user with '-'.
+  8. Override time with '00:00:00 +0000'.
+  9. Override referer (sic!) with '"-"'.
+ 10. Override user agent with '"-"'.
+ 11. Truncate resource at the first '?' character.
+ 12. Die if no YYYYMMDD date can be extracted from the INPUT_FILE name.
+ 13. Die if a line's date is neither that date nor the day before it.
+
+USAGE: sanitize.py INPUT_FILE OUTPUT_DIR
+
+The main operation is to parse Apache web log files from INPUT_FILE and
+append the sanitized lines to per-day files in OUTPUT_DIR. Discarded lines
+are dropped silently. A nonzero exit code indicates an error; the
+offending line and an error message go to stderr. On success, the name of
+the previous day's output file is printed on stdout.
+
+The input filename is expected to be in the following format:
+  <hostname>.torproject.org-access.log-YYYYMMDD
+"""
+
+from __future__ import print_function
+
+import re
+import fileinput
+import sys
+import dateutil.parser
+import datetime
+
+if len(sys.argv) != 3:
+    sys.exit("USAGE: sanitize.py INPUT_FILE OUTPUT_DIR")
+
+# Extract date from filename (search, not match: hostnames may contain digits)
+date_regex = re.compile(r'([0-9]{8})')
+matched = date_regex.search(sys.argv[1])
+if matched is None:
+    print("Could not extract date from", sys.argv[1], file=sys.stderr)
+    sys.exit(1)
+today = dateutil.parser.parse(matched.group(1))
+
+is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|)[^ ]* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
+sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n'
+day_before = today - datetime.timedelta(days=1)
+
+today_fname = sys.argv[2] + "/" + sys.argv[1] + "_sanitized"
+yesterday_fname = today_fname.replace(matched.group(1), day_before.strftime("%Y%m%d"))
+
+with open(yesterday_fname, 'a') as file_old:
+    with open(today_fname, 'a') as file_new:
+        for line in fileinput.input(sys.argv[1]):
+            matched = is_valid_regex.match(line)
+            if matched is None:
+                print(line, "Last line does not match critera", file=sys.stderr)
+                sys.exit(1)
+            line_date = dateutil.parser.parse(matched.group(2))
+            if today != line_date and day_before != line_date:
+                print(line, "Last line does not match date constraints. today:", today,
+                      " day before:", day_before, " date:", line_date, file=sys.stderr)
+                sys.exit(1)
+            requesttype = matched.group(4)
+            if requesttype != "GET" and requesttype != "HEAD":
+                #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
+                continue
+            if matched.group(7) == "404":
+                #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
+                continue
+
+            if today == line_date:
+                file_new.write(matched.expand(sanitized_regex))
+            else:
+                file_old.write(matched.expand(sanitized_regex))
+
+print(yesterday_fname)
diff --git a/src/treat_new_logs.sh b/src/treat_new_logs.sh
new file mode 100755
index 0000000..147ebec
--- /dev/null
+++ b/src/treat_new_logs.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Sanitize, sort and xz-compress every not-yet-treated log in incoming/<host>/.
+set -u
+set -e
+shopt -s nullglob  # empty directory => zero loop iterations, not a literal '*'
+BASEDIR=/srv/webstats.torproject.org/
+SCRIPTDIR="${BASEDIR}/bin/"
+
+BASEINCOMINGDIR="${BASEDIR}/incoming/"
+
+cd "${BASEINCOMINGDIR}"
+for host in *; do
+  INCOMINGDIR="${BASEINCOMINGDIR}/${host}/"
+  WORKDIR="${BASEDIR}/work/${host}/"
+  WORKDIR_AWSTATS="${BASEDIR}/work_awstats/${host}/"
+  OUTDIR="${BASEDIR}/out/${host}/"
+  STAMPDIR="${BASEDIR}/stamp/${host}/"
+
+  cd "${INCOMINGDIR}"
+  mkdir -p "${WORKDIR}"
+  mkdir -p "${WORKDIR_AWSTATS}"
+  mkdir -p "${OUTDIR}"
+  mkdir -p "${STAMPDIR}"
+
+  for file in *; do
+    basefile=${file%.gz}
+    if [ -e "${STAMPDIR}/${file}_treated" ]; then
+      continue
+    fi
+    cp "${INCOMINGDIR}/${file}" "${WORKDIR}/${file}"
+    cd "${WORKDIR}"
+    gunzip -f -- "${file}"  # -f: overwrite a leftover from an interrupted run
+    COMPLETED=$("${SCRIPTDIR}/sanitize.py" "${basefile}" "${WORKDIR}")
+    COMPLETED_BASE=$(basename "${COMPLETED}")
+    COMPLETED_BASE=${COMPLETED_BASE%_sanitized}
+    sort "${COMPLETED}" > "${COMPLETED}_sorted"
+    xz -ck9e "${COMPLETED}_sorted" > "${OUTDIR}/${COMPLETED_BASE}.xz"
+    mv "${COMPLETED}_sorted" "${WORKDIR_AWSTATS}"
+    # Only the file sanitize.py reported (the previous day's) is removed here.
+    rm "${WORKDIR}/${basefile}"
+    rm "${WORKDIR}/${COMPLETED_BASE}_sanitized"
+    touch "${STAMPDIR}/${file}_treated"
+  done
+done
+