[tor-commits] [webstats/master] Start script to extract logs

sebastian at torproject.org sebastian at torproject.org
Tue Mar 22 05:12:17 UTC 2016


commit edddf0983d668ecd8c27700a83cb7e4d64255c07
Author: Sebastian Hahn <sebastian at torproject.org>
Date:   Fri Nov 13 06:41:57 2015 +0100

    Start script to extract logs
---
 bin/receive-log.sh                    |  47 ---
 bin/send-logs.sh                      |  73 -----
 build.xml                             |  36 --
 src/org/torproject/webstats/Main.java | 595 ----------------------------------
 src/sanitize.py                       |  82 +++++
 src/treat_new_logs.sh                 |  45 +++
 6 files changed, 127 insertions(+), 751 deletions(-)

diff --git a/bin/receive-log.sh b/bin/receive-log.sh
deleted file mode 100755
index 6f98eaa..0000000
--- a/bin/receive-log.sh
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/bin/bash
-
-# Copyright 2011 The Tor Project
-# See LICENSE for licensing information
-#
-# Read a binary file from stdin and store it to disk.
-
-set -e
-set -u
-
-# Read remote host name from local command-line argument.
-rhost="$1"
-if [ -z "$rhost" ]; then
-  echo "Missing or illegal host name in authorized_keys."
-  exit 1
-fi
-
-# Read remote log file name from SSH command-line arguments.
-rfile="${SSH_ORIGINAL_COMMAND:-}"
-if ! [[ "$rfile" =~ ^[a-zA-Z0-9.-]+$ ]]; then
-  echo "Missing or illegal file name in SSH command."
-  exit 1
-fi
-
-# Create directories for this remote host if it doesn't exist.
-ldir="in/$rhost"
-if ! [ -d "in" ]; then
-  mkdir "in"
-fi
-if ! [ -d "$ldir" ]; then
-  mkdir "$ldir"
-fi
-
-# Make sure the file doesn't exist yet.
-lfile="$ldir/$rfile"
-if [[ -f "$lfile" ]]; then
-  echo "File already exists."
-  exit 1
-fi
-
-# Write file from stdin to temporary file.
-tfile="$ldir/.$rfile"
-cat > "$tfile"
-
-# Move file from temp/ to in/ directory.
-mv "$tfile" "$lfile"
-
diff --git a/bin/send-logs.sh b/bin/send-logs.sh
deleted file mode 100755
index 77e7442..0000000
--- a/bin/send-logs.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-
-# Copyright 2011 The Tor Project
-# See LICENSE for licensing information
-#
-# Send new gzip'ed Apache web log files via ssh to a remote server.
-
-set -e
-set -u
-
-usage() {
-  echo "Usage: $0 logs-dir ssh-string state-dir"
-  exit 1
-}
-
-# Check command-line arguments.
-if [ "$#" != 3 ]; then
-  usage
-fi
-ldir="$1"
-sshstring="$2"
-tdir="$3"
-if ! [ -d "$ldir" ]; then
-  echo "Directory '$1' does not exist."
-  usage
-fi
-
-# Create the local state directory that has empty files for every log file
-# that we sent to the remote server.
-if ! [ -d "$tdir" ]; then
-  mkdir "$tdir"
-fi
-
-# Iterate over files with file names containing access.log and ending in
-# .gz and send the ones we haven't sent before to the remote server.
-for i in $(find "$ldir" -maxdepth 1 -type f -name '*access.log*.gz'); do
-
-  # Check that the file exists.  This works around issues like filenames
-  # containing spaces.
-  if ! [ -e $i ]; then
-    echo "File '$i' does not exist."
-    continue
-  fi
-
-  # Extract the file name part to pass it as SSH parameter.
-  fname="$(basename $i)"
-
-  # Warn if file names contain illegal characters that the receiver would
-  # not accept.
-  if ! [[ $fname =~ ^[a-zA-Z0-9.-]+$ ]]; then
-    echo "Illegal file name '$fname'."
-    continue
-  fi
-
-  # Copy the file content to the remote server if we haven't sent it
-  # before.
-  tfile="$tdir/$fname"
-  if ! [ -f "$tfile" ]; then
-    ssh -o PreferredAuthentications=publickey $sshstring "$fname" < "$i"
-  fi
-
-  # Add a state file to note that we don't attempt to send this file in
-  # the next execution.
-  touch "$tfile"
-
-done
-
-# Delete all state files for which there are no log files anymore.
-for i in $(find "$tdir" -type f); do
-  fname="$(basename $i)"
-  [ -e "$ldir/$fname" ] || rm -f "$tdir/$fname"
-done
-
diff --git a/build.xml b/build.xml
deleted file mode 100644
index 7f0b430..0000000
--- a/build.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<project default="run" name="webstats" basedir=".">
-  <property name="sources" value="src"/>
-  <property name="classes" value="classes"/>
-  <property name="libs" value="lib"/>
-  <path id="classpath">
-    <pathelement path="${classes}"/>
-    <pathelement location="${libs}/commons-compress-1.0.jar"/>
-  </path>
-  <target name="init">
-    <mkdir dir="${classes}"/>
-  </target>
-  <target name="compile"
-          depends="init">
-    <javac destdir="${classes}"
-           srcdir="${sources}"
-           source="1.5"
-           target="1.5"
-           debug="true"
-           deprecation="true"
-           optimize="false"
-           failonerror="true"
-           includeantruntime="false">
-      <classpath>
-        <fileset dir="${libs}"/>
-      </classpath>
-    </javac>
-  </target>
-  <target name="run" depends="compile">
-    <java fork="true"
-          maxmemory="512m"
-          classname="org.torproject.webstats.Main">
-      <classpath refid="classpath"/>
-    </java>
-  </target>
-</project>
-
diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java
deleted file mode 100644
index 378093e..0000000
--- a/src/org/torproject/webstats/Main.java
+++ /dev/null
@@ -1,595 +0,0 @@
-package org.torproject.webstats;
-
-import java.io.*;
-import java.text.*;
-import java.util.*;
-import java.util.regex.*;
-
-import org.apache.commons.compress.compressors.gzip.*;
-
-/*
- * Sanitize Apache web logs by removing all potentially sensitive parts.
- *
- * The following sanitizing steps are performed on input web logs:
- *   1. Discard all lines that are not in the Combined Log Format.
- *   2. Discard all lines with other hosts than '0.0.0.0' or '0.0.0.1'.
- *   3. Discard all lines with other methods than GET.
- *   4. Discard all lines with other protocols than HTTP.
- *   5. Discard all lines with status code 404.
- *   6. Override client with '-'.
- *   7. Override user with '-'.
- *   8. Override time with '00:00:00 +0000'.
- *   9. Override referer (sic!) with '"-"'.
- *  10. Override user agent with '"-"'.
- *  11. Truncate resource at the first '?' character.
- *
- * The main operation is to parse Apache web log files from the in/
- * directory and write sanitized web log files to the out/ directory.
- * Files in the in/ directory are assumed to never change and may be
- * deleted after processing by this program.  Files in the out/ directory
- * are guaranteed to never change and may be deleted by a subsequently
- * running program.
- *
- * This program uses a couple of state files to make sure that files in
- * in/ are not parsed more than once and that files in out/ do not need to
- * be changed:
- * - state/lock prevents concurrent executions of this program.
- * - state/in-history contains file names of previously read and possibly
- *   deleted files in the in/ directory.
- * - state/in-history.new is the file written in the current execution
- *   that will replace state/in-history during the execution.
- * - state/temp/ contains new or updated output files parsed in the
- *   current execution that are moved to out/, state/full/, or state/diff/
- *   during the execution.
- * - state/out-history contains file names of previously written and
- *   possibly deleted files in the out/ directory.
- * - state/out-history.new is the file written in the current execution
- *   that will replace state/out-history at the end of the execution.
- * - state/full/ contains complete output files that may or may not be
- *   newer than files in the out/ directory.
- * - state/diff/ contains new parts for files in the out/ directory which
- *   have been deleted.
- *
- * The phases and steps taken by this program are as follows:
- *
- * Phase I: Read files from in/, sanitize them, and write them to state/.
- *  1. Check that state/lock does not exists, or exit immediately.  Add a
- *     new state/lock file.
- *  2. Read the contents from state/in-history and the directory listing
- *     of in/ to memory.
- *  3. For each file in in/:
- *     a. Append the file name to state/in-history.new if it was not
- *        contained in state/in-history.  If it was contained, skip the
- *        file.
- *     b. Parse and sanitize the file in chunks of 250,000 lines to reduce
- *        writes.
- *     c. When writing sanitized chunks to output files, for each output
- *        file, check in the following order if there is already such a
- *        file in
- *          i. state/temp/,
- *         ii. state/full/,
- *        iii. out/, or
- *         iv. state/diff/.
- *        If there's such a file, merge the newly sanitized lines into
- *        that file and write the sorted result to state/temp/.
- *  4. Rename state/in-history to state/in-history.old and rename
- *     state/in-history.new to state/in-history.  Delete
- *     state/in-history.old.
- *
- * Phase II: Move files that won't change anymore from state/ to out/.
- *  5. Read the contents from state/out-history to memory.
- *  6. For each file in state/temp/:
- *     a. Check if there's a corresponding line in state/out-history.  If
- *        so, check whether there is a file in state/full/ or out/.  If
- *        so, move the file to state/full/.  Otherwise move the file to
- *        state/diff/.  Print out a warning that there is a more recent
- *        file available.
- *     b. If a. does not apply and the sanitized log is less than four (4)
- *        days old, move the file to state/full/.
- *     c. If b. does not apply, append a line to out-history.new and move
- *        the file to out/.
- *  7. For each file in state/full/, check whether the sanitized log is at
- *     least four (4) days old and not contained in state/out-history.  If
- *     so, append a line to out-history.new and move the file to out/.
- *  8. Rename state/out-history to state/out-history.old and rename
- *     state/out-history.new to state/out-history.  Delete
- *     state/out-history.old.
- *  9. Delete state/lock and exit.
- *
- * If the program is interrupted and leaves a lock file in state/lock, it
- * requires an operator to fix the state/ directory and make it work
- * again.  IMPORTANT: DO NOT CHANGE ANYTHING IN THE state/ DIRECTORY
- * UNLESS YOU'RE CERTAIN WHAT YOU'RE DOING!  The following situations can
- * happen.  It may make sense to try a solution in a test environment
- * first:
- *  A. The file state/in-history.new does not exist and there are no files
- *     in state/temp/.  The process died before step 3, that is before
- *     actually doing anything of phase I.  Delete state/lock and re-run
- *     the program.
- *  B. The file state/in-history.new exists and there are files in
- *     state/temp/.  The process died during steps 3 or 4, that is, during
- *     phase I.  Delete all files in state/temp/.  If state/in-history
- *     does not exist but state/in-history.old does exist, rename the
- *     latter to the former.  Delete state/lock and re-run the program.
- *  C. The file state/in-history.new does not exist, but there are files
- *     in state/temp/.  The process died after step 4, that is during
- *     phase II.  Run the steps 5 to 9 manually.  Then re-run the program.
- *
- * Whenever logs are parsed that are 4 days old or older, there may
- * already be output files in out/ that cannot be modified anymore.  The
- * operator may decide to manually overwrite files in out/ with the files
- * in state/full/ or state/diff/.  IMPORTANT: ONLY OVERWRITE FILES IN out/
- * IF YOU'RE CERTAIN HOW TO FIX THE PROGRAM THAT PARSES THESE FILES.
- * There are two possible situations:
- *  A. There is a file in state/full/.  This file is newer than the file
- *     with the same name in out/ and contains everything from that file,
- *     too.  It's okay to overwrite the file in out/ with the file in
- *     state/full/ and delete the file in state/full/.
- *  B. There is a file in state/diff/.  The file in out/ didn't exist
- *     anymore when parsing more log lines for it.  The file that was in
- *     out/ should be located and merged with the file in state/diff/.
- *     Afterwards, the file in state/diff/ must be deleted.
- */
-public class Main {
-
-  /* Run the steps described above. */
-  public static void main(String[] args) {
-
-    /* Phase I */
-    checkAndCreateLockFile(); /* Step 1 */
-    readInHistoryFile(); /* Step 2 */
-    readInDirectoryListing();
-    for (File inFile : inFiles) { /* Step 3 */
-      appendToInHistoryIfNotContained(inFile);
-      if (!checkFileName(inFile) || checkParsedBefore(inFile)) {
-        continue;
-      }
-      sanitizeInFile(inFile);
-    }
-    overwriteInHistoryFile(); /* Step 4 */
-
-    /* Phase II */
-    readOutHistoryFile(); /* Step 5 */
-    for (String outputFileName : updatedOutputFiles) { /* Step 6 */
-      moveOutputFile(outputFileName);
-    }
-    moveFullFilesToOut(); /* Step 7 */
-    overwriteOutHistoryFile(); /* Step 8 */
-    deleteLockFile(); /* Step 9 */
-  }
-
-  /* Define file and directory names. */
-  private static File inDirectory = new File("in");
-  private static File outDirectory = new File("out");
-  private static File stateLockFile = new File("state/lock");
-  private static File stateInHistoryFile = new File("state/in-history");
-  private static File stateInHistoryNewFile =
-      new File("state/in-history.new");
-  private static File stateInHistoryOldFile =
-      new File("state/in-history.old");
-  private static File stateOutHistoryFile = new File("state/out-history");
-  private static File stateOutHistoryNewFile =
-      new File("state/out-history.new");
-  private static File stateOutHistoryOldFile =
-      new File("state/out-history.old");
-  private static File stateDiffDirectory = new File("state/diff");
-  private static String stateFullDirectoryString = "state/full";
-  private static File stateFullDirectory = new File("state/full");
-  private static File stateTempDirectory = new File("state/temp");
-
-  /* Define data structures and helper classes. */
-  private static Set<String> inHistoryFiles;
-  private static Set<String> inHistoryNewFiles;
-  private static Set<String> outHistoryFiles;
-  private static Set<File> inFiles;
-  private static Map<String, List<String>> cachedLinesPerOutputFile =
-      new HashMap<String, List<String>>();
-  private static int cachedLines = 0;
-  private static Set<String> updatedOutputFiles = new HashSet<String>();
-  private static SimpleDateFormat outputFileFormat =
-      new SimpleDateFormat("yyyy/MM/dd/");
-  private static SimpleDateFormat logDateFormat =
-      new SimpleDateFormat("dd/MMM/yyyy");
-  static {
-    outputFileFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
-    logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
-  }
-  private static Pattern logLinePattern = Pattern.compile(
-      "(0.0.0.[01]) \\S+ \\S+ \\[([\\w/]{11})[\\d:]+\\s[+\\-]\\d{4}\\] "
-      + "\"GET (\\S+?) (HTTP/[\\d\\.]+)\" (\\d{3}) ([\\d-]+) \"[^\"]+\" "
-      + "\"[^\"]+\"");
-  private static long now = System.currentTimeMillis();
-
-  /* Implement the substeps. */
-  private static void checkAndCreateLockFile() {
-    if (stateLockFile.exists()) {
-      System.err.println("Lock file '" + stateLockFile.getAbsolutePath()
-          + "' exists.  This means that a previous run did not exit "
-          + "cleanly.  Exiting.");
-      System.exit(1);
-    }
-    try {
-      stateLockFile.getParentFile().mkdirs();
-      BufferedWriter bw = new BufferedWriter(new FileWriter(
-          stateLockFile));
-      bw.close();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.err.println("Could not create lock file '"
-          + stateLockFile.getAbsolutePath() + "'.  Exiting.");
-      System.exit(1);
-    }
-  }
-  private static void readInHistoryFile() {
-    inHistoryFiles = readAndCopyHistoryFile(stateInHistoryFile,
-        stateInHistoryNewFile);
-    inHistoryNewFiles = new HashSet<String>(inHistoryFiles);
-  }
-  private static void readOutHistoryFile() {
-    outHistoryFiles = readAndCopyHistoryFile(stateOutHistoryFile,
-        stateOutHistoryNewFile);
-  }
-  private static Set<String> readAndCopyHistoryFile(File historyFile,
-      File historyNewFile) {
-    Set<String> result = new HashSet<String>();
-    try {
-      BufferedWriter bw = new BufferedWriter(new FileWriter(
-          historyNewFile));
-      if (historyFile.exists()) {
-        BufferedReader br = new BufferedReader(new FileReader(
-            historyFile));
-        String line;
-        while ((line = br.readLine()) != null) {
-          result.add(line);
-          bw.write(line + "\n");
-        }
-        br.close();
-      }
-      bw.close();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.err.println("Could not read parse history file '"
-          + historyFile.getAbsolutePath() + "'.  Exiting.");
-      System.exit(1);
-    }
-    return result;
-  }
-  private static void readInDirectoryListing() {
-    inFiles = readDirectoryListing(inDirectory);
-  }
-  private static Set<File> readDirectoryListing(File directory) {
-    Set<File> result = new HashSet<File>();
-    if (directory.exists()) {
-      Stack<File> files = new Stack<File>();
-      files.add(directory);
-      while (!files.isEmpty()) {
-        File file = files.pop();
-        if (file.isDirectory()) {
-          files.addAll(Arrays.asList(file.listFiles()));
-        } else {
-          result.add(file);
-        }
-      }
-    }
-    return result;
-  }
-  private static void appendToInHistoryIfNotContained(File inFile) {
-    if (!inHistoryNewFiles.contains(inFile.getAbsolutePath())) {
-      inHistoryNewFiles.add(inFile.getAbsolutePath());
-      String line = inFile.getAbsolutePath();
-      appendToHistoryFile(stateInHistoryNewFile, line);
-    }
-  }
-  private static void appendToHistoryFile(File historyFile, String line) {
-    try {
-      BufferedWriter bw = new BufferedWriter(new FileWriter(historyFile,
-          true));
-      bw.write(line + "\n");
-      bw.close();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.err.println("Could not append line '" + line + "' to parse "
-          + "history file '" + historyFile.getAbsolutePath() + "'.  "
-          + "Exiting.");
-      System.exit(1);
-    }
-  }
-  private static boolean checkFileName(File inFile) {
-    return inFile.getName().contains("access.log") &&
-        inFile.getName().endsWith(".gz");
-  }
-  private static boolean checkParsedBefore(File inFile) {
-    return inHistoryFiles.contains(inFile.getAbsolutePath());
-  }
-  private static void sanitizeInFile(File inFile) {
-    try {
-      BufferedReader br = new BufferedReader(new InputStreamReader(
-          new GzipCompressorInputStream(new FileInputStream(inFile))));
-      String line = null;
-      String outputFilenamePart = inFile.getName().substring(0,
-          inFile.getName().indexOf("access.log")) + "access.log";
-      while (true) {
-        line = br.readLine();
-        if (line == null || cachedLines > 250000) {
-          writeCachedLines();
-        }
-        if (line == null) {
-          break;
-        }
-        if (!parseLine(line, outputFilenamePart)) {
-          break;
-        }
-      }
-      br.close();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.out.println("Error while parsing log file '"
-          + inFile.getAbsolutePath() + "'.  Exiting.");
-      System.exit(1);
-    }
-  }
-  private static void writeCachedLines() {
-    for (Map.Entry<String, List<String>> e :
-        cachedLinesPerOutputFile.entrySet()) {
-      String outputFilename = e.getKey();
-      List<String> cachedLinesList = e.getValue();
-      Collections.sort(cachedLinesList);
-      storeOutputFile(outputFilename, cachedLinesList);
-    }
-    cachedLinesPerOutputFile.clear();
-    cachedLines = 0;
-  }
-  private static void storeOutputFile(String outputFileName,
-      List<String> cachedLinesList) {
-    String outputGzFileName = outputFileName + ".gz";
-    File stateTempFile = new File(stateTempDirectory, outputFileName);
-    File stateTempGzFile = new File(stateTempDirectory, outputGzFileName);
-    File stateFullFile = new File(stateFullDirectory, outputFileName);
-    File stateFullGzFile = new File(stateFullDirectory, outputGzFileName);
-    File outFile = new File(outDirectory, outputFileName);
-    File outGzFile = new File(outDirectory, outputGzFileName);
-    File stateDiffFile = new File(stateDiffDirectory, outputFileName);
-    File stateDiffGzFile = new File(stateDiffDirectory, outputGzFileName);
-    if (stateTempFile.exists()) {
-      updatedOutputFiles.add(outputFileName);
-      File stateTempOldFile = new File(stateTempDirectory,
-          outputFileName + ".old");
-      stateTempFile.renameTo(stateTempOldFile);
-      mergeOutputFile(stateTempOldFile, cachedLinesList, stateTempFile);
-      stateTempOldFile.delete();
-    } else if (stateTempGzFile.exists()) {
-      updatedOutputFiles.add(outputGzFileName);
-      File stateTempGzOldFile = new File(stateTempDirectory,
-          outputGzFileName + ".old");
-      stateTempGzFile.renameTo(stateTempGzOldFile);
-      mergeOutputFile(stateTempGzOldFile, cachedLinesList,
-          stateTempGzFile);
-      stateTempGzOldFile.delete();
-    } else if (stateFullFile.exists()) {
-      updatedOutputFiles.add(outputFileName);
-      mergeOutputFile(stateFullFile, cachedLinesList, stateTempFile);
-    } else if (stateFullGzFile.exists()) {
-      updatedOutputFiles.add(outputGzFileName);
-      mergeOutputFile(stateFullGzFile, cachedLinesList, stateTempGzFile);
-    } else if (outFile.exists()) {
-      updatedOutputFiles.add(outputFileName);
-      mergeOutputFile(outFile, cachedLinesList, stateTempFile);
-    } else if (outGzFile.exists()) {
-      updatedOutputFiles.add(outputGzFileName);
-      mergeOutputFile(outGzFile, cachedLinesList, stateTempGzFile);
-    } else if (stateDiffFile.exists()) {
-      updatedOutputFiles.add(outputFileName);
-      mergeOutputFile(stateDiffFile, cachedLinesList, stateTempFile);
-    } else if (stateDiffGzFile.exists()) {
-      updatedOutputFiles.add(outputGzFileName);
-      mergeOutputFile(stateDiffGzFile, cachedLinesList, stateTempGzFile);
-    } else {
-      updatedOutputFiles.add(outputGzFileName);
-      writeNewOutputFile(cachedLinesList, stateTempGzFile);
-    }
-  }
-  private static void mergeOutputFile(File oldOutputFile,
-      List<String> cachedLinesList, File newOutputFile) {
-    try {
-      BufferedReader br;
-      if (oldOutputFile.getName().endsWith(".gz")) {
-        br = new BufferedReader(new InputStreamReader(
-            new GzipCompressorInputStream(new FileInputStream(
-            oldOutputFile))));
-      } else {
-        br = new BufferedReader(new FileReader(oldOutputFile));
-      }
-      String line;
-      newOutputFile.getParentFile().mkdirs();
-      BufferedWriter bw;
-      if (newOutputFile.getName().endsWith(".gz")) {
-        bw = new BufferedWriter(new OutputStreamWriter(
-            new GzipCompressorOutputStream(new FileOutputStream(
-            newOutputFile))));
-      } else {
-        bw = new BufferedWriter(new FileWriter(newOutputFile));
-      }
-      int cachedLinesListPosition = 0,
-          totalCachedLines = cachedLinesList.size();
-      while ((line = br.readLine()) != null) {
-        while (cachedLinesListPosition < totalCachedLines &&
-            cachedLinesList.get(cachedLinesListPosition).
-            compareTo(line) <= 0) {
-          bw.write(cachedLinesList.get(
-              cachedLinesListPosition) + "\n");
-          cachedLinesListPosition++;
-        }
-        bw.write(line + "\n");
-      }
-      while (cachedLinesListPosition < totalCachedLines) {
-        bw.write(cachedLinesList.get(cachedLinesListPosition++) + "\n");
-      }
-      br.close();
-      bw.close();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.err.println("Could not merge old output file '"
-          + oldOutputFile.getAbsolutePath() + "' with new log lines and "
-          + "write it to '" + newOutputFile.getAbsolutePath()
-          + "'.  Exiting.");
-      System.exit(1);
-    }
-  }
-  private static void writeNewOutputFile(List<String> cachedLinesList,
-      File outputFile) {
-    try {
-      outputFile.getParentFile().mkdirs();
-      BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
-          new GzipCompressorOutputStream(new FileOutputStream(
-          outputFile))));
-      for (String cachedLine : cachedLinesList) {
-        bw.write(cachedLine + "\n");
-      }
-      bw.close();
-    } catch (IOException e) {
-      e.printStackTrace();
-      System.err.println("Could not write output file '"
-          + outputFile.getAbsolutePath() + "'.  Exiting.");
-      System.exit(1);
-    }
-  }
-  private static boolean parseLine(String line,
-      String outputFilenamePart) {
-    Matcher matcher = logLinePattern.matcher(line);
-    if (!matcher.matches()) {
-      return true;
-    }
-    String statusCode = matcher.group(5);
-    if (statusCode.equals("404")) {
-      return true;
-    }
-    String ipAddress = matcher.group(1);
-    String date = matcher.group(2);
-    String url = matcher.group(3);
-    if (url.contains("?")) {
-      url = url.substring(0, url.indexOf("?"));
-    }
-    String httpVersion = matcher.group(4);
-    String returnedBytes = matcher.group(6);
-    String sanitizedLine = ipAddress + " - - [" + date
-        + ":00:00:00 +0000] \"GET " + url + " " + httpVersion
-        + "\" " + statusCode + " " + returnedBytes + " \"-\" \"-\"";
-    String outputFilename = null;
-    try {
-      outputFilename = outputFileFormat.format(logDateFormat.parse(
-          date).getTime()) + outputFilenamePart;
-    } catch (ParseException e) {
-      System.out.println("Error parsing date.  Aborting to parse "
-          + "this file.");
-      return false;
-    }
-    if (!cachedLinesPerOutputFile.containsKey(outputFilename)) {
-      cachedLinesPerOutputFile.put(outputFilename,
-          new ArrayList<String>());
-    }
-    cachedLinesPerOutputFile.get(outputFilename).add(
-        sanitizedLine);
-    cachedLines++;
-    return true;
-  }
-  private static void overwriteInHistoryFile() {
-    stateInHistoryFile.renameTo(stateInHistoryOldFile);
-    stateInHistoryNewFile.renameTo(stateInHistoryFile);
-    stateInHistoryOldFile.delete();
-  }
-  private static void moveOutputFile(String outputFileName) {
-    File outFile = new File(outDirectory, outputFileName);
-    File stateTempFile = new File(stateTempDirectory, outputFileName);
-    File stateFullFile = new File(stateFullDirectory, outputFileName);
-    File stateDiffFile = new File(stateDiffDirectory, outputFileName);
-    long ageInDays = -1L;
-    try {
-      ageInDays = (now
-          - outputFileFormat.parse(outputFileName.substring(0,
-          outputFileName.lastIndexOf("/") + 1)).getTime())
-          / (24L * 60L * 60L * 1000L);
-    } catch (ParseException e) {
-      e.printStackTrace();
-      System.err.println("Could not parse timestamp from '"
-          + outputFileName + "'.  Exiting.");
-      System.exit(1);
-    }
-    if (outHistoryFiles.contains(outFile.getAbsolutePath())) {
-      if (outFile.exists() || stateFullFile.exists()) {
-        System.out.println("Could not write to output file '"
-            + outFile.getAbsolutePath() + "', because that file was "
-            + "written " + (outFile.exists() ? "" : "and deleted ")
-            + "before and we're not supposed to change it anymore.  The "
-            + "updated file that could replace the output file is '"
-            + stateFullFile.getAbsolutePath() + "'.");
-        stateFullFile.getParentFile().mkdirs();
-        stateTempFile.renameTo(stateFullFile);
-      } else {
-        System.out.println("Could not write to output file '"
-            + outFile.getAbsolutePath() + "', because that file was "
-            + "written and deleted before and we're not supposed to "
-            + "change it anymore (even if we could).  The file "
-            + "containing the new lines only is '"
-            + stateDiffFile.getAbsolutePath() + "'.");
-        stateDiffFile.getParentFile().mkdirs();
-        stateTempFile.renameTo(stateDiffFile);
-      }
-    } else if (ageInDays < 4L) {
-      stateFullFile.getParentFile().mkdirs();
-      stateTempFile.renameTo(stateFullFile);
-    } else {
-      outFile.getParentFile().mkdirs();
-      String line = outFile.getAbsolutePath();
-      appendToHistoryFile(stateOutHistoryNewFile, line);
-      stateTempFile.renameTo(outFile);
-    }
-  }
-  private static void moveFullFilesToOut() {
-    Stack<String> fileNames = new Stack<String>();
-    fileNames.add(stateFullDirectoryString);
-    while (!fileNames.isEmpty()) {
-      String fileName = fileNames.pop();
-      File fileOrDirectory = new File(fileName);
-      if (!fileOrDirectory.exists()) {
-        continue;
-      } else if (fileOrDirectory.isDirectory()) {
-        for (File file : fileOrDirectory.listFiles()) {
-          fileNames.add(fileName + "/" + file.getName());
-        }
-      } else {
-        String outputFileName = fileName.substring(
-            (stateFullDirectoryString + "/").length());
-        File outFile = new File(outDirectory, outputFileName);
-        File stateFullFile = new File(stateFullDirectory, outputFileName);
-        long ageInDays = -1L;
-        try {
-          ageInDays = (now
-              - outputFileFormat.parse(outputFileName.substring(0,
-              outputFileName.lastIndexOf("/") + 1)).getTime())
-              / (24L * 60L * 60L * 1000L);
-        } catch (ParseException e) {
-          e.printStackTrace();
-          System.err.println("Could not parse timestamp from '"
-              + outputFileName + "'.  Exiting.");
-          System.exit(1);
-        }
-        if (!outHistoryFiles.contains(outFile.getAbsolutePath()) &&
-            ageInDays >= 4L) {
-          outFile.getParentFile().mkdirs();
-          String line = outFile.getAbsolutePath();
-          appendToHistoryFile(stateOutHistoryNewFile, line);
-          stateFullFile.renameTo(outFile);
-        }
-      }
-    }
-  }
-  private static void overwriteOutHistoryFile() {
-    stateOutHistoryFile.renameTo(stateOutHistoryOldFile);
-    stateOutHistoryNewFile.renameTo(stateOutHistoryFile);
-    stateOutHistoryOldFile.delete();
-  }
-  private static void deleteLockFile() {
-    stateLockFile.delete();
-  }
-}
-
diff --git a/src/sanitize.py b/src/sanitize.py
new file mode 100755
index 0000000..0c5de19
--- /dev/null
+++ b/src/sanitize.py
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+""" Sanitize Apache web logs by removing all potentially sensitive parts.
+
+The following sanitizing steps are performed on data read from stdin:
+  1. Die if a line is not in the Apache2 Combined Log Format.
+  2. Die if other hosts than '0.0.0.0' or '0.0.0.1' are specified.
+  3. Discard all lines with methods other than GET or HEAD.
+  4. Die if a protocol other than HTTP is used.
+  5. Discard all lines with status code 404.
+  6. Override client with '-'.
+  7. Override user with '-'.
+  8. Override time with '00:00:00 +0000'.
+  9. Override referer (sic!) with '"-"'.
+ 10. Override user agent with '"-"'.
+ 11. Truncate resource at the first '?' character.
+ 12. Die if no date (YYYYMMDD) can be extracted from the INPUT_FILE name.
+ 13. Die if a line's date is neither the file's date nor the day before.
+
+USAGE: sanitize.py INPUT_FILE OUTPUT_DIR
+
+The main operation is to parse Apache web log files from INPUT_FILE and
+write the sanitized entries to per-date files in OUTPUT_DIR. Discarded
+lines are currently dropped silently (the stderr output is commented out).
+A nonzero exit code indicates an error during processing; error messages
+go to stderr.
+
+The input filename is expected to be in the following format:
+    <hostname>.torproject.org-access.log-YYYYMMDD
+"""
+
+from __future__ import print_function
+
+import re
+import fileinput
+import sys
+import dateutil.parser
+import datetime
+
+assert(len(sys.argv) == 3)
+
+# Extract date from filename
+date = re.compile(r'[^0-9]*([0-9]{8})')
+matched = date.match(sys.argv[1])
+if matched is None:
+    print("Could not extract date from", sys.argv[1], file=sys.stderr)
+    sys.exit(1)
+today = dateutil.parser.parse(matched.group(1))
+
+is_valid_regex = re.compile(r'^0\.0\.0\.([01]) - - \[(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 \+0000\] "([^ ]*) ([^ ?]*[?]?|)[^ ]* HTTP([^"]*)" (-|\d*) (-|\d*) "([^\"]|\\|\")*" "([^"]|\")*" .*[^ ]$')
+sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n'
+day_before = today - datetime.timedelta(days=1)
+
+today_fname = sys.argv[2] + "/" + sys.argv[1] + "_sanitized"
+yesterday_fname = today_fname.replace(matched.group(1), day_before.strftime("%Y%m%d"))
+
+with open(yesterday_fname, 'a') as file_old:
+    with open(today_fname, 'a') as file_new:
+        for line in fileinput.input(sys.argv[1]):
+            matched = is_valid_regex.match(line)
+            if matched is None:
+                print(line, "Last line does not match critera", file=sys.stderr)
+                sys.exit(1)
+            date = dateutil.parser.parse(matched.group(2))
+            if today != date and day_before != date:
+                print(line, "Last line does not match date constraints. today:", today,
+                        " day before:", day_before, " date:", date, file=sys.stderr)
+                sys.exit(1)
+            requesttype = matched.group(4)
+            if requesttype != "GET" and requesttype != "HEAD":
+                #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
+                continue
+
+            if matched.group(7) == "404":
+                #print(matched.expand(sanitized_regex), file=sys.stderr, end="")
+                continue
+
+            if today == date:
+                file_new.write(matched.expand(sanitized_regex))
+            else:
+                file_old.write(matched.expand(sanitized_regex))
+
+print(yesterday_fname)
diff --git a/src/treat_new_logs.sh b/src/treat_new_logs.sh
new file mode 100755
index 0000000..147ebec
--- /dev/null
+++ b/src/treat_new_logs.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -u
+set -e
+
+BASEDIR=/srv/webstats.torproject.org/
+SCRIPTDIR="${BASEDIR}/bin/"
+
+BASEINCOMINGDIR="${BASEDIR}/incoming/"
+
+cd "${BASEINCOMINGDIR}"
+for host in *; do
+    INCOMINGDIR="${BASEINCOMINGDIR}/${host}/"
+    WORKDIR="${BASEDIR}/work/${host}/"
+    WORKDIR_AWSTATS="${BASEDIR}/work_awstats/${host}/"
+    OUTDIR="${BASEDIR}/out/${host}/"
+    STAMPDIR="${BASEDIR}/stamp/${host}/"
+
+    cd "${INCOMINGDIR}"
+    mkdir -p "${WORKDIR}"
+    mkdir -p "${WORKDIR_AWSTATS}"
+    mkdir -p "${OUTDIR}"
+    mkdir -p "${STAMPDIR}"
+
+    for file in *; do
+        basefile=${file%.gz}
+        if [ -e "${STAMPDIR}/${file}_treated" ]; then
+            continue
+        fi
+        cp "${INCOMINGDIR}/${file}" "${WORKDIR}/${file}"
+        cd "${WORKDIR}"
+        gunzip ${file}
+        COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}")
+        COMPLETED_BASE=$(basename $COMPLETED)
+        COMPLETED_BASE=${COMPLETED_BASE%_sanitized}
+        sort "${COMPLETED}" > "${COMPLETED}_sorted"
+        xz -ck9e "${COMPLETED}_sorted" > "${OUTDIR}/${COMPLETED_BASE}.xz"
+        mv "${COMPLETED}_sorted" "${WORKDIR_AWSTATS}"
+
+        rm "${WORKDIR}/${basefile}"
+        rm "${WORKDIR}/${COMPLETED_BASE}_sanitized"
+        touch "${STAMPDIR}/${file}_treated"
+    done
+done
+





More information about the tor-commits mailing list