commit edddf0983d668ecd8c27700a83cb7e4d64255c07
Author: Sebastian Hahn <sebastian@torproject.org>
Date:   Fri Nov 13 06:41:57 2015 +0100

    Start script to extract logs
---
 bin/receive-log.sh                    |  47 ---
 bin/send-logs.sh                      |  73 -----
 build.xml                             |  36 --
 src/org/torproject/webstats/Main.java | 595 ----------------------------------
 src/sanitize.py                       |  82 +++++
 src/treat_new_logs.sh                 |  45 +++
 6 files changed, 127 insertions(+), 751 deletions(-)

diff --git a/bin/receive-log.sh b/bin/receive-log.sh deleted file mode 100755 index 6f98eaa..0000000 --- a/bin/receive-log.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Copyright 2011 The Tor Project -# See LICENSE for licensing information -# -# Read a binary file from stdin and store it to disk. - -set -e -set -u - -# Read remote host name from local command-line argument. -rhost="$1" -if [ -z "$rhost" ]; then - echo "Missing or illegal host name in authorized_keys." - exit 1 -fi - -# Read remote log file name from SSH command-line arguments. -rfile="${SSH_ORIGINAL_COMMAND:-}" -if ! [[ "$rfile" =~ ^[a-zA-Z0-9.-]+$ ]]; then - echo "Missing or illegal file name in SSH command." - exit 1 -fi - -# Create directories for this remote host if it doesn't exist. -ldir="in/$rhost" -if ! [ -d "in" ]; then - mkdir "in" -fi -if ! [ -d "$ldir" ]; then - mkdir "$ldir" -fi - -# Make sure the file doesn't exist yet. -lfile="$ldir/$rfile" -if [[ -f "$lfile" ]]; then - echo "File already exists." - exit 1 -fi - -# Write file from stdin to temporary file. -tfile="$ldir/.$rfile" -cat > "$tfile" - -# Move file from temp/ to in/ directory. -mv "$tfile" "$lfile" - diff --git a/bin/send-logs.sh b/bin/send-logs.sh deleted file mode 100755 index 77e7442..0000000 --- a/bin/send-logs.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash - -# Copyright 2011 The Tor Project -# See LICENSE for licensing information -# -# Send new gzip'ed Apache web log files via ssh to a remote server. - -set -e -set -u - -usage() { - echo "Usage: $0 logs-dir ssh-string state-dir" - exit 1 -} - -# Check command-line arguments. -if [ "$#" != 3 ]; then - usage -fi -ldir="$1" -sshstring="$2" -tdir="$3" -if ! [ -d "$ldir" ]; then - echo "Directory '$1' does not exist." - usage -fi - -# Create the local state directory that has empty files for every log file -# that we sent to the remote server. -if ! 
[ -d "$tdir" ]; then - mkdir "$tdir" -fi - -# Iterate over files with file names containing access.log and ending in -# .gz and send the ones we haven't sent before to the remote server. -for i in $(find "$ldir" -maxdepth 1 -type f -name '*access.log*.gz'); do - - # Check that the file exists. This works around issues like filenames - # containing spaces. - if ! [ -e $i ]; then - echo "File '$i' does not exist." - continue - fi - - # Extract the file name part to pass it as SSH parameter. - fname="$(basename $i)" - - # Warn if file names contain illegal characters that the receiver would - # not accept. - if ! [[ $fname =~ ^[a-zA-Z0-9.-]+$ ]]; then - echo "Illegal file name '$fname'." - continue - fi - - # Copy the file content to the remote server if we haven't sent it - # before. - tfile="$tdir/$fname" - if ! [ -f "$tfile" ]; then - ssh -o PreferredAuthentications=publickey $sshstring "$fname" < "$i" - fi - - # Add a state file to note that we don't attempt to send this file in - # the next execution. - touch "$tfile" - -done - -# Delete all state files for which there are no log files anymore. 
-for i in $(find "$tdir" -type f); do - fname="$(basename $i)" - [ -e "$ldir/$fname" ] || rm -f "$tdir/$fname" -done - diff --git a/build.xml b/build.xml deleted file mode 100644 index 7f0b430..0000000 --- a/build.xml +++ /dev/null @@ -1,36 +0,0 @@ -<project default="run" name="webstats" basedir="."> - <property name="sources" value="src"/> - <property name="classes" value="classes"/> - <property name="libs" value="lib"/> - <path id="classpath"> - <pathelement path="${classes}"/> - <pathelement location="${libs}/commons-compress-1.0.jar"/> - </path> - <target name="init"> - <mkdir dir="${classes}"/> - </target> - <target name="compile" - depends="init"> - <javac destdir="${classes}" - srcdir="${sources}" - source="1.5" - target="1.5" - debug="true" - deprecation="true" - optimize="false" - failonerror="true" - includeantruntime="false"> - <classpath> - <fileset dir="${libs}"/> - </classpath> - </javac> - </target> - <target name="run" depends="compile"> - <java fork="true" - maxmemory="512m" - classname="org.torproject.webstats.Main"> - <classpath refid="classpath"/> - </java> - </target> -</project> - diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java deleted file mode 100644 index 378093e..0000000 --- a/src/org/torproject/webstats/Main.java +++ /dev/null @@ -1,595 +0,0 @@ -package org.torproject.webstats; - -import java.io.*; -import java.text.*; -import java.util.*; -import java.util.regex.*; - -import org.apache.commons.compress.compressors.gzip.*; - -/* - * Sanitize Apache web logs by removing all potentially sensitive parts. - * - * The following sanitizing steps are performed on input web logs: - * 1. Discard all lines that are not in the Combined Log Format. - * 2. Discard all lines with other hosts than '0.0.0.0' or '0.0.0.1'. - * 3. Discard all lines with other methods than GET. - * 4. Discard all lines with other protocols than HTTP. - * 5. Discard all lines with status code 404. - * 6. Override client with '-'. 
- * 7. Override user with '-'. - * 8. Override time with '00:00:00 +0000'. - * 9. Override referer (sic!) with '"-"'. - * 10. Override user agent with '"-"'. - * 11. Truncate resource at the first '?' character. - * - * The main operation is to parse Apache web log files from the in/ - * directory and write sanitized web log files to the out/ directory. - * Files in the in/ directory are assumed to never change and may be - * deleted after processing by this program. Files in the out/ directory - * are guaranteed to never change and may be deleted by a subsequently - * running program. - * - * This program uses a couple of state files to make sure that files in - * in/ are not parsed more than once and that files in out/ do not need to - * be changed: - * - state/lock prevents concurrent executions of this program. - * - state/in-history contains file names of previously read and possibly - * deleted files in the in/ directory. - * - state/in-history.new is the file written in the current execution - * that will replace state/in-history during the execution. - * - state/temp/ contains new or updated output files parsed in the - * current execution that are moved to out/, state/full/, or state/diff/ - * during the execution. - * - state/out-history contains file names of previously written and - * possibly deleted files in the out/ directory. - * - state/out-history.new is the file written in the current execution - * that will replace state/out-history at the end of the execution. - * - state/full/ contains complete output files that may or may not be - * newer than files in the out/ directory. - * - state/diff/ contains new parts for files in the out/ directory which - * have been deleted. - * - * The phases and steps taken by this program are as follows: - * - * Phase I: Read files from in/, sanitize them, and write them to state/. - * 1. Check that state/lock does not exists, or exit immediately. Add a - * new state/lock file. - * 2. 
Read the contents from state/in-history and the directory listing - * of in/ to memory. - * 3. For each file in in/: - * a. Append the file name to state/in-history.new if it was not - * contained in state/in-history. If it was contained, skip the - * file. - * b. Parse and sanitize the file in chunks of 250,000 lines to reduce - * writes. - * c. When writing sanitized chunks to output files, for each output - * file, check in the following order if there is already such a - * file in - * i. state/temp/, - * ii. state/full/, - * iii. out/, or - * iv. state/diff/. - * If there's such a file, merge the newly sanitized lines into - * that file and write the sorted result to state/temp/. - * 4. Rename state/in-history to state/in-history.old and rename - * state/in-history.new to state/in-history. Delete - * state/in-history.old. - * - * Phase II: Move files that won't change anymore from state/ to out/. - * 5. Read the contents from state/out-history to memory. - * 6. For each file in state/temp/: - * a. Check if there's a corresponding line in state/out-history. If - * so, check whether there is a file in state/full/ or out/. If - * so, move the file to state/full/. Otherwise move the file to - * state/diff/. Print out a warning that there is a more recent - * file available. - * b. If a. does not apply and the sanitized log is less than four (4) - * days old, move the file to state/full/. - * c. If b. does not apply, append a line to out-history.new and move - * the file to out/. - * 7. For each file in state/full/, check whether the sanitized log is at - * least four (4) days old and not contained in state/out-history. If - * so, append a line to out-history.new and move the file to out/. - * 8. Rename state/out-history to state/out-history.old and rename - * state/out-history.new to state/out-history. Delete - * state/out-history.old. - * 9. Delete state/lock and exit. 
- * - * If the program is interrupted and leaves a lock file in state/lock, it - * requires an operator to fix the state/ directory and make it work - * again. IMPORTANT: DO NOT CHANGE ANYTHING IN THE state/ DIRECTORY - * UNLESS YOU'RE CERTAIN WHAT YOU'RE DOING! The following situations can - * happen. It may make sense to try a solution in a test environment - * first: - * A. The file state/in-history.new does not exist and there are no files - * in state/temp/. The process died before step 3, that is before - * actually doing anything of phase I. Delete state/lock and re-run - * the program. - * B. The file state/in-history.new exists and there are files in - * state/temp/. The process died during steps 3 or 4, that is, during - * phase I. Delete all files in state/temp/. If state/in-history - * does not exist but state/in-history.old does exist, rename the - * latter to the former. Delete state/lock and re-run the program. - * C. The file state/in-history.new does not exist, but there are files - * in state/temp/. The process died after step 4, that is during - * phase II. Run the steps 5 to 9 manually. Then re-run the program. - * - * Whenever logs are parsed that are 4 days old or older, there may - * already be output files in out/ that cannot be modified anymore. The - * operator may decide to manually overwrite files in out/ with the files - * in state/full/ or state/diff/. IMPORTANT: ONLY OVERWRITE FILES IN out/ - * IF YOU'RE CERTAIN HOW TO FIX THE PROGRAM THAT PARSES THESE FILES. - * There are two possible situations: - * A. There is a file in state/full/. This file is newer than the file - * with the same name in out/ and contains everything from that file, - * too. It's okay to overwrite the file in out/ with the file in - * state/full/ and delete the file in state/full/. - * B. There is a file in state/diff/. The file in out/ didn't exist - * anymore when parsing more log lines for it. 
The file that was in - * out/ should be located and merged with the file in state/diff/. - * Afterwards, the file in state/diff/ must be deleted. - */ -public class Main { - - /* Run the steps described above. */ - public static void main(String[] args) { - - /* Phase I */ - checkAndCreateLockFile(); /* Step 1 */ - readInHistoryFile(); /* Step 2 */ - readInDirectoryListing(); - for (File inFile : inFiles) { /* Step 3 */ - appendToInHistoryIfNotContained(inFile); - if (!checkFileName(inFile) || checkParsedBefore(inFile)) { - continue; - } - sanitizeInFile(inFile); - } - overwriteInHistoryFile(); /* Step 4 */ - - /* Phase II */ - readOutHistoryFile(); /* Step 5 */ - for (String outputFileName : updatedOutputFiles) { /* Step 6 */ - moveOutputFile(outputFileName); - } - moveFullFilesToOut(); /* Step 7 */ - overwriteOutHistoryFile(); /* Step 8 */ - deleteLockFile(); /* Step 9 */ - } - - /* Define file and directory names. */ - private static File inDirectory = new File("in"); - private static File outDirectory = new File("out"); - private static File stateLockFile = new File("state/lock"); - private static File stateInHistoryFile = new File("state/in-history"); - private static File stateInHistoryNewFile = - new File("state/in-history.new"); - private static File stateInHistoryOldFile = - new File("state/in-history.old"); - private static File stateOutHistoryFile = new File("state/out-history"); - private static File stateOutHistoryNewFile = - new File("state/out-history.new"); - private static File stateOutHistoryOldFile = - new File("state/out-history.old"); - private static File stateDiffDirectory = new File("state/diff"); - private static String stateFullDirectoryString = "state/full"; - private static File stateFullDirectory = new File("state/full"); - private static File stateTempDirectory = new File("state/temp"); - - /* Define data structures and helper classes. 
*/ - private static Set<String> inHistoryFiles; - private static Set<String> inHistoryNewFiles; - private static Set<String> outHistoryFiles; - private static Set<File> inFiles; - private static Map<String, List<String>> cachedLinesPerOutputFile = - new HashMap<String, List<String>>(); - private static int cachedLines = 0; - private static Set<String> updatedOutputFiles = new HashSet<String>(); - private static SimpleDateFormat outputFileFormat = - new SimpleDateFormat("yyyy/MM/dd/"); - private static SimpleDateFormat logDateFormat = - new SimpleDateFormat("dd/MMM/yyyy"); - static { - outputFileFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - } - private static Pattern logLinePattern = Pattern.compile( - "(0.0.0.[01]) \S+ \S+ \[([\w/]{11})[\d:]+\s[+\-]\d{4}\] " - + ""GET (\S+?) (HTTP/[\d\.]+)" (\d{3}) ([\d-]+) "[^"]+" " - + ""[^"]+""); - private static long now = System.currentTimeMillis(); - - /* Implement the substeps. */ - private static void checkAndCreateLockFile() { - if (stateLockFile.exists()) { - System.err.println("Lock file '" + stateLockFile.getAbsolutePath() - + "' exists. This means that a previous run did not exit " - + "cleanly. Exiting."); - System.exit(1); - } - try { - stateLockFile.getParentFile().mkdirs(); - BufferedWriter bw = new BufferedWriter(new FileWriter( - stateLockFile)); - bw.close(); - } catch (IOException e) { - e.printStackTrace(); - System.err.println("Could not create lock file '" - + stateLockFile.getAbsolutePath() + "'. 
Exiting."); - System.exit(1); - } - } - private static void readInHistoryFile() { - inHistoryFiles = readAndCopyHistoryFile(stateInHistoryFile, - stateInHistoryNewFile); - inHistoryNewFiles = new HashSet<String>(inHistoryFiles); - } - private static void readOutHistoryFile() { - outHistoryFiles = readAndCopyHistoryFile(stateOutHistoryFile, - stateOutHistoryNewFile); - } - private static Set<String> readAndCopyHistoryFile(File historyFile, - File historyNewFile) { - Set<String> result = new HashSet<String>(); - try { - BufferedWriter bw = new BufferedWriter(new FileWriter( - historyNewFile)); - if (historyFile.exists()) { - BufferedReader br = new BufferedReader(new FileReader( - historyFile)); - String line; - while ((line = br.readLine()) != null) { - result.add(line); - bw.write(line + "\n"); - } - br.close(); - } - bw.close(); - } catch (IOException e) { - e.printStackTrace(); - System.err.println("Could not read parse history file '" - + historyFile.getAbsolutePath() + "'. Exiting."); - System.exit(1); - } - return result; - } - private static void readInDirectoryListing() { - inFiles = readDirectoryListing(inDirectory); - } - private static Set<File> readDirectoryListing(File directory) { - Set<File> result = new HashSet<File>(); - if (directory.exists()) { - Stack<File> files = new Stack<File>(); - files.add(directory); - while (!files.isEmpty()) { - File file = files.pop(); - if (file.isDirectory()) { - files.addAll(Arrays.asList(file.listFiles())); - } else { - result.add(file); - } - } - } - return result; - } - private static void appendToInHistoryIfNotContained(File inFile) { - if (!inHistoryNewFiles.contains(inFile.getAbsolutePath())) { - inHistoryNewFiles.add(inFile.getAbsolutePath()); - String line = inFile.getAbsolutePath(); - appendToHistoryFile(stateInHistoryNewFile, line); - } - } - private static void appendToHistoryFile(File historyFile, String line) { - try { - BufferedWriter bw = new BufferedWriter(new FileWriter(historyFile, - true)); - 
bw.write(line + "\n"); - bw.close(); - } catch (IOException e) { - e.printStackTrace(); - System.err.println("Could not append line '" + line + "' to parse " - + "history file '" + historyFile.getAbsolutePath() + "'. " - + "Exiting."); - System.exit(1); - } - } - private static boolean checkFileName(File inFile) { - return inFile.getName().contains("access.log") && - inFile.getName().endsWith(".gz"); - } - private static boolean checkParsedBefore(File inFile) { - return inHistoryFiles.contains(inFile.getAbsolutePath()); - } - private static void sanitizeInFile(File inFile) { - try { - BufferedReader br = new BufferedReader(new InputStreamReader( - new GzipCompressorInputStream(new FileInputStream(inFile)))); - String line = null; - String outputFilenamePart = inFile.getName().substring(0, - inFile.getName().indexOf("access.log")) + "access.log"; - while (true) { - line = br.readLine(); - if (line == null || cachedLines > 250000) { - writeCachedLines(); - } - if (line == null) { - break; - } - if (!parseLine(line, outputFilenamePart)) { - break; - } - } - br.close(); - } catch (IOException e) { - e.printStackTrace(); - System.out.println("Error while parsing log file '" - + inFile.getAbsolutePath() + "'. 
Exiting."); - System.exit(1); - } - } - private static void writeCachedLines() { - for (Map.Entry<String, List<String>> e : - cachedLinesPerOutputFile.entrySet()) { - String outputFilename = e.getKey(); - List<String> cachedLinesList = e.getValue(); - Collections.sort(cachedLinesList); - storeOutputFile(outputFilename, cachedLinesList); - } - cachedLinesPerOutputFile.clear(); - cachedLines = 0; - } - private static void storeOutputFile(String outputFileName, - List<String> cachedLinesList) { - String outputGzFileName = outputFileName + ".gz"; - File stateTempFile = new File(stateTempDirectory, outputFileName); - File stateTempGzFile = new File(stateTempDirectory, outputGzFileName); - File stateFullFile = new File(stateFullDirectory, outputFileName); - File stateFullGzFile = new File(stateFullDirectory, outputGzFileName); - File outFile = new File(outDirectory, outputFileName); - File outGzFile = new File(outDirectory, outputGzFileName); - File stateDiffFile = new File(stateDiffDirectory, outputFileName); - File stateDiffGzFile = new File(stateDiffDirectory, outputGzFileName); - if (stateTempFile.exists()) { - updatedOutputFiles.add(outputFileName); - File stateTempOldFile = new File(stateTempDirectory, - outputFileName + ".old"); - stateTempFile.renameTo(stateTempOldFile); - mergeOutputFile(stateTempOldFile, cachedLinesList, stateTempFile); - stateTempOldFile.delete(); - } else if (stateTempGzFile.exists()) { - updatedOutputFiles.add(outputGzFileName); - File stateTempGzOldFile = new File(stateTempDirectory, - outputGzFileName + ".old"); - stateTempGzFile.renameTo(stateTempGzOldFile); - mergeOutputFile(stateTempGzOldFile, cachedLinesList, - stateTempGzFile); - stateTempGzOldFile.delete(); - } else if (stateFullFile.exists()) { - updatedOutputFiles.add(outputFileName); - mergeOutputFile(stateFullFile, cachedLinesList, stateTempFile); - } else if (stateFullGzFile.exists()) { - updatedOutputFiles.add(outputGzFileName); - mergeOutputFile(stateFullGzFile, 
cachedLinesList, stateTempGzFile); - } else if (outFile.exists()) { - updatedOutputFiles.add(outputFileName); - mergeOutputFile(outFile, cachedLinesList, stateTempFile); - } else if (outGzFile.exists()) { - updatedOutputFiles.add(outputGzFileName); - mergeOutputFile(outGzFile, cachedLinesList, stateTempGzFile); - } else if (stateDiffFile.exists()) { - updatedOutputFiles.add(outputFileName); - mergeOutputFile(stateDiffFile, cachedLinesList, stateTempFile); - } else if (stateDiffGzFile.exists()) { - updatedOutputFiles.add(outputGzFileName); - mergeOutputFile(stateDiffGzFile, cachedLinesList, stateTempGzFile); - } else { - updatedOutputFiles.add(outputGzFileName); - writeNewOutputFile(cachedLinesList, stateTempGzFile); - } - } - private static void mergeOutputFile(File oldOutputFile, - List<String> cachedLinesList, File newOutputFile) { - try { - BufferedReader br; - if (oldOutputFile.getName().endsWith(".gz")) { - br = new BufferedReader(new InputStreamReader( - new GzipCompressorInputStream(new FileInputStream( - oldOutputFile)))); - } else { - br = new BufferedReader(new FileReader(oldOutputFile)); - } - String line; - newOutputFile.getParentFile().mkdirs(); - BufferedWriter bw; - if (newOutputFile.getName().endsWith(".gz")) { - bw = new BufferedWriter(new OutputStreamWriter( - new GzipCompressorOutputStream(new FileOutputStream( - newOutputFile)))); - } else { - bw = new BufferedWriter(new FileWriter(newOutputFile)); - } - int cachedLinesListPosition = 0, - totalCachedLines = cachedLinesList.size(); - while ((line = br.readLine()) != null) { - while (cachedLinesListPosition < totalCachedLines && - cachedLinesList.get(cachedLinesListPosition). 
- compareTo(line) <= 0) { - bw.write(cachedLinesList.get( - cachedLinesListPosition) + "\n"); - cachedLinesListPosition++; - } - bw.write(line + "\n"); - } - while (cachedLinesListPosition < totalCachedLines) { - bw.write(cachedLinesList.get(cachedLinesListPosition++) + "\n"); - } - br.close(); - bw.close(); - } catch (IOException e) { - e.printStackTrace(); - System.err.println("Could not merge old output file '" - + oldOutputFile.getAbsolutePath() + "' with new log lines and " - + "write it to '" + newOutputFile.getAbsolutePath() - + "'. Exiting."); - System.exit(1); - } - } - private static void writeNewOutputFile(List<String> cachedLinesList, - File outputFile) { - try { - outputFile.getParentFile().mkdirs(); - BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( - new GzipCompressorOutputStream(new FileOutputStream( - outputFile)))); - for (String cachedLine : cachedLinesList) { - bw.write(cachedLine + "\n"); - } - bw.close(); - } catch (IOException e) { - e.printStackTrace(); - System.err.println("Could not write output file '" - + outputFile.getAbsolutePath() + "'. 
Exiting."); - System.exit(1); - } - } - private static boolean parseLine(String line, - String outputFilenamePart) { - Matcher matcher = logLinePattern.matcher(line); - if (!matcher.matches()) { - return true; - } - String statusCode = matcher.group(5); - if (statusCode.equals("404")) { - return true; - } - String ipAddress = matcher.group(1); - String date = matcher.group(2); - String url = matcher.group(3); - if (url.contains("?")) { - url = url.substring(0, url.indexOf("?")); - } - String httpVersion = matcher.group(4); - String returnedBytes = matcher.group(6); - String sanitizedLine = ipAddress + " - - [" + date - + ":00:00:00 +0000] "GET " + url + " " + httpVersion - + "" " + statusCode + " " + returnedBytes + " "-" "-""; - String outputFilename = null; - try { - outputFilename = outputFileFormat.format(logDateFormat.parse( - date).getTime()) + outputFilenamePart; - } catch (ParseException e) { - System.out.println("Error parsing date. Aborting to parse " - + "this file."); - return false; - } - if (!cachedLinesPerOutputFile.containsKey(outputFilename)) { - cachedLinesPerOutputFile.put(outputFilename, - new ArrayList<String>()); - } - cachedLinesPerOutputFile.get(outputFilename).add( - sanitizedLine); - cachedLines++; - return true; - } - private static void overwriteInHistoryFile() { - stateInHistoryFile.renameTo(stateInHistoryOldFile); - stateInHistoryNewFile.renameTo(stateInHistoryFile); - stateInHistoryOldFile.delete(); - } - private static void moveOutputFile(String outputFileName) { - File outFile = new File(outDirectory, outputFileName); - File stateTempFile = new File(stateTempDirectory, outputFileName); - File stateFullFile = new File(stateFullDirectory, outputFileName); - File stateDiffFile = new File(stateDiffDirectory, outputFileName); - long ageInDays = -1L; - try { - ageInDays = (now - - outputFileFormat.parse(outputFileName.substring(0, - outputFileName.lastIndexOf("/") + 1)).getTime()) - / (24L * 60L * 60L * 1000L); - } catch (ParseException 
e) { - e.printStackTrace(); - System.err.println("Could not parse timestamp from '" - + outputFileName + "'. Exiting."); - System.exit(1); - } - if (outHistoryFiles.contains(outFile.getAbsolutePath())) { - if (outFile.exists() || stateFullFile.exists()) { - System.out.println("Could not write to output file '" - + outFile.getAbsolutePath() + "', because that file was " - + "written " + (outFile.exists() ? "" : "and deleted ") - + "before and we're not supposed to change it anymore. The " - + "updated file that could replace the output file is '" - + stateFullFile.getAbsolutePath() + "'."); - stateFullFile.getParentFile().mkdirs(); - stateTempFile.renameTo(stateFullFile); - } else { - System.out.println("Could not write to output file '" - + outFile.getAbsolutePath() + "', because that file was " - + "written and deleted before and we're not supposed to " - + "change it anymore (even if we could). The file " - + "containing the new lines only is '" - + stateDiffFile.getAbsolutePath() + "'."); - stateDiffFile.getParentFile().mkdirs(); - stateTempFile.renameTo(stateDiffFile); - } - } else if (ageInDays < 4L) { - stateFullFile.getParentFile().mkdirs(); - stateTempFile.renameTo(stateFullFile); - } else { - outFile.getParentFile().mkdirs(); - String line = outFile.getAbsolutePath(); - appendToHistoryFile(stateOutHistoryNewFile, line); - stateTempFile.renameTo(outFile); - } - } - private static void moveFullFilesToOut() { - Stack<String> fileNames = new Stack<String>(); - fileNames.add(stateFullDirectoryString); - while (!fileNames.isEmpty()) { - String fileName = fileNames.pop(); - File fileOrDirectory = new File(fileName); - if (!fileOrDirectory.exists()) { - continue; - } else if (fileOrDirectory.isDirectory()) { - for (File file : fileOrDirectory.listFiles()) { - fileNames.add(fileName + "/" + file.getName()); - } - } else { - String outputFileName = fileName.substring( - (stateFullDirectoryString + "/").length()); - File outFile = new File(outDirectory, 
outputFileName); - File stateFullFile = new File(stateFullDirectory, outputFileName); - long ageInDays = -1L; - try { - ageInDays = (now - - outputFileFormat.parse(outputFileName.substring(0, - outputFileName.lastIndexOf("/") + 1)).getTime()) - / (24L * 60L * 60L * 1000L); - } catch (ParseException e) { - e.printStackTrace(); - System.err.println("Could not parse timestamp from '" - + outputFileName + "'. Exiting."); - System.exit(1); - } - if (!outHistoryFiles.contains(outFile.getAbsolutePath()) && - ageInDays >= 4L) { - outFile.getParentFile().mkdirs(); - String line = outFile.getAbsolutePath(); - appendToHistoryFile(stateOutHistoryNewFile, line); - stateFullFile.renameTo(outFile); - } - } - } - } - private static void overwriteOutHistoryFile() { - stateOutHistoryFile.renameTo(stateOutHistoryOldFile); - stateOutHistoryNewFile.renameTo(stateOutHistoryFile); - stateOutHistoryOldFile.delete(); - } - private static void deleteLockFile() { - stateLockFile.delete(); - } -} - diff --git a/src/sanitize.py b/src/sanitize.py new file mode 100755 index 0000000..0c5de19 --- /dev/null +++ b/src/sanitize.py @@ -0,0 +1,82 @@ +#!/usr/bin/python +""" Sanitize Apache web logs by removing all potentially sensitive parts. + +The following sanitizing steps are performed on data read from stdin: + 1. Die if a line is not in the Apache2 Combined Log Format. + 2. Die if other hosts than '0.0.0.0' or '0.0.0.1' are specified. + 3. Discard all lines with other methods than GET. + 4. Die if a protocol other than HTTP is used. + 5. Discard all lines with status code 404. + 6. Override client with '-'. + 7. Override user with '-'. + 8. Override time with '00:00:00 +0000'. + 9. Override referer (sic!) with '"-"'. + 10. Override user agent with '"-"'. + 11. Truncate resource at the first '?' character. + 12. Die if a valid date wasn't passed as the sole cmdline parameter. + 13. Die if a line has a date not equal or one day prior. 
+ +USAGE: sanitize.py INPUT_FILE OUTPUT_DIR + +The main operation is to parse Apache web log files from INPUT_FILE and +output the sanitized version in files according to the date of the log +entry in OUTPUT_DIR. All discarded lines are output on stderr. A nonzero +exit code indicates an error during processing, error messages go to +stderr. + +The input filename is expected to be in the following format: + <hostname>.torproject.org-access.log-YYYYMMDD +""" + +from __future__ import print_function + +import re +import fileinput +import sys +import dateutil.parser +import datetime + +assert(len(sys.argv) == 3) + +# Extract date from filename +date = re.compile(r'[^0-9]*([0-9]{8})') +matched = date.match(sys.argv[1]) +if matched is None: + print("Could not extract date from", sys.argv[1], file=sys.stderr) + sys.exit(1) +today = dateutil.parser.parse(matched.group(1)) + +is_valid_regex = re.compile(r'^0.0.0.([01]) - - [(\d{2}/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/\d{4}):00:00:00 +0000] "([^ ]*) ([^ ?]*[?]?|)[^ ]* HTTP([^"]*)" (-|\d*) (-|\d*) "([^"]|\|")*" "([^"]|")*" .*[^ ]$') +sanitized_regex = r'0.0.0.\1 - - [\2:00:00:00 +0000] "\4 \5 HTTP\6" \7 \8 "-" "-" -\n' +day_before = today - datetime.timedelta(days=1) + +today_fname = sys.argv[2] + "/" + sys.argv[1] + "_sanitized" +yesterday_fname = today_fname.replace(matched.group(1), day_before.strftime("%Y%m%d")) + +with open(yesterday_fname, 'a') as file_old: + with open(today_fname, 'a') as file_new: + for line in fileinput.input(sys.argv[1]): + matched = is_valid_regex.match(line) + if matched is None: + print(line, "Last line does not match critera", file=sys.stderr) + sys.exit(1) + date = dateutil.parser.parse(matched.group(2)) + if today != date and day_before != date: + print(line, "Last line does not match date constraints. 
today:", today, + " day before:", day_before, " date:", date, file=sys.stderr) + sys.exit(1) + requesttype = matched.group(4) + if requesttype != "GET" and requesttype != "HEAD": + #print(matched.expand(sanitized_regex), file=sys.stderr, end="") + continue + + if matched.group(7) == "404": + #print(matched.expand(sanitized_regex), file=sys.stderr, end="") + continue + + if today == date: + file_new.write(matched.expand(sanitized_regex)) + else: + file_old.write(matched.expand(sanitized_regex)) + +print(yesterday_fname) diff --git a/src/treat_new_logs.sh b/src/treat_new_logs.sh new file mode 100755 index 0000000..147ebec --- /dev/null +++ b/src/treat_new_logs.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +set -u +set -e + +BASEDIR=/srv/webstats.torproject.org/ +SCRIPTDIR="${BASEDIR}/bin/" + +BASEINCOMINGDIR="${BASEDIR}/incoming/" + +cd "${BASEINCOMINGDIR}" +for host in *; do + INCOMINGDIR="${BASEINCOMINGDIR}/${host}/" + WORKDIR="${BASEDIR}/work/${host}/" + WORKDIR_AWSTATS="${BASEDIR}/work_awstats/${host}/" + OUTDIR="${BASEDIR}/out/${host}/" + STAMPDIR="${BASEDIR}/stamp/${host}/" + + cd "${INCOMINGDIR}" + mkdir -p "${WORKDIR}" + mkdir -p "${WORKDIR_AWSTATS}" + mkdir -p "${OUTDIR}" + mkdir -p "${STAMPDIR}" + + for file in *; do + basefile=${file%.gz} + if [ -e "${STAMPDIR}/${file}_treated" ]; then + continue + fi + cp "${INCOMINGDIR}/${file}" "${WORKDIR}/${file}" + cd "${WORKDIR}" + gunzip ${file} + COMPLETED=$(${SCRIPTDIR}/sanitize.py "${basefile}" "${WORKDIR}") + COMPLETED_BASE=$(basename $COMPLETED) + COMPLETED_BASE=${COMPLETED_BASE%_sanitized} + sort "${COMPLETED}" > "${COMPLETED}_sorted" + xz -ck9e "${COMPLETED}_sorted" > "${OUTDIR}/${COMPLETED_BASE}.xz" + mv "${COMPLETED}_sorted" "${WORKDIR_AWSTATS}" + + rm "${WORKDIR}/${basefile}" + rm "${WORKDIR}/${COMPLETED_BASE}_sanitized" + touch "${STAMPDIR}/${file}_treated" + done +done +
tor-commits@lists.torproject.org