commit b306f7fc0f0285ea1d008dbbe271495bc6d92b3e Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Wed Nov 7 17:18:43 2012 -0500
Don't delete parsed files from in/. --- src/org/torproject/webstats/Main.java | 63 ++++++++++++-------------------- 1 files changed, 24 insertions(+), 39 deletions(-)
diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java index 1c7780c..7241d68 100644 --- a/src/org/torproject/webstats/Main.java +++ b/src/org/torproject/webstats/Main.java @@ -25,7 +25,7 @@ import org.apache.commons.compress.compressors.gzip.*; * * The main operation is to parse Apache web log files from the in/ * directory and write sanitized web log files to the out/ directory. - * Files in the in/ directory are assumed to never change and will be + * Files in the in/ directory are assumed to never change and may be * deleted after processing by this program. Files in the out/ directory * are guaranteed to never change and may be deleted by a subsequently * running program. @@ -34,8 +34,8 @@ import org.apache.commons.compress.compressors.gzip.*; * in/ are not parsed more than once and that files in out/ do not need to * be changed: * - state/lock prevents concurrent executions of this program. - * - state/in-history contains file names of previously read and deleted - * files in the in/ directory. + * - state/in-history contains file names of previously read and possibly + * deleted files in the in/ directory. * - state/in-history.new is the file written in the current execution * that will replace state/in-history during the execution. * - state/temp/ contains new or updated output files parsed in the @@ -56,12 +56,12 @@ import org.apache.commons.compress.compressors.gzip.*; * 2. Read the contents from state/in-history and state/out-history and * the directory listings of in/ to memory. * 3. For each file in in/: - * a. Append the file name to state/in-history.new. - * b. Check that the file name is not contained in state/in-history. - * If it is, print out a warning and skip the file. - * c. Parse and sanitize the file in chunks of 250,000 lines to reduce + * a. Append the file name to state/in-history.new if it was not + * contained in state/in-history. If it was contained, skip the + * file. + * b. 
Parse and sanitize the file in chunks of 250,000 lines to reduce * writes. - * d. When writing sanitized chunks to output files, for each output + * c. When writing sanitized chunks to output files, for each output * file, check in the following order if there is already such a * file in * i. state/temp/, @@ -73,8 +73,7 @@ import org.apache.commons.compress.compressors.gzip.*; * 4. Rename state/in-history to state/in-history.old and rename * state/in-history.new to state/in-history. Delete * state/in-history.old. - * 5. Delete files in in/ that have been parsed in this execution. - * 6. For each file in state/temp/: + * 5. For each file in state/temp/: * a. Check if there's a corresponding line in state/out-history. If * so, check whether there is a file in state/full/ or out/. If * so, move the file to state/full/. Otherwise move the file to @@ -84,13 +83,13 @@ import org.apache.commons.compress.compressors.gzip.*; * days old, move the file to state/full/. * c. If b. does not apply, append a line to out-history.new and move * the file to out/. - * 7. For each file in state/full/, check whether the sanitized log is at + * 6. For each file in state/full/, check whether the sanitized log is at * least four (4) days old and not contained in state/out-history. If * so, append a line to out-history.new and move the file to out/. - * 8. Rename state/out-history to state/out-history.old and rename + * 7. Rename state/out-history to state/out-history.old and rename * state/out-history.new to state/out-history. Delete * state/out-history.old. - * 9. Delete state/lock and exit. + * 8. Delete state/lock and exit. 
* * If the program is interrupted and leaves a lock file in state/lock, it * requires an operator to fix the state/ directory and make it work @@ -133,20 +132,19 @@ public class Main { readHistoryFiles(); /* Step 2 */ readInDirectoryListing(); for (File inFile : inFiles) { /* Step 3 */ - appendToInHistory(inFile); + appendToInHistoryIfNotContained(inFile); if (!checkFileName(inFile) || checkParsedBefore(inFile)) { continue; } sanitizeInFile(inFile); } overwriteInHistoryFile(); /* Step 4 */ - deleteParsedInFiles(); /* Step 5 */ - for (String outputFileName : updatedOutputFiles) { /* Step 6 */ + for (String outputFileName : updatedOutputFiles) { /* Step 5 */ moveOutputFile(outputFileName); } - moveFullFilesToOut(); /* Step 7 */ - overwriteOutHistoryFile(); /* Step 8 */ - deleteLockFile(); /* Step 9 */ + moveFullFilesToOut(); /* Step 6 */ + overwriteOutHistoryFile(); /* Step 7 */ + deleteLockFile(); /* Step 8 */ }
/* Define file and directory names. */ @@ -262,10 +260,12 @@ public class Main { } return result; } - private static void appendToInHistory(File inFile) { - inHistoryNewFiles.add(inFile.getAbsolutePath()); - String line = inFile.getAbsolutePath(); - appendToHistoryFile(stateInHistoryNewFile, line); + private static void appendToInHistoryIfNotContained(File inFile) { + if (!inHistoryNewFiles.contains(inFile.getAbsolutePath())) { + inHistoryNewFiles.add(inFile.getAbsolutePath()); + String line = inFile.getAbsolutePath(); + appendToHistoryFile(stateInHistoryNewFile, line); + } } private static void appendToHistoryFile(File historyFile, String line) { try { @@ -286,14 +286,7 @@ public class Main { inFile.getName().endsWith(".gz"); } private static boolean checkParsedBefore(File inFile) { - if (inHistoryFiles.contains(inFile.getAbsolutePath())) { - System.err.println("Parsed and subsequently deleted input file '" - + inFile.getAbsolutePath() + "' before. It shouldn't be " - + "there again. Skipping it now and not deleting it later."); - return true; - } else { - return false; - } + return inHistoryFiles.contains(inFile.getAbsolutePath()); } private static void sanitizeInFile(File inFile) { try { @@ -452,14 +445,6 @@ public class Main { stateInHistoryNewFile.renameTo(stateInHistoryFile); stateInHistoryOldFile.delete(); } - private static void deleteParsedInFiles() { - Set<String> filesToDelete = new HashSet<String>(); - filesToDelete.addAll(inHistoryNewFiles); - filesToDelete.removeAll(inHistoryFiles); - for (String file : filesToDelete) { - new File(file).delete(); - } - } private static void moveOutputFile(String outputFileName) { File outFile = new File(outDirectory, outputFileName); File stateTempFile = new File(stateTempDirectory, outputFileName);
tor-commits@lists.torproject.org