[tor-commits] [webstats/master] Don't delete parsed files from in/.

runa at torproject.org runa at torproject.org
Tue Nov 13 15:27:44 UTC 2012


commit b306f7fc0f0285ea1d008dbbe271495bc6d92b3e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Wed Nov 7 17:18:43 2012 -0500

    Don't delete parsed files from in/.
---
 src/org/torproject/webstats/Main.java |   63 ++++++++++++--------------------
 1 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/src/org/torproject/webstats/Main.java b/src/org/torproject/webstats/Main.java
index 1c7780c..7241d68 100644
--- a/src/org/torproject/webstats/Main.java
+++ b/src/org/torproject/webstats/Main.java
@@ -25,7 +25,7 @@ import org.apache.commons.compress.compressors.gzip.*;
  *
  * The main operation is to parse Apache web log files from the in/
  * directory and write sanitized web log files to the out/ directory.
- * Files in the in/ directory are assumed to never change and will be
+ * Files in the in/ directory are assumed to never change and may be
  * deleted after processing by this program.  Files in the out/ directory
  * are guaranteed to never change and may be deleted by a subsequently
  * running program.
@@ -34,8 +34,8 @@ import org.apache.commons.compress.compressors.gzip.*;
  * in/ are not parsed more than once and that files in out/ do not need to
  * be changed:
  * - state/lock prevents concurrent executions of this program.
- * - state/in-history contains file names of previously read and deleted
- *   files in the in/ directory.
+ * - state/in-history contains file names of previously read and possibly
+ *   deleted files in the in/ directory.
  * - state/in-history.new is the file written in the current execution
  *   that will replace state/in-history during the execution.
  * - state/temp/ contains new or updated output files parsed in the
@@ -56,12 +56,12 @@ import org.apache.commons.compress.compressors.gzip.*;
  *  2. Read the contents from state/in-history and state/out-history and
  *     the directory listings of in/ to memory.
  *  3. For each file in in/:
- *     a. Append the file name to state/in-history.new.
- *     b. Check that the file name is not contained in state/in-history.
- *        If it is, print out a warning and skip the file.
- *     c. Parse and sanitize the file in chunks of 250,000 lines to reduce
+ *     a. Append the file name to state/in-history.new if it was not
+ *        contained in state/in-history.  If it was contained, skip the
+ *        file.
+ *     b. Parse and sanitize the file in chunks of 250,000 lines to reduce
  *        writes.
- *     d. When writing sanitized chunks to output files, for each output
+ *     c. When writing sanitized chunks to output files, for each output
  *        file, check in the following order if there is already such a
  *        file in
  *          i. state/temp/,
@@ -73,8 +73,7 @@ import org.apache.commons.compress.compressors.gzip.*;
  *  4. Rename state/in-history to state/in-history.old and rename
  *     state/in-history.new to state/in-history.  Delete
  *     state/in-history.old.
- *  5. Delete files in in/ that have been parsed in this execution.
- *  6. For each file in state/temp/:
+ *  5. For each file in state/temp/:
  *     a. Check if there's a corresponding line in state/out-history.  If
  *        so, check whether there is a file in state/full/ or out/.  If
  *        so, move the file to state/full/.  Otherwise move the file to
@@ -84,13 +83,13 @@ import org.apache.commons.compress.compressors.gzip.*;
  *        days old, move the file to state/full/.
  *     c. If b. does not apply, append a line to out-history.new and move
  *        the file to out/.
- *  7. For each file in state/full/, check whether the sanitized log is at
+ *  6. For each file in state/full/, check whether the sanitized log is at
  *     least four (4) days old and not contained in state/out-history.  If
  *     so, append a line to out-history.new and move the file to out/.
- *  8. Rename state/out-history to state/out-history.old and rename
+ *  7. Rename state/out-history to state/out-history.old and rename
  *     state/out-history.new to state/out-history.  Delete
  *     state/out-history.old.
- *  9. Delete state/lock and exit.
+ *  8. Delete state/lock and exit.
  *
  * If the program is interrupted and leaves a lock file in state/lock, it
  * requires an operator to fix the state/ directory and make it work
@@ -133,20 +132,19 @@ public class Main {
     readHistoryFiles(); /* Step 2 */
     readInDirectoryListing();
     for (File inFile : inFiles) { /* Step 3 */
-      appendToInHistory(inFile);
+      appendToInHistoryIfNotContained(inFile);
       if (!checkFileName(inFile) || checkParsedBefore(inFile)) {
         continue;
       }
       sanitizeInFile(inFile);
     }
     overwriteInHistoryFile(); /* Step 4 */
-    deleteParsedInFiles(); /* Step 5 */
-    for (String outputFileName : updatedOutputFiles) { /* Step 6 */
+    for (String outputFileName : updatedOutputFiles) { /* Step 5 */
       moveOutputFile(outputFileName);
     }
-    moveFullFilesToOut(); /* Step 7 */
-    overwriteOutHistoryFile(); /* Step 8 */
-    deleteLockFile(); /* Step 9 */
+    moveFullFilesToOut(); /* Step 6 */
+    overwriteOutHistoryFile(); /* Step 7 */
+    deleteLockFile(); /* Step 8 */
   }
 
   /* Define file and directory names. */
@@ -262,10 +260,12 @@ public class Main {
     }
     return result;
   }
-  private static void appendToInHistory(File inFile) {
-    inHistoryNewFiles.add(inFile.getAbsolutePath());
-    String line = inFile.getAbsolutePath();
-    appendToHistoryFile(stateInHistoryNewFile, line);
+  private static void appendToInHistoryIfNotContained(File inFile) {
+    if (!inHistoryNewFiles.contains(inFile.getAbsolutePath())) {
+      inHistoryNewFiles.add(inFile.getAbsolutePath());
+      String line = inFile.getAbsolutePath();
+      appendToHistoryFile(stateInHistoryNewFile, line);
+    }
   }
   private static void appendToHistoryFile(File historyFile, String line) {
     try {
@@ -286,14 +286,7 @@ public class Main {
         inFile.getName().endsWith(".gz");
   }
   private static boolean checkParsedBefore(File inFile) {
-    if (inHistoryFiles.contains(inFile.getAbsolutePath())) {
-      System.err.println("Parsed and subsequently deleted input file '"
-          + inFile.getAbsolutePath() + "' before.  It shouldn't be "
-          + "there again.  Skipping it now and not deleting it later.");
-      return true;
-    } else {
-      return false;
-    }
+    return inHistoryFiles.contains(inFile.getAbsolutePath());
   }
   private static void sanitizeInFile(File inFile) {
     try {
@@ -452,14 +445,6 @@ public class Main {
     stateInHistoryNewFile.renameTo(stateInHistoryFile);
     stateInHistoryOldFile.delete();
   }
-  private static void deleteParsedInFiles() {
-    Set<String> filesToDelete = new HashSet<String>();
-    filesToDelete.addAll(inHistoryNewFiles);
-    filesToDelete.removeAll(inHistoryFiles);
-    for (String file : filesToDelete) {
-      new File(file).delete();
-    }
-  }
   private static void moveOutputFile(String outputFileName) {
     File outFile = new File(outDirectory, outputFileName);
     File stateTempFile = new File(stateTempDirectory, outputFileName);





More information about the tor-commits mailing list