[tor-commits] [metrics-web/release] Properly skip previously imported webstats files.

karsten at torproject.org karsten at torproject.org
Sat Nov 9 21:45:06 UTC 2019


commit 9bdb6d39fc7b0ac8e7327caeafabfac43a41689f
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Jan 7 11:59:19 2019 +0100

    Properly skip previously imported webstats files.
    
    Turns out we never skipped previously imported webstats files due to
    two bugs:
    
     1. While building a list of previously imported webstats files, we
        reassembled their file names as ${server}_${site}_* rather than
        ${site}_${server}_*, which is the file name format we chose in an
        earlier version of the CollecTor module.
    
     2. When checking whether a given webstats file already exists in the
        database, we compared the full file name to the file name
        reassembled from the database, in which ${server} is truncated to
        32 characters, so files with longer server names never matched.
    
    This commit fixes both bugs.
---
 src/main/java/org/torproject/metrics/stats/webstats/Main.java | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/torproject/metrics/stats/webstats/Main.java b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
index a154e64..fb0a903 100644
--- a/src/main/java/org/torproject/metrics/stats/webstats/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
@@ -100,7 +100,7 @@ public class Main {
     try (ResultSet rs = st.executeQuery(queryString)) {
       while (rs.next()) {
         importedLogFileUrls.add(String.format("%s_%s_access.log_%s.xz",
-            rs.getString(1), rs.getString(2),
+            rs.getString(2), rs.getString(1),
             rs.getDate(3).toLocalDate().format(dateFormat)));
       }
     }
@@ -111,13 +111,19 @@ public class Main {
 
   static void importLogFiles(Connection connection, SortedSet<String> skipFiles,
       File... inDirectories) {
+    DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyyMMdd");
     for (Descriptor descriptor : DescriptorSourceFactory
         .createDescriptorReader().readDescriptors(inDirectories)) {
       if (!(descriptor instanceof WebServerAccessLog)) {
         continue;
       }
       WebServerAccessLog logFile = (WebServerAccessLog) descriptor;
-      if (skipFiles.contains(logFile.getDescriptorFile().getName())) {
+      String logFileNameWithTruncatedParts = String.format(
+          "%s_%s_access.log_%s.xz",
+          truncateString(logFile.getVirtualHost(), 128),
+          truncateString(logFile.getPhysicalHost(), 32),
+          logFile.getLogDate().format(dateFormat));
+      if (skipFiles.contains(logFileNameWithTruncatedParts)) {
         continue;
       }
       try {





More information about the tor-commits mailing list