commit 9bdb6d39fc7b0ac8e7327caeafabfac43a41689f Author: Karsten Loesing karsten.loesing@gmx.net Date: Mon Jan 7 11:59:19 2019 +0100
Properly skip previously imported webstats files.
Turns out we never skipped previously imported webstats files due to two bugs:
1. While building the list of previously imported webstats files, we reassembled their file names as ${server}_${site}_* rather than ${site}_${server}_*, the file name format we chose in an earlier version of the CollecTor module.
2. When checking whether a given webstats file already exists in the database, we compared the full file name to the file name reassembled from the database, in which ${server} is truncated to 32 characters, so file names with a longer ${server} part could never match.
This commit fixes both bugs. --- src/main/java/org/torproject/metrics/stats/webstats/Main.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/torproject/metrics/stats/webstats/Main.java b/src/main/java/org/torproject/metrics/stats/webstats/Main.java index a154e64..fb0a903 100644 --- a/src/main/java/org/torproject/metrics/stats/webstats/Main.java +++ b/src/main/java/org/torproject/metrics/stats/webstats/Main.java @@ -100,7 +100,7 @@ public class Main { try (ResultSet rs = st.executeQuery(queryString)) { while (rs.next()) { importedLogFileUrls.add(String.format("%s_%s_access.log_%s.xz", - rs.getString(1), rs.getString(2), + rs.getString(2), rs.getString(1), rs.getDate(3).toLocalDate().format(dateFormat))); } } @@ -111,13 +111,19 @@ public class Main {
static void importLogFiles(Connection connection, SortedSet<String> skipFiles, File... inDirectories) { + DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyyMMdd"); for (Descriptor descriptor : DescriptorSourceFactory .createDescriptorReader().readDescriptors(inDirectories)) { if (!(descriptor instanceof WebServerAccessLog)) { continue; } WebServerAccessLog logFile = (WebServerAccessLog) descriptor; - if (skipFiles.contains(logFile.getDescriptorFile().getName())) { + String logFileNameWithTruncatedParts = String.format( + "%s_%s_access.log_%s.xz", + truncateString(logFile.getVirtualHost(), 128), + truncateString(logFile.getPhysicalHost(), 32), + logFile.getLogDate().format(dateFormat)); + if (skipFiles.contains(logFileNameWithTruncatedParts)) { continue; } try {