commit 9bdb6d39fc7b0ac8e7327caeafabfac43a41689f
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Jan 7 11:59:19 2019 +0100
Properly skip previously imported webstats files.
Turns out we never skipped previously imported webstats files due to
two bugs:
1. While building a list of previously imported webstats files, we
reassembled their file names as ${server}_${site}_* rather than
${site}_${server}_*, which was the file name format we chose in an
earlier version of the CollecTor module.
2. When checking whether a given webstats file already exists in the
database, we compared its full file name to the file name
reassembled from the database, in which ${server} is truncated to
32 characters, so files with longer server names did not match.
This commit fixes both bugs.
---
src/main/java/org/torproject/metrics/stats/webstats/Main.java | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/torproject/metrics/stats/webstats/Main.java b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
index a154e64..fb0a903 100644
--- a/src/main/java/org/torproject/metrics/stats/webstats/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
@@ -100,7 +100,7 @@ public class Main {
try (ResultSet rs = st.executeQuery(queryString)) {
while (rs.next()) {
importedLogFileUrls.add(String.format("%s_%s_access.log_%s.xz",
- rs.getString(1), rs.getString(2),
+ rs.getString(2), rs.getString(1),
rs.getDate(3).toLocalDate().format(dateFormat)));
}
}
@@ -111,13 +111,19 @@ public class Main {
static void importLogFiles(Connection connection, SortedSet<String> skipFiles,
File... inDirectories) {
+ DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyyMMdd");
for (Descriptor descriptor : DescriptorSourceFactory
.createDescriptorReader().readDescriptors(inDirectories)) {
if (!(descriptor instanceof WebServerAccessLog)) {
continue;
}
WebServerAccessLog logFile = (WebServerAccessLog) descriptor;
- if (skipFiles.contains(logFile.getDescriptorFile().getName())) {
+ String logFileNameWithTruncatedParts = String.format(
+ "%s_%s_access.log_%s.xz",
+ truncateString(logFile.getVirtualHost(), 128),
+ truncateString(logFile.getPhysicalHost(), 32),
+ logFile.getLogDate().format(dateFormat));
+ if (skipFiles.contains(logFileNameWithTruncatedParts)) {
continue;
}
try {