[tor-commits] [collector/release] Add webstats module with sync and local import functionality.

karsten at torproject.org karsten at torproject.org
Mon Feb 26 15:26:12 UTC 2018


commit 97e577ae73ec631ac5d7448cb9f525594baa0c8a
Author: iwakeh <iwakeh at torproject.org>
Date:   Mon Oct 9 12:23:53 2017 +0000

    Add webstats module with sync and local import functionality.
    
    Implements task-22428.
---
 CHANGELOG.md                                       |   6 +-
 build.xml                                          |   2 +-
 src/main/java/org/torproject/collector/Main.java   |   2 +
 .../torproject/collector/conf/Configuration.java   |   3 +-
 .../java/org/torproject/collector/conf/Key.java    |   9 +-
 .../collector/persist/DescriptorPersistence.java   |   2 +
 .../persist/WebServerAccessLogPersistence.java     |  73 ++++++++
 .../torproject/collector/sync/SyncPersistence.java |   7 +
 .../torproject/collector/webstats/LogFileMap.java  | 115 ++++++++++++
 .../torproject/collector/webstats/LogMetadata.java |  87 +++++++++
 .../collector/webstats/SanitizeWeblogs.java        | 198 +++++++++++++++++++++
 src/main/resources/collector.properties            |  20 ++-
 .../collector/conf/ConfigurationTest.java          |   2 +-
 .../collector/cron/CollecTorMainTest.java          |   1 +
 .../collector/sync/SyncPersistenceTest.java        |  68 +++----
 .../collector/webstats/LogFileMapTest.java         |  33 ++++
 .../collector/webstats/LogMetadataTest.java        |  82 +++++++++
 ...eotrichon.torproject.org_access.log_20151007.xz | Bin 0 -> 4056 bytes
 ...meronense.torproject.org_access.log_20170531.gz | Bin 0 -> 388 bytes
 19 files changed, 671 insertions(+), 39 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f4cd21..a0b5d1f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,8 @@
-# Changes in version 1.?.? - 201?-??-??
+# Changes in version 1.5.0 - 2018-01-31
+
+ * Major changes
+   - Update to metrics-lib 2.2.0.
+   - Add new module for processing and sanitizing Tor web server logs.
 
  * Minor changes
    - Exclude lastModifiedMillis in index.json.
diff --git a/build.xml b/build.xml
index f004f29..48f6e33 100644
--- a/build.xml
+++ b/build.xml
@@ -11,7 +11,7 @@
   <property name="release.version" value="1.4.1-dev" />
   <property name="project-main-class" value="org.torproject.collector.Main" />
   <property name="name" value="collector"/>
-  <property name="metricslibversion" value="2.1.1" />
+  <property name="metricslibversion" value="2.2.0" />
   <property name="jarincludes" value="collector.properties logback.xml" />
 
   <patternset id="runtime" >
diff --git a/src/main/java/org/torproject/collector/Main.java b/src/main/java/org/torproject/collector/Main.java
index 50cc8be..70cdbfa 100644
--- a/src/main/java/org/torproject/collector/Main.java
+++ b/src/main/java/org/torproject/collector/Main.java
@@ -14,6 +14,7 @@ import org.torproject.collector.exitlists.ExitListDownloader;
 import org.torproject.collector.index.CreateIndexJson;
 import org.torproject.collector.onionperf.OnionPerfDownloader;
 import org.torproject.collector.relaydescs.ArchiveWriter;
+import org.torproject.collector.webstats.SanitizeWeblogs;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -51,6 +52,7 @@ public class Main {
     collecTorMains.put(Key.UpdateindexActivated, CreateIndexJson.class);
     collecTorMains.put(Key.RelaydescsActivated, ArchiveWriter.class);
     collecTorMains.put(Key.OnionPerfActivated, OnionPerfDownloader.class);
+    collecTorMains.put(Key.WebstatsActivated, SanitizeWeblogs.class);
   }
 
   private static Configuration conf = new Configuration();
diff --git a/src/main/java/org/torproject/collector/conf/Configuration.java b/src/main/java/org/torproject/collector/conf/Configuration.java
index 57f9731..72bd5fc 100644
--- a/src/main/java/org/torproject/collector/conf/Configuration.java
+++ b/src/main/java/org/torproject/collector/conf/Configuration.java
@@ -92,7 +92,8 @@ public class Configuration extends Observable implements Cloneable {
         || this.getBool(Key.BridgedescsActivated)
         || this.getBool(Key.ExitlistsActivated)
         || this.getBool(Key.UpdateindexActivated)
-        || this.getBool(Key.OnionPerfActivated))) {
+        || this.getBool(Key.OnionPerfActivated)
+        || this.getBool(Key.WebstatsActivated))) {
       throw new ConfigurationException("Nothing is activated!\n"
           + "Please edit collector.properties. Exiting.");
     }
diff --git a/src/main/java/org/torproject/collector/conf/Key.java b/src/main/java/org/torproject/collector/conf/Key.java
index e0a20a7..6454009 100644
--- a/src/main/java/org/torproject/collector/conf/Key.java
+++ b/src/main/java/org/torproject/collector/conf/Key.java
@@ -28,6 +28,7 @@ public enum Key {
   BridgeSources(SourceType[].class),
   ExitlistSources(SourceType[].class),
   OnionPerfSources(SourceType[].class),
+  WebstatsSources(SourceType[].class),
   RelayCacheOrigins(String[].class),
   RelayLocalOrigins(Path.class),
   RelaySyncOrigins(URL[].class),
@@ -35,6 +36,8 @@ public enum Key {
   BridgeLocalOrigins(Path.class),
   ExitlistSyncOrigins(URL[].class),
   OnionPerfSyncOrigins(URL[].class),
+  WebstatsSyncOrigins(URL[].class),
+  WebstatsLocalOrigins(Path.class),
   BridgedescsActivated(Boolean.class),
   BridgedescsOffsetMinutes(Integer.class),
   BridgedescsPeriodMinutes(Integer.class),
@@ -58,7 +61,11 @@ public enum Key {
   KeepDirectoryArchiveImportHistory(Boolean.class),
   ReplaceIpAddressesWithHashes(Boolean.class),
   BridgeDescriptorMappingsLimit(Integer.class),
-  OnionPerfHosts(URL[].class);
+  OnionPerfHosts(URL[].class),
+  WebstatsActivated(Boolean.class),
+  WebstatsLimits(Boolean.class),
+  WebstatsOffsetMinutes(Integer.class),
+  WebstatsPeriodMinutes(Integer.class);
 
   private Class clazz;
   private static Set<String> keys;
diff --git a/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java b/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java
index 3e464fe..01c9fad 100644
--- a/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java
+++ b/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java
@@ -19,6 +19,7 @@ public abstract class DescriptorPersistence<T extends Descriptor> {
 
   protected static final String BRIDGEDESCS = "bridge-descriptors";
   protected static final String DASH = "-";
+  protected static final String DOT = ".";
   protected static final String MICRODESC = "microdesc";
   protected static final String MICRODESCS = "microdescs";
   protected static final String RELAYDESCS = "relay-descriptors";
@@ -26,6 +27,7 @@ public abstract class DescriptorPersistence<T extends Descriptor> {
   protected static final String EXTRA_INFOS = "extra-infos";
   protected static final String SERVERDESC = "server-descriptor";
   protected static final String SERVERDESCS = "server-descriptors";
+  protected static final String WEBSTATS = "webstats";
 
   protected final T desc;
   protected final byte[] annotation;
diff --git a/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java b/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java
new file mode 100644
index 0000000..792d3a9
--- /dev/null
+++ b/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java
@@ -0,0 +1,73 @@
+/* Copyright 2016--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.persist;
+
+import org.torproject.descriptor.WebServerAccessLog;
+import org.torproject.descriptor.internal.FileType;
+import org.torproject.descriptor.log.InternalLogDescriptor;
+import org.torproject.descriptor.log.InternalWebServerAccessLog;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.time.format.DateTimeFormatter;
+
+/** Persists web server access logs as XZ-compressed descriptor files. */
+public class WebServerAccessLogPersistence
+    extends DescriptorPersistence<WebServerAccessLog> {
+
+  public static final String SEP = InternalWebServerAccessLog.SEP;
+  public static final FileType COMPRESSION = FileType.XZ;
+  private static final Logger log
+      = LoggerFactory.getLogger(WebServerAccessLogPersistence.class);
+
+  private DateTimeFormatter yearPattern = DateTimeFormatter.ofPattern("yyyy");
+  private DateTimeFormatter monthPattern = DateTimeFormatter.ofPattern("MM");
+  private DateTimeFormatter dayPattern = DateTimeFormatter.ofPattern("dd");
+
+  /** Prepare storing the given descriptor; its raw bytes are replaced by
+   * their compressed form or, if compression fails, are left as they are. */
+  public WebServerAccessLogPersistence(WebServerAccessLog desc) {
+    super(desc, new byte[0]);
+    try { // The descriptor bytes have to be stored compressed.
+      ((InternalLogDescriptor) desc).setRawDescriptorBytes(
+          COMPRESSION.compress(desc.getRawDescriptorBytes()));
+    } catch (Exception ex) {
+      log.warn("Cannot compress '{}'.  Storing uncompressed.",
+          desc.getVirtualHost() + SEP + desc.getPhysicalHost(), ex);
+    }
+    calculatePaths();
+  }
+
+  /** Derives the file name and both the recent and the storage path. */
+  private void calculatePaths() {
+    String name = this.desc.getVirtualHost() + SEP
+        + this.desc.getPhysicalHost() + SEP + "access.log" + SEP
+        + this.desc.getLogDate().format(DateTimeFormatter.BASIC_ISO_DATE)
+        + DOT + COMPRESSION.name().toLowerCase();
+    this.recentPath = Paths.get(WEBSTATS, name).toString();
+    this.storagePath = Paths.get(WEBSTATS, this.desc.getVirtualHost(),
+        this.desc.getLogDate().format(yearPattern), // year
+        this.desc.getLogDate().format(monthPattern), // month
+        this.desc.getLogDate().format(dayPattern), // day
+        name).toString();
+  }
+
+  /** Logs are not appended. */
+  @Override
+  public boolean storeAll(String recentRoot, String outRoot) {
+    return storeAll(recentRoot, outRoot, StandardOpenOption.CREATE_NEW,
+        StandardOpenOption.CREATE_NEW);
+  }
+
+  /** Logs are not appended. */
+  @Override
+  public boolean storeRecent(String recentRoot) {
+    return storeRecent(recentRoot, StandardOpenOption.CREATE_NEW);
+  }
+
+}
+
diff --git a/src/main/java/org/torproject/collector/sync/SyncPersistence.java b/src/main/java/org/torproject/collector/sync/SyncPersistence.java
index e230fca..142be7a 100644
--- a/src/main/java/org/torproject/collector/sync/SyncPersistence.java
+++ b/src/main/java/org/torproject/collector/sync/SyncPersistence.java
@@ -18,6 +18,7 @@ import org.torproject.collector.persist.PersistenceUtils;
 import org.torproject.collector.persist.ServerDescriptorPersistence;
 import org.torproject.collector.persist.StatusPersistence;
 import org.torproject.collector.persist.VotePersistence;
+import org.torproject.collector.persist.WebServerAccessLogPersistence;
 import org.torproject.descriptor.BridgeExtraInfoDescriptor;
 import org.torproject.descriptor.BridgeNetworkStatus;
 import org.torproject.descriptor.BridgeServerDescriptor;
@@ -28,6 +29,7 @@ import org.torproject.descriptor.RelayNetworkStatusConsensus;
 import org.torproject.descriptor.RelayNetworkStatusVote;
 import org.torproject.descriptor.RelayServerDescriptor;
 import org.torproject.descriptor.TorperfResult;
+import org.torproject.descriptor.WebServerAccessLog;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -132,6 +134,10 @@ public class SyncPersistence {
         case "TorperfResult":
           descPersist = new OnionPerfPersistence((TorperfResult) desc);
           break;
+        case "WebServerAccessLog":
+          descPersist = new WebServerAccessLogPersistence(
+              (WebServerAccessLog) desc);
+          break;
         default:
           log.trace("Invalid descriptor type {} for sync-merge.",
               clazz.getName());
@@ -149,3 +155,4 @@ public class SyncPersistence {
     }
   }
 }
+
diff --git a/src/main/java/org/torproject/collector/webstats/LogFileMap.java b/src/main/java/org/torproject/collector/webstats/LogFileMap.java
new file mode 100644
index 0000000..c1a6802
--- /dev/null
+++ b/src/main/java/org/torproject/collector/webstats/LogFileMap.java
@@ -0,0 +1,115 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.time.LocalDate;
+import java.util.Optional;
+import java.util.TreeMap;
+import java.util.stream.Stream;
+
+/**
+ * Processes the given path and stores metadata for log files in nested
+ * sorted maps keyed by virtual host, physical host, and log date.
+ * Being a TreeMap itself, the outermost level is keyed by virtual host.
+ */
+public class LogFileMap
+    extends TreeMap<String, TreeMap<String, TreeMap<LocalDate, LogMetadata>>> {
+
+  private static final Logger log = LoggerFactory.getLogger(LogFileMap.class);
+
+  /**
+   * The map to keep track of the logfiles by virtual host,
+   * physical host, and date.  It is filled with the metadata of all
+   * log files found below the given start directory.
+   */
+  public LogFileMap(Path startDir) {
+    collectFiles(this, startDir);
+  }
+
+  /**
+   * Walks the file tree below startDir and adds the metadata of every
+   * recognized log file to the given map.  Failures are only logged, they
+   * never abort the walk.
+   */
+  private void collectFiles(final LogFileMap logFileMap, Path startDir) {
+    SimpleFileVisitor<Path> visitor = new SimpleFileVisitor<Path>() {
+      @Override
+      public FileVisitResult visitFile(Path path, BasicFileAttributes att)
+          throws IOException {
+        // Paths for which no metadata can be created are skipped.
+        LogMetadata.create(path).ifPresent(logFileMap::add);
+        return FileVisitResult.CONTINUE;
+      }
+
+      @Override
+      public FileVisitResult visitFileFailed(Path path, IOException ex)
+          throws IOException {
+        return logIfError(path, ex);
+      }
+
+      @Override
+      public FileVisitResult postVisitDirectory(Path path, IOException ex)
+          throws IOException {
+        return logIfError(path, ex);
+      }
+
+      /** Warns about a failed visit, if any, and continues the walk. */
+      private FileVisitResult logIfError(Path path, IOException ex) {
+        if (ex != null) {
+          log.warn("Cannot process '{}'.", path, ex);
+        }
+        return FileVisitResult.CONTINUE;
+      }
+    };
+    try {
+      Files.walkFileTree(startDir, visitor);
+    } catch (IOException ex) {
+      log.error("Cannot read directory '{}'.", startDir, ex);
+    }
+  }
+
+  /**
+   * Add log metadata to the map structure, creating the nested maps for
+   * its virtual and physical host on first use.  Metadata already stored
+   * for the same hosts and date is replaced.
+   */
+  public void add(LogMetadata metadata) {
+    TreeMap<String, TreeMap<LocalDate, LogMetadata>> physicalHosts
+        = this.computeIfAbsent(metadata.virtualHost, (key) -> new TreeMap<>());
+    physicalHosts
+        .computeIfAbsent(metadata.physicalHost, (key) -> new TreeMap<>())
+        .put(metadata.date, metadata);
+  }
+
+  /**
+   * Takes the given metadata and returns the LogMetadata for the entry
+   * of the next day, i.e., the one stored for the same virtual and
+   * physical host and the following date; empty, if there is none.
+   */
+  public Optional<LogMetadata> nextDayLogFor(LogMetadata metadata) {
+    return Optional.ofNullable(this.get(metadata.virtualHost))
+        .map((physicalHosts) -> physicalHosts.get(metadata.physicalHost))
+        .map((dates) -> dates.get(metadata.date.plusDays(1)));
+  }
+
+  /**
+   * Returns a stream of all contained log metadata, ordered by virtual
+   * host, physical host, and date.
+   */
+  public Stream<LogMetadata> metadataStream() {
+    return this.values().stream()
+        .flatMap((physicalHosts) -> physicalHosts.values().stream())
+        .flatMap((dates) -> dates.values().stream());
+  }
+}
+
diff --git a/src/main/java/org/torproject/collector/webstats/LogMetadata.java b/src/main/java/org/torproject/collector/webstats/LogMetadata.java
new file mode 100644
index 0000000..ee0db1a
--- /dev/null
+++ b/src/main/java/org/torproject/collector/webstats/LogMetadata.java
@@ -0,0 +1,87 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import static org.torproject.descriptor.log.WebServerAccessLogImpl.MARKER;
+
+import org.torproject.descriptor.internal.FileType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Path;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Metadata of a log file (hosts, date, compression) derived from its path.
+ */
+public class LogMetadata {
+
+  private static final Logger log
+      = LoggerFactory.getLogger(LogMetadata.class);
+
+  /** The mandatory web server log descriptor file name pattern. */
+  public static final Pattern filenamePattern
+      = Pattern.compile("(\\S*)-" + MARKER
+      + "-(\\d{8})(?:\\.?)([a-zA-Z0-9]+)$");
+
+  /** The path of the log file to be imported. */
+  public final Path path;
+
+  /** The date the log entries were created. */
+  public final LocalDate date;
+
+  /** The log's compression type. */
+  public final FileType fileType;
+
+  /** The name of the physical host. */
+  public final String physicalHost;
+
+  /** The name of the virtual host. */
+  public final String virtualHost;
+
+  private LogMetadata(Path logPath, String physicalHost, String virtualHost,
+      LocalDate logDate, FileType fileType) {
+    this.path = logPath;
+    this.date = logDate;
+    this.fileType = fileType;
+    this.physicalHost = physicalHost;
+    this.virtualHost = virtualHost;
+  }
+
+  /**
+   * Only way to create a LogMetadata object from a given log path.
+   * Empty, if host names or date cannot be derived from the path.
+   */
+  public static Optional<LogMetadata> create(Path logPath) {
+    try {
+      Path parentPath = logPath.getName(logPath.getNameCount() - 2);
+      Path file = logPath.getFileName();
+      String physicalHost = parentPath.toString();
+      Matcher mat = filenamePattern.matcher(file.toString());
+      if (!mat.find()) {
+        return Optional.empty();
+      }
+      String virtualHost = mat.group(1);
+      // verify date given
+      LocalDate logDate
+          = LocalDate.parse(mat.group(2), DateTimeFormatter.BASIC_ISO_DATE);
+      if (virtualHost.isEmpty() || physicalHost.isEmpty()) {
+        log.debug("Non-matching file encountered: '{}/{}'.",
+            parentPath, file);
+        return Optional.empty();
+      }
+      return Optional.of(new LogMetadata(logPath, physicalHost, virtualHost,
+          logDate, FileType.findType(mat.group(3))));
+    } catch (Exception ex) { // not Throwable: Errors must not be swallowed
+      log.debug("Problem parsing path '{}'.", logPath, ex);
+      return Optional.empty();
+    }
+  }
+}
+
diff --git a/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java b/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java
new file mode 100644
index 0000000..88d62fa
--- /dev/null
+++ b/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java
@@ -0,0 +1,198 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import org.torproject.collector.conf.Configuration;
+import org.torproject.collector.conf.ConfigurationException;
+import org.torproject.collector.conf.Key;
+import org.torproject.collector.conf.SourceType;
+import org.torproject.collector.cron.CollecTorMain;
+
+import org.torproject.collector.persist.PersistenceUtils;
+import org.torproject.collector.persist.WebServerAccessLogPersistence;
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.WebServerAccessLog;
+import org.torproject.descriptor.log.InternalLogDescriptor;
+import org.torproject.descriptor.log.InternalWebServerAccessLog;
+import org.torproject.descriptor.log.WebServerAccessLogImpl;
+import org.torproject.descriptor.log.WebServerAccessLogLine;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.StringJoiner;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * This module processes web-logs for CollecTor according to the weblog
+ * sanitation specification published on metrics.torproject.org.
+ */
+public class SanitizeWeblogs extends CollecTorMain {
+
+  private static final Logger log =
+      LoggerFactory.getLogger(SanitizeWeblogs.class);
+  private static final int LIMIT = 2;
+  private static final String WEBSTATS = "webstats";
+  private String outputPathName;
+  private String recentPathName;
+  private boolean limits;
+
+  /**
+   * Possibly privacy impacting data is replaced by dummy data producing a
+   * log-file (or files) that conform(s) to Apache's Combined Log Format.
+   */
+  public SanitizeWeblogs(Configuration conf) {
+    super(conf);
+    this.mapPathDescriptors.put("recent/webstats", WebServerAccessLog.class);
+  }
+
+  @Override
+  public String module() {
+    return WEBSTATS;
+  }
+
+  @Override
+  protected String syncMarker() {
+    return "Webstats";
+  }
+
+  @Override
+  protected void startProcessing() throws ConfigurationException {
+    try {
+      Files.createDirectories(this.config.getPath(Key.OutputPath));
+      Files.createDirectories(this.config.getPath(Key.RecentPath));
+      this.outputPathName = this.config.getPath(Key.OutputPath).toString();
+      this.recentPathName = this.config.getPath(Key.RecentPath).toString();
+      this.limits = this.config.getBool(Key.WebstatsLimits);
+      if (this.config.getSourceTypeSet(Key.WebstatsSources)
+          .contains(SourceType.Local)) {
+        findCleanWrite(this.config.getPath(Key.WebstatsLocalOrigins));
+        PersistenceUtils.cleanDirectory(this.config.getPath(Key.RecentPath));
+      }
+    } catch (Exception e) {
+      log.error("Cannot sanitize web-logs: " + e.getMessage(), e);
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Sanitizes and stores the logs found below dir, one host at a time. */
+  private void findCleanWrite(Path dir) {
+    LogFileMap fileMapIn = new LogFileMap(dir);
+    log.info("Found log files for {} virtual hosts.", fileMapIn.size());
+    for (Map.Entry<String,TreeMap<String,TreeMap<LocalDate,LogMetadata>>>
+             virtualEntry : fileMapIn.entrySet()) {
+      String virtualHost = virtualEntry.getKey();
+      for (Map.Entry<String, TreeMap<LocalDate, LogMetadata>> physicalEntry
+          : virtualEntry.getValue().entrySet()) {
+        String physicalHost = physicalEntry.getKey();
+        log.info("Processing logs for {} on {}.", virtualHost, physicalHost);
+        Map<LocalDate, List<WebServerAccessLogLine>> linesByDate
+            = physicalEntry.getValue().values().stream().parallel()
+            .flatMap((LogMetadata metadata)
+                -> lineStream(metadata).filter((line) -> line.isValid()))
+            .collect(Collectors.groupingBy(WebServerAccessLogLine::getDate,
+                Collectors.toList()));
+        LocalDate[] interval = determineInterval(linesByDate.keySet());
+        linesByDate.entrySet().stream()
+            .filter((entry) -> entry.getKey().isAfter(interval[0])
+              && entry.getKey().isBefore(interval[1]))
+            .forEach((entry) -> storeSanitized(virtualHost, physicalHost,
+              entry.getKey(), entry.getValue()));
+      }
+    }
+  }
+
+  /** Sanitizes the lines of one day and stores them to out and recent. */
+  private void storeSanitized(String virtualHost, String physicalHost,
+      LocalDate date, List<WebServerAccessLogLine> lines) {
+    String name = new StringJoiner(InternalLogDescriptor.SEP)
+        .add(virtualHost).add(physicalHost)
+        .add(InternalWebServerAccessLog.MARKER)
+        .add(date.format(DateTimeFormatter.BASIC_ISO_DATE)).toString();
+    log.debug("Sanitizing {}.", name);
+    List<String> retainedLines = lines
+        .stream().map((line) -> sanitize(line, date))
+        .filter((line) -> line.isPresent()).map((line) -> line.get())
+        .sorted().collect(Collectors.toList());
+    try {
+      WebServerAccessLogPersistence walp = new WebServerAccessLogPersistence(
+          new WebServerAccessLogImpl(retainedLines, name));
+      log.debug("Storing {}.", name);
+      walp.storeOut(this.outputPathName);
+      walp.storeRecent(this.recentPathName);
+    } catch (DescriptorParseException dpe) {
+      log.error("Cannot store log descriptor {}.", name, dpe);
+    }
+  }
+
+  /** Returns the sanitized line, or empty if the line is to be dropped:
+   * the IP is zeroed and a query part of the request is cut off. */
+  static Optional<String> sanitize(WebServerAccessLogLine logLine,
+      LocalDate date) {
+    if (!logLine.isValid()
+        || !("GET".equals(logLine.getMethod())
+             || "HEAD".equals(logLine.getMethod()))
+        || !logLine.getProtocol().startsWith("HTTP")
+        || 400 == logLine.getResponse() || 404 == logLine.getResponse()) {
+      return Optional.empty();
+    }
+    if (!logLine.getIp().startsWith("0.0.0.")) {
+      logLine.setIp("0.0.0.0");
+    }
+    int queryStart = logLine.getRequest().indexOf("?");
+    if (queryStart > 0) {
+      logLine.setRequest(logLine.getRequest().substring(0, queryStart));
+    }
+    return Optional.of(logLine.toLogString());
+  }
+
+  /** Returns the exclusive bounds of the dates to store; the bounds
+   * (MAX, MIN) admit no date and are returned if no dates remain. */
+  LocalDate[] determineInterval(Set<LocalDate> dates) {
+    SortedSet<LocalDate> sorted = new TreeSet<>(dates);
+    if (this.limits) {
+      for (int i = 0; i < LIMIT - 1 && !sorted.isEmpty(); i++) {
+        sorted.remove(sorted.last());
+      }
+    }
+    if (sorted.isEmpty()) {
+      return new LocalDate[]{LocalDate.MAX, LocalDate.MIN};
+    }
+    if (!this.limits) {
+      sorted.add(sorted.first().minusDays(1));
+      sorted.add(sorted.last().plusDays(1));
+    }
+    return new LocalDate[]{sorted.first(), sorted.last()};
+  }
+
+  /** Reads and parses all lines of a log file; empty on any failure. */
+  private Stream<WebServerAccessLogLine> lineStream(LogMetadata metadata) {
+    log.debug("Processing file {}.", metadata.path);
+    try (BufferedReader br
+        = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
+         metadata.fileType.decompress(Files.readAllBytes(metadata.path)))))) {
+      return br.lines().map((line) -> WebServerAccessLogLine.makeLine(line))
+          .collect(Collectors.toList()).stream(); // read all before closing
+    } catch (Exception ex) {
+      log.debug("Skipping log-file {}.", metadata.path, ex);
+    }
+    return Stream.empty();
+  }
+}
+
diff --git a/src/main/resources/collector.properties b/src/main/resources/collector.properties
index 0a9f932..30dda2a 100644
--- a/src/main/resources/collector.properties
+++ b/src/main/resources/collector.properties
@@ -41,6 +41,12 @@ UpdateindexActivated = false
 UpdateindexPeriodMinutes = 2
 # offset in minutes since the epoch and
 UpdateindexOffsetMinutes = 0
+# the following defines, if this module is activated
+WebstatsActivated = false
+# period in minutes
+WebstatsPeriodMinutes = 360
+# offset in minutes since the epoch and
+WebstatsOffsetMinutes = 31
 ##########################################
 ## All below can be changed at runtime.
 #####
@@ -154,4 +160,16 @@ OnionPerfSyncOrigins = https://collector.torproject.org
 ## the second, etc.:
 ## OnionPerfHosts = http://first.torproject.org/, http://second.torproject.org/
 OnionPerfHosts = https://op-us.onionperf.torproject.net/
-
+######## Tor Weblogs ########
+#
+## Define descriptor sources
+#  possible values: Local, Sync
+WebstatsSources = Local
+#  Retrieve files from the following CollecTor instances.
+#  List of URLs separated by comma.
+WebstatsSyncOrigins = https://collector.torproject.org
+## Relative path to directory to import logfiles from.
+WebstatsLocalOrigins = in/webstats
+# Default 'true' behaves as stated in section 4 of
+# https://metrics.torproject.org/web-server-logs.html
+WebstatsLimits = true
diff --git a/src/test/java/org/torproject/collector/conf/ConfigurationTest.java b/src/test/java/org/torproject/collector/conf/ConfigurationTest.java
index dfb06b2..fcaa71f 100644
--- a/src/test/java/org/torproject/collector/conf/ConfigurationTest.java
+++ b/src/test/java/org/torproject/collector/conf/ConfigurationTest.java
@@ -40,7 +40,7 @@ public class ConfigurationTest {
   public void testKeyCount() throws Exception {
     assertEquals("The number of properties keys in enum Key changed."
         + "\n This test class should be adapted.",
-        45, Key.values().length);
+        52, Key.values().length);
   }
 
   @Test()
diff --git a/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java b/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java
index 79c1bd7..025f96c 100644
--- a/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java
+++ b/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java
@@ -71,6 +71,7 @@ public class CollecTorMainTest {
         case "Bridge":
         case "Exitlist":
         case "OnionPerf":
+        case "Webstats":
           assertNotNull("Property '" + key
               + "' not specified in " + Main.CONF_FILE + ".",
               props.getProperty(key));
diff --git a/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java b/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java
index 2774c8d..489a413 100644
--- a/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java
+++ b/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java
@@ -28,6 +28,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
+import java.util.stream.Collectors;
 
 @RunWith(Parameterized.class)
 public class SyncPersistenceTest {
@@ -49,6 +50,26 @@ public class SyncPersistenceTest {
          Integer.valueOf(1),
          Integer.valueOf(1)},
 
+        {"webstats/archive.torproject.org_"
+             + "archeotrichon.torproject.org_access.log_20151007.xz",
+         new String[]{"webstats/archive.torproject.org/2015/10/07/"
+             + "archive.torproject.org_archeotrichon.torproject.org"
+             + "_access.log_20151007.xz"},
+         "archeotrichon.torproject.org/archive.torproject.org_"
+             + "archeotrichon.torproject.org_access.log_20151007.xz",
+         Integer.valueOf(1),
+         Integer.valueOf(1)},
+
+        {"webstats/metrics.torproject.org_"
+             + "meronense.torproject.org_access.log_20170531.xz",
+         new String[]{"webstats/metrics.torproject.org/2017/05/31/"
+             + "metrics.torproject.org_meronense.torproject.org"
+             + "_access.log_20170531.xz"},
+         "meronense.torproject.org/metrics.torproject.org_"
+             + "meronense.torproject.org_access.log_20170531.gz",
+         Integer.valueOf(1),
+         Integer.valueOf(1)},
+
         {"relay-descriptors/server-descriptors/"
              + "2016-10-05-19-06-17-server-descriptors",
          new String[]{"relay-descriptors/server-descriptor/2016/10/e/3/"
@@ -266,6 +287,9 @@ public class SyncPersistenceTest {
 
   @Test()
   public void testRecentFileContent() throws Exception {
+    if (this.filename.contains(".log")) {
+      return; // Skip this test, because logs are compressed and sanitized.
+    }
     conf = new Configuration();
     makeTemporaryFolders();
     DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
@@ -292,6 +316,9 @@ public class SyncPersistenceTest {
 
   @Test()
   public void testOutFileContent() throws Exception {
+    if (this.filename.contains(".log")) {
+      return; // Skip this test, because logs are compressed and sanitized.
+    }
     conf = new Configuration();
     makeTemporaryFolders();
     DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
@@ -305,9 +332,8 @@ public class SyncPersistenceTest {
     List<String> expContent = linesFromResource(filename);
     int expContentSize = expContent.size();
     for (File file : outputList) {
-      List<String> content = Files.readAllLines(file.toPath(),
-          StandardCharsets.UTF_8);
-      for (String line : content) {
+      for (String line : Files.readAllLines(file.toPath(),
+          StandardCharsets.UTF_8)) {
         assertTrue("Couldn't remove " + line + ".", expContent.remove(line));
         assertEquals(--expContentSize, expContent.size());
       }
@@ -325,49 +351,25 @@ public class SyncPersistenceTest {
   }
 
   private byte[] bytesFromResource() throws Exception {
-    StringBuilder sb = new StringBuilder();
-    BufferedReader br = new BufferedReader(new InputStreamReader(getClass()
-        .getClassLoader().getResourceAsStream(filename)));
-    String line = br.readLine();
-    while (null != line) {
-      sb.append(line).append('\n');
-      line = br.readLine();
-    }
-    return sb.toString().getBytes();
+    return Files.readAllBytes((new File(getClass()
+        .getClassLoader().getResource(filename).toURI())).toPath());
   }
 
   private String stringFromResource() throws Exception {
-    StringBuilder sb = new StringBuilder();
     BufferedReader br = new BufferedReader(new InputStreamReader(getClass()
         .getClassLoader().getResourceAsStream(filename)));
-    String line = br.readLine();
-    while (null != line) {
-      sb.append(line).append('\n');
-      line = br.readLine();
-    }
-    return sb.toString();
+    return br.lines().collect(Collectors.joining("\n", "", "\n"));
   }
 
   private String stringFromFile(File file) throws Exception {
-    StringBuilder sb = new StringBuilder();
-    List<String> lines = Files.readAllLines(file.toPath(),
-        StandardCharsets.UTF_8);
-    for (String line : lines) {
-      sb.append(line).append('\n');
-    }
-    return sb.toString();
+    return Files.lines(file.toPath(), StandardCharsets.UTF_8)
+        .collect(Collectors.joining("\n", "", "\n"));
   }
 
   private List<String> linesFromResource(String filename) throws Exception {
-    List<String> res = new ArrayList<>();
     BufferedReader br = new BufferedReader(new InputStreamReader(getClass()
         .getClassLoader().getResourceAsStream(filename)));
-    String line = br.readLine();
-    while (null != line) {
-      res.add(line);
-      line = br.readLine();
-    }
-    return res;
+    return br.lines().collect(Collectors.toList());
   }
 
 }
diff --git a/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java b/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java
new file mode 100644
index 0000000..d55ba40
--- /dev/null
+++ b/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java
@@ -0,0 +1,33 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.nio.file.Paths;
+import java.util.Optional;
+
+public class LogFileMapTest {
+
+  @Rule
+  public TemporaryFolder tmpf = new TemporaryFolder();
+
+  @Test
+  public void makeLogFileMap() throws Exception {
+    LogFileMap lfm = new LogFileMap(tmpf.newFolder().toPath());
+    for (String path : new String[] {"in/ph1/vh1-access.log-20170901.gz",
+        "in/ph1/vh1-access.log-20170902.xz"}) {
+      Optional<LogMetadata> element
+          = LogMetadata.create(Paths.get(path));
+      assertTrue(element.isPresent());
+      lfm.add(element.get());
+    }
+  }
+
+}
+
diff --git a/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java b/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java
new file mode 100644
index 0000000..6121e8d
--- /dev/null
+++ b/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java
@@ -0,0 +1,82 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.collector.webstats;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Optional;
+
+@RunWith(Parameterized.class)
+public class LogMetadataTest {
+
+  /** Path and expected values of LogMetadata. */
+  @Parameters
+  public static Collection<Object[]> pathResult() {
+    return Arrays.asList(new Object[][] {
+        {Paths.get("in", "ph1", "vh1-error.log-20170902.xz"),
+         "10001010", Boolean.FALSE,
+         "Non-access logs should be discarded."},
+        {Paths.get("in", "ph1", "vh1-access.log-2017.xz"),
+         "10001010", Boolean.FALSE,
+         "Log file should be discarded, because of wrong date format."},
+        {Paths.get("in", "ph1", "vh1-access.log.xz"),
+         "10001010", Boolean.FALSE,
+         "Log file should be discarded, because of the missing date."},
+        {Paths.get("vh1-access.log-20170901.gz"),
+         "10001010", Boolean.FALSE,
+         "Should be discarded because of missing physical host information."},
+        {Paths.get("in", "ph1", "vh1-access.log-20170901.gz"),
+         "20170901", Boolean.TRUE,
+         "Should have been accepted."},
+        {Paths.get("", "vh1-access.log-20170901.gz"),
+         "20170901", Boolean.FALSE,
+         "Should not result in metadata."},
+        {Paths.get("x", "vh1-access.log-.gz"),
+         "20170901", Boolean.FALSE,
+         "Should not result in metadata."},
+        {Paths.get("/collection/download/in/ph2", "vh2-access.log-20180901.xz"),
+         "20180901", Boolean.TRUE,
+         "Should have been accepted."}
+    });
+  }
+
+  private Path path;
+  private LocalDate date;
+  private boolean valid;
+  private String failureMessage;
+
+  /** Set all test values. */
+  public LogMetadataTest(Path path, String dateString, boolean valid,
+      String message) {
+    this.path = path;
+    this.date = LocalDate.parse(dateString, DateTimeFormatter.BASIC_ISO_DATE);
+    this.valid = valid;
+    this.failureMessage = message;
+  }
+
+  @Test
+  public void testCreate() throws Exception {
+    Optional<LogMetadata> element = LogMetadata.create(this.path);
+    assertEquals(this.failureMessage, this.valid, element.isPresent());
+    if (!this.valid) {
+      return;
+    }
+    LogMetadata lmd = element.get();
+    assertEquals(this.date, lmd.date);
+    assertEquals(this.path, lmd.path);
+  }
+
+}
+
diff --git a/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz
new file mode 100644
index 0000000..b459742
Binary files /dev/null and b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz differ
diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz
new file mode 100644
index 0000000..8c2333b
Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz differ





More information about the tor-commits mailing list