[tor-commits] [metrics-lib/master] Add new descriptor type for web server access logs.

karsten at torproject.org karsten at torproject.org
Wed Jan 31 12:30:03 UTC 2018


commit 3cd814d8481c87ee3609783d66ae4e2eec81d290
Author: iwakeh <iwakeh at torproject.org>
Date:   Fri Sep 15 14:07:08 2017 +0000

    Add new descriptor type for web server access logs.
    
    Implements task-22983 and is based on the log-descriptor
    specification.
---
 CHANGELOG.md                                       |   7 +
 .../org/torproject/descriptor/LogDescriptor.java   |  47 ++++++
 .../torproject/descriptor/WebServerAccessLog.java  |  65 ++++++++
 .../descriptor/impl/DescriptorParserImpl.java      |  12 +-
 .../torproject/descriptor/index/package-info.java  |   6 +-
 .../torproject/descriptor/internal/FileType.java   |  53 +++++-
 .../descriptor/internal/package-info.java          |  10 +-
 .../descriptor/log/InternalLogDescriptor.java      |  63 ++++++++
 .../descriptor/log/InternalWebServerAccessLog.java |  17 ++
 .../descriptor/log/LogDescriptorImpl.java          | 163 +++++++++++++++++++
 .../descriptor/log/WebServerAccessLogImpl.java     | 119 ++++++++++++++
 .../descriptor/log/WebServerAccessLogLine.java     | 135 ++++++++++++++++
 .../torproject/descriptor/log/package-info.java    |  14 ++
 .../org/torproject/descriptor/package-info.java    |   6 +-
 .../descriptor/log/LogDescriptorTest.java          | 178 +++++++++++++++++++++
 .../descriptor/log/WebServerAccessLogLineTest.java | 140 ++++++++++++++++
 .../descriptor/log/WebServerAccessLogTest.java     |  94 +++++++++++
 .../descriptor/log/WebServerModuleTest.java        | 113 +++++++++++++
 ...eotrichon.torproject.org_access.log_20151007.xz | Bin 0 -> 4056 bytes
 ...rver.org_dummy.host.net_access.log_20111111.bz2 | Bin 0 -> 76 bytes
 ...meronense.torproject.org_access.log_20170530.gz | Bin 0 -> 388 bytes
 ...meronense.torproject.org_access.log_20170531.gz | Bin 0 -> 388 bytes
 ...eronense.torproject.org_access.log_20170530.log |  26 +++
 ...eronense.torproject.org_access.log_20170607.log |  26 +++
 24 files changed, 1280 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd0dc6a..42e0e09 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+# Changes in version 2.2.0 - 2018-01-??
+
+ * Major changes
+   - Add new descriptor type WebServerAccessLog to parse web server
+     access logs.
+
+
 # Changes in version 2.1.1 - 2017-10-09
 
  * Minor changes
diff --git a/src/main/java/org/torproject/descriptor/LogDescriptor.java b/src/main/java/org/torproject/descriptor/LogDescriptor.java
new file mode 100644
index 0000000..ff02cae
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/LogDescriptor.java
@@ -0,0 +1,47 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor;
+
+import java.util.List;
+
+/**
+ * Contains a log file.
+ *
+ * <p>Unlike other descriptors, logs can get very large and are typically stored
+ * on disk in compressed form. However, all access to log contents through this
+ * interface and its subinterfaces is made available in uncompressed form.</p>
+ *
+ * @since 2.2.0
+ */
+public interface LogDescriptor extends Descriptor {
+
+  /**
+   * Returns the decompressed raw descriptor bytes of the log.
+   *
+   * @since 2.2.0
+   */
+  @Override
+  public byte[] getRawDescriptorBytes();
+
+  /**
+   * Returns annotations found in the log file, which may be an empty List if a
+   * log format does not support adding annotations.
+   *
+   * @since 2.2.0
+   */
+  @Override
+  public List<String> getAnnotations();
+
+  /**
+   * Returns unrecognized lines encountered while parsing the log, which may be
+   * an empty list or a fixed-size list with only a few entries, depending on
+   * the log type.
+   *
+   * @since 2.2.0
+   */
+  @Override
+  public List<String> getUnrecognizedLines();
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
new file mode 100644
index 0000000..b94bc30
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
@@ -0,0 +1,65 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor;
+
+import java.time.LocalDate;
+import java.util.List;
+
+/**
+ * Contains a sanitized web server access log file from a {@code torproject.org}
+ * web server.
+ *
+ * <p>Parsing non-sanitized web server access logs from {@code torproject.org}
+ * web servers or other web servers is not explicitly supported, but may work
+ * anyway.</p>
+ *
+ * @since 2.2.0
+ */
+public interface WebServerAccessLog extends LogDescriptor {
+
+  /**
+   * Returns the date when requests contained in the log have been started,
+   * which is parsed from the log file path.
+   *
+   * <p>Typical web server access logs may contain date information in their
+   * file path, too, but that would be the date when the log file was rotated,
+   * which is not necessarily the same date as the date in contained request
+   * lines.</p>
+   *
+   * @since 2.2.0
+   */
+  public LocalDate getLogDate();
+
+  /**
+   * Returns the hostname of the physical host writing this log file, which is
+   * parsed from the log file path.
+   *
+   * <p>A physical host can serve multiple virtual hosts, and a virtual host can
+   * be served by multiple physical hosts.</p>
+   *
+   * @since 2.2.0
+   */
+  public String getPhysicalHost();
+
+  /**
+   * Returns the hostname of the virtual host that this log file was written
+   * for, which is parsed from the log file path.
+   *
+   * <p>A physical host can serve multiple virtual hosts, and a virtual host can
+   * be served by multiple physical hosts.</p>
+   *
+   * @since 2.2.0
+   */
+  public String getVirtualHost();
+
+  /**
+   * Returns at most three unrecognized lines encountered while parsing the log.
+   *
+   * @since 2.2.0
+   */
+  @Override
+  public List<String> getUnrecognizedLines();
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java b/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java
index d32c031..f244abb 100644
--- a/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java
+++ b/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java
@@ -9,6 +9,10 @@ import static org.torproject.descriptor.impl.DescriptorImpl.SP;
 import org.torproject.descriptor.Descriptor;
 import org.torproject.descriptor.DescriptorParseException;
 import org.torproject.descriptor.DescriptorParser;
+import org.torproject.descriptor.log.LogDescriptorImpl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.lang.reflect.Constructor;
@@ -19,6 +23,9 @@ import java.util.List;
 
 public class DescriptorParserImpl implements DescriptorParser {
 
+  private static final Logger log
+      = LoggerFactory.getLogger(DescriptorParserImpl.class);
+
   @Override
   public Iterable<Descriptor> parseDescriptors(byte[] rawDescriptorBytes,
       File descriptorFile, String fileName) {
@@ -26,8 +33,7 @@ public class DescriptorParserImpl implements DescriptorParser {
       return this.detectTypeAndParseDescriptors(rawDescriptorBytes,
           descriptorFile, fileName);
     } catch (DescriptorParseException e) {
-      /* Looks like we attempted to parse the whole raw descriptor bytes at once
-       * below and ran into a parse issue. */
+      log.debug("Cannot parse descriptor file ’{}’.", descriptorFile, e);
       List<Descriptor> parsedDescriptors = new ArrayList<>();
       parsedDescriptors.add(new UnparseableDescriptorImpl(rawDescriptorBytes,
           new int[] { 0, rawDescriptorBytes.length }, descriptorFile, e));
@@ -124,6 +130,8 @@ public class DescriptorParserImpl implements DescriptorParser {
     } else if (firstLines.startsWith("@type torperf 1.")) {
       return TorperfResultImpl.parseTorperfResults(rawDescriptorBytes,
           descriptorFile);
+    } else if (descriptorFile.getName().contains(LogDescriptorImpl.MARKER)) {
+      return LogDescriptorImpl.parse(rawDescriptorBytes, descriptorFile);
     } else {
       throw new DescriptorParseException("Could not detect descriptor "
           + "type in descriptor starting with '" + firstLines + "'.");
diff --git a/src/main/java/org/torproject/descriptor/index/package-info.java b/src/main/java/org/torproject/descriptor/index/package-info.java
index c685c63..021cbea 100644
--- a/src/main/java/org/torproject/descriptor/index/package-info.java
+++ b/src/main/java/org/torproject/descriptor/index/package-info.java
@@ -2,14 +2,12 @@
  * See LICENSE for licensing information */
 
 /**
- * <h1>This package is still in alpha stage.</h1>
- * <p>The public interface might still change in unexpected ways.</p> 
+ * <h1>This package is part of the implementation not the public API.</h1>
+ * <p>The public interface might change in unexpected ways.</p>
  *
  * <p>Interfaces and essential classes for obtaining and processing
  * CollecTor's index.json file.</p>
  *
- * <p>Interfaces and classes make the content of index.json available.</p>
- *
  *
  * @since 1.4.0
  */
diff --git a/src/main/java/org/torproject/descriptor/internal/FileType.java b/src/main/java/org/torproject/descriptor/internal/FileType.java
index 36b5df8..353f0bb 100644
--- a/src/main/java/org/torproject/descriptor/internal/FileType.java
+++ b/src/main/java/org/torproject/descriptor/internal/FileType.java
@@ -12,6 +12,8 @@ import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.OutputStream;
 
@@ -43,6 +45,8 @@ public enum FileType {
   /**
    * Returns <code>valueOf</code> or the default enum {@link #PLAIN}, i.e.,
    * this method doesn't throw any exceptions and allways returns a valid enum.
+   *
+   * @since 2.1.0
    */
   public static FileType findType(String ext) {
     FileType res = null;
@@ -54,16 +58,61 @@ public enum FileType {
     }
   }
 
-  /** Return the appropriate input stream. */
+  /**
+   * Return the appropriate input stream.
+   *
+   * @since 1.4.0
+   */
   public InputStream inputStream(InputStream is) throws Exception {
     return this.inClass.getConstructor(new Class[]{InputStream.class})
         .newInstance(is);
   }
 
-  /** Return the appropriate output stream. */
+  /**
+   * Return the appropriate output stream.
+   *
+   * @since 1.4.0
+   */
   public OutputStream outputStream(OutputStream os) throws Exception {
     return this.outClass.getConstructor(new Class[]{OutputStream.class})
         .newInstance(os);
   }
+
+  /**
+   * Compresses the given bytes in memory and returns the compressed bytes.
+   *
+   * @since 2.2.0
+   */
+  public byte[] compress(byte[] bytes) throws Exception {
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    try (OutputStream os = this.outputStream(baos)) {
+      os.write(bytes);
+      os.flush();
+    }
+    return baos.toByteArray();
+  }
+
+  /**
+   * Decompresses the given bytes in memory and returns the decompressed bytes.
+   *
+   * @since 2.2.0
+   */
+  public byte[] decompress(byte[] bytes) throws Exception {
+    if (0 == bytes.length) {
+      return bytes;
+    }
+    try (InputStream is
+        = this.inputStream(new ByteArrayInputStream(bytes));
+        ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
+      /* read() signals end of stream with -1; a zero byte is regular data. */
+      int readByte = is.read();
+      while (readByte >= 0) {
+        baos.write(readByte);
+        readByte = is.read();
+      }
+      return baos.toByteArray();
+    }
+  }
+
 }
 
diff --git a/src/main/java/org/torproject/descriptor/internal/package-info.java b/src/main/java/org/torproject/descriptor/internal/package-info.java
index 5bc7bcb..b845921 100644
--- a/src/main/java/org/torproject/descriptor/internal/package-info.java
+++ b/src/main/java/org/torproject/descriptor/internal/package-info.java
@@ -2,11 +2,13 @@
  * See LICENSE for licensing information */
 
 /**
- * <h1>This package is part of the implementation not the public API.</h1>
- * <p>The public interface might change in unexpected ways.</p>
+ * Interfaces and essential classes for obtaining and processing
+ * descriptors.
+ *
+ * <p><strong>This package is part of the implementation not the
+ * public API.</strong></p>
  *
- * <p>Interfaces and essential classes for obtaining and processing
- *  descriptors.</p>
+ * <p>The public interface might change in unexpected ways.</p>
  *
  * @since 2.1.0
  */
diff --git a/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java b/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java
new file mode 100644
index 0000000..3c0039b
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java
@@ -0,0 +1,63 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.LogDescriptor;
+
+/**
+ * This interface provides methods for internal use only.
+ *
+ * @since 2.2.0
+ */
+public interface InternalLogDescriptor extends LogDescriptor {
+
+  /** Logfile name parts separator. */
+  public static final String SEP = "_";
+
+  /**
+   * Validate log lines.
+   *
+   * @since 2.2.0
+   */
+  public void validate() throws DescriptorParseException;
+
+  /**
+   * Set the <code>Validator</code> that will perform the validation on log
+   * lines.
+   *
+   * <p>Usually set by the implementing class.</p>
+   *
+   * @since 2.2.0
+   */
+  public void setValidator(Validator validator);
+
+  /**
+   * Set the descriptor's bytes.
+   *
+   * @since 2.2.0
+   */
+  public void setRawDescriptorBytes(byte[] bytes);
+
+  /** Return the descriptor's preferred compression. */
+  public String getCompressionType();
+
+  /**
+   * Provides a single function for validating a single log line.
+   *
+   * @since 2.2.0
+   */
+  public interface Validator {
+
+    /**
+     * Verifies a log line.
+     *
+     * @since 2.2.0
+     */
+    public boolean validate(String line);
+
+  }
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java b/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java
new file mode 100644
index 0000000..540f25d
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java
@@ -0,0 +1,17 @@
+/* Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+/**
+ * This interface provides methods for internal use only.
+ *
+ * @since 2.2.0
+ */
+public interface InternalWebServerAccessLog extends InternalLogDescriptor {
+
+  /** The log's name should include this string. */
+  public static final String MARKER = "access.log";
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java b/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java
new file mode 100644
index 0000000..97854e4
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java
@@ -0,0 +1,163 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.LogDescriptor;
+import org.torproject.descriptor.internal.FileType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for log descriptors.
+ *
+ * @since 2.2.0
+ */
+public abstract class LogDescriptorImpl
+    implements LogDescriptor, InternalLogDescriptor {
+
+  /** The log's file name should contain this string. */
+  public static final String MARKER = ".log";
+
+  private static final int unrecognizedLinesLimit = 3;
+
+  private static final Logger log
+      = LoggerFactory.getLogger(LogDescriptorImpl.class);
+
+  private static Pattern filenamePattern = Pattern.compile(
+      "(?:\\S*)" + MARKER + SEP + "(?:[0-9a-zA-Z]*)(?:\\.?)([a-zA-Z2]*)");
+
+  private final File descriptorFile;
+
+  /** Byte array for plain, i.e. uncompressed, log data. */
+  private byte[] logBytes;
+
+  private FileType fileType;
+
+  private List<String> unrecognizedLines = new ArrayList<>();
+
+  private Validator validator = (String line) -> true;
+
+  /**
+   * This constructor performs basic operations on the given bytes.
+   *
+   * <p>An unknown compression type (see {@link #getCompressionType})
+   * is interpreted as missing compression.  In this case the bytes are
+   * used as given and the given type becomes the preferred compression.</p>
+   *
+   * @since 2.2.0
+   */
+  protected LogDescriptorImpl(byte[] logBytes, File descriptorFile,
+       FileType defaultCompression) throws DescriptorParseException {
+    this.logBytes = logBytes;
+    this.descriptorFile = descriptorFile;
+    try {
+      Matcher mat = filenamePattern.matcher(descriptorFile.getName());
+      if (!mat.find()) {
+        throw new DescriptorParseException(
+            "Log file name doesn't comply to standard: " + descriptorFile);
+      }
+      this.fileType = FileType.findType(mat.group(1).toUpperCase());
+      if (FileType.PLAIN == this.fileType) {
+        this.fileType = defaultCompression;
+      } else {
+        this.logBytes = this.fileType.decompress(this.logBytes);
+      }
+    } catch (Exception ex) {
+      throw new DescriptorParseException("Cannot parse file "
+          + descriptorFile.getName(), ex);
+    }
+  }
+
+  @Override
+  public void validate() throws DescriptorParseException {
+    try (BufferedReader br
+         = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
+         this.logBytes)))) {
+      this.unrecognizedLines.addAll(br.lines().parallel().filter((line)
+          -> null != line && !line.isEmpty() && !validator.validate(line))
+          .limit(unrecognizedLinesLimit).collect(Collectors.toList()));
+    } catch (Exception ex) {
+      throw new DescriptorParseException("Cannot validate log lines.", ex);
+    }
+  }
+
+  /**
+   * Assemble a LogDescriptor.
+   *
+   * @since 2.2.0
+   */
+  public static List<Descriptor> parse(byte[] logBytes,
+      File descriptorFile) throws DescriptorParseException {
+    if (descriptorFile.getName().contains(InternalWebServerAccessLog.MARKER)) {
+      return Arrays.asList(new Descriptor[]{
+          new WebServerAccessLogImpl(logBytes, descriptorFile)});
+    } else {
+      throw new DescriptorParseException("Cannot parse file "
+          + descriptorFile.getName());
+    }
+  }
+
+  public static byte[] collectionToBytes(Collection<String> lines) {
+    return lines.stream().collect(Collectors.joining("\n", "", "\n"))
+        .getBytes();
+  }
+
+  @Override
+  public void setValidator(Validator validator) {
+    this.validator = validator;
+  }
+
+  @Override
+  public String getCompressionType() {
+    return this.fileType.name().toLowerCase();
+  }
+
+  @Override
+  public byte[] getRawDescriptorBytes() {
+    return this.logBytes;
+  }
+
+  @Override
+  public void setRawDescriptorBytes(byte[] bytes) {
+    this.logBytes = bytes;
+  }
+
+  @Override
+  public int getRawDescriptorLength() {
+    return this.logBytes.length;
+  }
+
+  @Override
+  public List<String> getAnnotations() {
+    return Collections.emptyList();
+  }
+
+  @Override
+  public List<String> getUnrecognizedLines() {
+    return this.unrecognizedLines;
+  }
+
+  @Override
+  public File getDescriptorFile() {
+    return descriptorFile;
+  }
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
new file mode 100644
index 0000000..6708c3a
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
@@ -0,0 +1,119 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.WebServerAccessLog;
+import org.torproject.descriptor.internal.FileType;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Implementation of web server access log descriptors.
+ *
+ * <p>Defines sanitization and validation for web server access logs.</p>
+ *
+ * @since 2.2.0
+ */
+public class WebServerAccessLogImpl extends LogDescriptorImpl
+    implements InternalWebServerAccessLog, WebServerAccessLog {
+
+  private static final Logger log
+      = LoggerFactory.getLogger(WebServerAccessLogImpl.class);
+
+  /** The log's name should include this string. */
+  public static final String MARKER = InternalWebServerAccessLog.MARKER;
+
+  /** The mandatory web server log descriptor file name pattern. */
+  public static final Pattern filenamePattern
+      = Pattern.compile("(\\S*)" + SEP + "(\\S*)" + SEP + "" + MARKER
+      + SEP + "(\\d*)(?:\\.?)([a-zA-Z]*)");
+
+  private final String physicalHost;
+
+  private final String virtualHost;
+
+  private final LocalDate logDate;
+
+  /**
+   * Creates a WebServerAccessLog from the given bytes and filename.
+   *
+   * <p>The given bytes are read, whereas the file is not read.</p>
+   *
+   * <p>The name of the given file has to comply with the following
+   * naming pattern
+   * {@code
+   * <virtualHost>_<physicalHost>_access.log_<yyyymmdd>.<compression>},
+   * where an unknown compression type (see {@link #getCompressionType})
+   * is interpreted as missing compression.  In this case the bytes are
+   * used as given and XZ is recorded as the preferred compression type.
+   * Virtual host, physical host, and log date are all parsed from the file
+   * name.</p>
+   */
+  protected WebServerAccessLogImpl(byte[] logBytes, File file)
+      throws DescriptorParseException {
+    this(logBytes, file, FileType.XZ);
+  }
+
+  /** For internal use only. */
+  public WebServerAccessLogImpl(Collection<String> lines, String filename)
+      throws DescriptorParseException {
+    this(LogDescriptorImpl.collectionToBytes(lines), new File(filename));
+  }
+
+  private WebServerAccessLogImpl(byte[] logBytes, File file,
+      FileType defaultCompression) throws DescriptorParseException {
+    super(logBytes, file, defaultCompression);
+    try {
+      String fn = file.toPath().getFileName().toString();
+      Matcher mat = filenamePattern.matcher(fn);
+      if (!mat.find()) {
+        throw new DescriptorParseException(
+            "WebServerAccessLog file name doesn't comply to standard: " + fn);
+      }
+      this.virtualHost = mat.group(1);
+      this.physicalHost = mat.group(2);
+      if (null == this.virtualHost || null == this.physicalHost
+          || this.virtualHost.isEmpty() || this.physicalHost.isEmpty()) {
+        throw new DescriptorParseException(
+            "WebServerAccessLog file name doesn't comply to standard: " + fn);
+      }
+      String ymd = mat.group(3);
+      this.logDate = LocalDate.parse(ymd, DateTimeFormatter.BASIC_ISO_DATE);
+      this.setValidator((line)
+          -> WebServerAccessLogLine.makeLine(line).isValid());
+      this.validate();
+    } catch (DescriptorParseException dpe) {
+      throw dpe; // escalate
+    } catch (Exception pe) {
+      throw new DescriptorParseException(
+          "Cannot parse WebServerAccessLog file: " + file, pe);
+    }
+  }
+
+  @Override
+  public String getPhysicalHost() {
+    return this.physicalHost;
+  }
+
+  @Override
+  public String getVirtualHost() {
+    return this.virtualHost;
+  }
+
+  @Override
+  public LocalDate getLogDate() {
+    return this.logDate;
+  }
+
+}
+
diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java
new file mode 100644
index 0000000..ab20dd2
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java
@@ -0,0 +1,135 @@
+/* Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.LocalDate;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.Optional;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A single web server access log line as parsed by {@link #makeLine}.
+ */
+public class WebServerAccessLogLine {
+
+  private static final Logger log = LoggerFactory
+      .getLogger(WebServerAccessLogLine.class);
+
+  private static final String DATE_PATTERN = "dd/MMM/yyyy";
+  /* Month abbreviations are English, independent of the default locale. */
+  private static final DateTimeFormatter dateTimeFormatter = DateTimeFormatter
+      .ofPattern(DATE_PATTERN + ":HH:mm:ss xxxx", java.util.Locale.US);
+
+  private static Pattern logLinePattern = Pattern.compile(
+      "^((?:\\d{1,3}\\.){3}\\d{1,3}) (\\S+) (\\S+) "
+      + "\\[([\\w/]+)([\\w:]+)(\\s[+\\-]\\d{4})\\] "
+      + "\"([A-Z]+) ([^\"]+) ([A-Z]+/\\d\\.\\d)\" "
+      + "(\\d{3}) (\\d+|-)(.*)");
+
+  private String ip;
+  private int response;
+  private String request;
+  private String method;
+  private String dateString;
+  private LocalDate date;
+  private String protocol;
+  private Optional<Integer> size;
+  private boolean valid = false;
+  private String type;
+
+  /** Returns the log line string, or an empty string for an invalid line. */
+  public String toLogString() {
+    return this.valid ? toString() : "";
+  }
+
+  @Override
+  public String toString() {
+    return String.format("%s - - [%s:00:00:00 +0000] \"%s %s %s\" %d %s",
+        this.ip, this.dateString, this.method, this.request, this.type,
+        this.response, this.size.isPresent() ? this.size.get() : "-");
+  }
+
+  /** Returns the string of the date using 'dd/MMM/yyyy' format. */
+  public String getDateString() {
+    return dateString;
+  }
+
+  /** Returns a string containing the ip. */
+  public String getIp() {
+    return this.ip;
+  }
+
+  /** Only used internally during sanitization. */
+  public void setIp(String ip) {
+    this.ip = ip;
+  }
+
+  public String getMethod() {
+    return this.method;
+  }
+
+  public String getProtocol() {
+    return this.protocol;
+  }
+
+  public String getRequest() {
+    return this.request;
+  }
+
+  public int getResponse() {
+    return this.response;
+  }
+
+  /** Only used internally during sanitization. */
+  public void setRequest(String request) {
+    this.request = request;
+  }
+
+  public LocalDate getDate() {
+    return this.date;
+  }
+
+  public boolean isValid() {
+    return this.valid;
+  }
+
+  /** Parses the given line; the result is invalid if parsing fails. */
+  public static WebServerAccessLogLine makeLine(String line) {
+    WebServerAccessLogLine res = new WebServerAccessLogLine();
+    try {
+      Matcher mat = logLinePattern.matcher(line);
+      if (mat.find()) {
+        res.response = Integer.valueOf(mat.group(10));
+        res.method = mat.group(7);
+        res.protocol = mat.group(9);
+        String dateTimeString = mat.group(4) + mat.group(5) + mat.group(6);
+        res.date = ZonedDateTime.parse(dateTimeString,
+            dateTimeFormatter).withZoneSameInstant(ZoneOffset.UTC)
+            .toLocalDate();
+        res.dateString = res.date.format(
+            DateTimeFormatter.ofPattern(DATE_PATTERN, java.util.Locale.US));
+        res.ip = mat.group(1);
+        res.request = mat.group(8);
+        res.type = mat.group(9);
+        if ("-".equals(mat.group(11))) {
+          res.size = Optional.empty();
+        } else {
+          res.size = Optional.of(Integer.valueOf(mat.group(11)));
+        }
+        res.valid = true;
+      }
+    } catch (Throwable th) {
+      log.debug("Unmatchable line: '{}'.", line, th);
+      return new WebServerAccessLogLine();
+    }
+    return res;
+  }
+
+}
diff --git a/src/main/java/org/torproject/descriptor/log/package-info.java b/src/main/java/org/torproject/descriptor/log/package-info.java
new file mode 100644
index 0000000..68bcfa1
--- /dev/null
+++ b/src/main/java/org/torproject/descriptor/log/package-info.java
@@ -0,0 +1,14 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+/**
+ * <h1>This package is part of the implementation not the public API.</h1>
+ * <p>The public interface might change in unexpected ways.</p>
+ *
+ * <p>Interfaces and essential classes for obtaining and processing
+ * log descriptors.</p>
+ *
+ * @since 2.2.0
+ */
+package org.torproject.descriptor.log;
+
diff --git a/src/main/java/org/torproject/descriptor/package-info.java b/src/main/java/org/torproject/descriptor/package-info.java
index 0410bac..d844d40 100644
--- a/src/main/java/org/torproject/descriptor/package-info.java
+++ b/src/main/java/org/torproject/descriptor/package-info.java
@@ -65,9 +65,11 @@
  * connected to the Tor network rather than by the Tor software.  This
  * group comprises descriptors by the bridge distribution service BridgeDB
  * ({@link org.torproject.descriptor.BridgePoolAssignment}), the exit list
- * service TorDNSEL ({@link org.torproject.descriptor.ExitList}), and the
+ * service TorDNSEL ({@link org.torproject.descriptor.ExitList}), the
  * performance measurement service Torperf
- * ({@link org.torproject.descriptor.TorperfResult}).</li>
+ * ({@link org.torproject.descriptor.TorperfResult}), and sanitized access logs
+ * of Tor's web servers
+ * ({@link org.torproject.descriptor.WebServerAccessLog}).</li>
  * </ol>
  *
  * @since 1.0.0
diff --git a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
new file mode 100644
index 0000000..b12cfc0
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
@@ -0,0 +1,178 @@
+
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorParser;
+import org.torproject.descriptor.DescriptorReader;
+import org.torproject.descriptor.DescriptorSourceFactory;
+import org.torproject.descriptor.LogDescriptor;
+import org.torproject.descriptor.UnparseableDescriptor;
+import org.torproject.descriptor.WebServerAccessLog;
+
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+@RunWith(Parameterized.class)
+public class LogDescriptorTest {
+
+  /** Temporary folder containing all files for this test. */
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  /** Directory containing the input descriptor file for this test. */
+  protected File indir;
+
+  /** Descriptor reader used in this test. */
+  protected DescriptorReader reader
+      = DescriptorSourceFactory.createDescriptorReader();
+
+  protected int size;
+  protected String[] pan;
+  protected Class<LogDescriptor> type;
+  protected boolean isDecompressionTest;
+
+  /** All types of data that can be encountered during sync. */
+  @Parameters
+  public static Collection<Object[]> pathAndName() {
+    return Arrays.asList(new Object[][] {
+        {Boolean.TRUE, 1878, new String[]{"meronense.torproject.org",
+            "metrics.torproject.org_meronense.torproject.org_access.log"
+            + "_20170530.gz",
+            "metrics.torproject.org", "20170530", "gz"},
+         WebServerAccessLog.class},
+        {Boolean.FALSE, 1878, new String[]{"meronense.torproject.org",
+            "xy.host.org_meronense.torproject.org_access.log_20170530.log",
+            "metrics.torproject.org", "20170530", "xz"},
+         WebServerAccessLog.class},
+        {Boolean.TRUE, 70730, new String[]{"archeotrichon.torproject.org",
+            "archive.torproject.org_archeotrichon.torproject.org_access.log_"
+            + "20151007.xz",
+            "archive.torproject.org", "20151007", "xz"},
+         WebServerAccessLog.class},
+        {Boolean.TRUE, 0, new String[]{"dummy.host.net",
+            "nix.server.org_dummy.host.net_access.log_20111111.bz2",
+            "nix.server.org", "20111111", "bz2"},
+         WebServerAccessLog.class}});
+  }
+
+  /** This constructor receives the above defined data for each run. */
+  public LogDescriptorTest(boolean decompression, int size, String[] pan,
+      Class<LogDescriptor> type) {
+    this.pan = pan;
+    this.size = size;
+    this.type = type;
+    this.isDecompressionTest = decompression;
+  }
+
+  /** Prepares the temporary folder and writes files to it for this test. */
+  private void createTemporaryFolderAndContents() throws IOException {
+    this.indir = this.temp.newFolder();
+    String path = this.pan[0];
+    String name = this.pan[1];
+    File logdir = new File(indir, path);
+    logdir.mkdir();
+    File accessLogFile = new File(logdir, name);
+    Files.copy(getClass().getClassLoader().getResource(path + "/" + name)
+        .openStream(), accessLogFile.toPath());
+  }
+
+  /** Read the test files. */
+  @Before
+  public void readAll() throws IOException {
+    createTemporaryFolderAndContents();
+    Iterator<Descriptor> descs = this.reader
+        .readDescriptors(this.indir).iterator();
+    while (descs.hasNext()) {
+      descs.next();
+    }
+  }
+
+  protected List<Descriptor> retrieve() throws Exception {
+    assertEquals(1, this.reader.getParsedFiles().size());
+    File logFile = new File(this.reader.getParsedFiles().firstKey());
+    byte[] raw = Files.readAllBytes(logFile.toPath());
+    DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
+    List<Descriptor> descs = new ArrayList<>();
+    for (Descriptor desc
+        : dp.parseDescriptors(raw, logFile, logFile.getName())) {
+      descs.add(desc);
+    }
+    return descs;
+  }
+
+  @Test
+  public void testParsing() throws Exception {
+    List<Descriptor> descs = retrieve();
+    assertTrue("Wrong type. " + dataUsed(),
+        (descs.get(0) instanceof LogDescriptor));
+    InternalLogDescriptor ld = (InternalLogDescriptor) descs.get(0);
+    assertEquals("Wrong compression type string. " + dataUsed(),
+        pan[4], ld.getCompressionType());
+  }
+
+  private String dataUsed() {
+    return "Used data: " + Arrays.toString(pan);
+  }
+
+  @Test
+  public void testUnknownLogType() throws Exception {
+    assertEquals(dataUsed(), 1, this.reader.getParsedFiles().size());
+    File logFile = new File(this.reader.getParsedFiles().firstKey());
+    DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
+    File invalidFile = new File(this.reader.getParsedFiles().firstKey()
+        .replace("access", "-"));
+    List<Descriptor> descs = new ArrayList<>();
+    for (Descriptor desc // note: only 'invalidFile' is used by LogDescriptor
+        : dp.parseDescriptors(new byte[]{}, invalidFile, logFile.getName())) {
+      descs.add(desc);
+    }
+    assertTrue(dataUsed() + "\nWrong type: "
+        + Arrays.toString(descs.get(0).getClass().getInterfaces()),
+        (descs.get(0) instanceof UnparseableDescriptor));
+  }
+
+  @Test
+  public void testCompressionInvalid() throws Exception {
+    if (!isDecompressionTest) { // Only runs for the decompression data sets.
+      return;
+    }
+    assertEquals(1, this.reader.getParsedFiles().size());
+    File logFile = new File(this.reader.getParsedFiles().firstKey());
+    byte[] raw = Files.readAllBytes(logFile.toPath());
+    for (int i = 0; i < 3; i++) { // Corrupt the first three bytes, not just one.
+      raw[i] = (byte) i;
+    }
+    DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser();
+    List<Descriptor> descs = new ArrayList<>();
+    for (Descriptor desc
+           : dp.parseDescriptors(raw, logFile, logFile.getName())) {
+      descs.add(desc);
+    }
+    assertTrue(dataUsed() + "\nWrong type: "
+        + Arrays.toString(descs.get(0).getClass().getInterfaces()),
+        (descs.get(0) instanceof UnparseableDescriptor));
+    assertArrayEquals(dataUsed(), raw, descs.get(0).getRawDescriptorBytes());
+  }
+}
+
diff --git a/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java
new file mode 100644
index 0000000..ec23b61
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java
@@ -0,0 +1,140 @@
+/* Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameter;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class WebServerAccessLogLineTest {
+
+  /** Test data structure:
+   * real log line, cleaned line, is valid.
+   */
+  @Parameters
+  public static Collection<Object[]> logData() {
+    return Arrays.asList(new Object[][] {
+        { "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+          + "/collector/archive HTTP/1.1\" 301 -",
+          "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+          + "/collector/archive HTTP/1.1\" 301 -", Boolean.TRUE},
+        { "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+          + "/collector/archive HTTP/1.1\" 301 X \"ccc\"",
+          "", Boolean.FALSE},
+        { "123.98.100.23 xyz xyz [22/Jan/2018:01:20:03 +0000] \"GET "
+          + "/collector/archive HTTP/1.1\" 301 - xyz abc xxxXXXXXXXX",
+          "123.98.100.23 - - [22/Jan/2018:00:00:00 +0000] \"GET "
+          + "/collector/archive HTTP/1.1\" 301 -", Boolean.TRUE},
+        { "127.0.0.1 abc xyz [03/May/2017:06:07:08 +0000] "
+          + "\"GET /server-status HTTP/1.1\" 303 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "127.0.0.1 - - [03/May/2017:00:00:00 +0000] \"GET /server-status"
+          + " HTTP/1.1\" 303 294", Boolean.TRUE},
+        { "127.0.0.1 abc xyz [03/May/2017:06:07:08 +0000] "
+          + "\"GET /server-status?auto HTTP/1.1\" 303 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "127.0.0.1 - - [03/May/2017:00:00:00 +0000] \"GET /server-status"
+          + "?auto HTTP/1.1\" 303 294", Boolean.TRUE},
+        { "42.41.40.39 - - [04/May/2017:06:07:08 +0000] "
+          + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "42.41.40.39 - - [04/May/2017:00:00:00 +0000] \"HEAD /server-status"
+          + "?auto HTTP/1.1\" 200 294", Boolean.TRUE},
+        { "42.41.39 - - [04/May/2017:06:07:08 +0000] "
+          + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "", Boolean.FALSE},
+        { "42.41.40.1039 - - [04/May/2017:06:07:08 +0000] "
+          + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "", Boolean.FALSE},
+        { "42.41.40_039 - - [04/May/2017:06:07:08 +0000] "
+          + "\"HEAD /server-status?auto HTTP/1.1\" 200 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "", Boolean.FALSE},
+        { "0.0.0.2 - - [05/May/2017:15:16:17 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "404 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+          + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;\"",
+          "0.0.0.2 - - [05/May/2017:00:00:00 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "404 536",
+          Boolean.TRUE},
+        { "0.0.0.99 - - [05/June/2017:15:16:17 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico FTP/1.0\" "
+          + "300 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+          + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;\"",
+          "", Boolean.FALSE},
+        { "0.0.0.99 - - [05/Jun/2017:15:16:17 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico FTP/1.0\" "
+          + "300 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+          + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;\"",
+          "0.0.0.99 - - [05/Jun/2017:00:00:00 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico FTP/1.0\" 300 536",
+          Boolean.TRUE},
+        { "0.0.0.7 - - [06/May/2017:00:16:17 +0100] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "333 536 \"-\" \"Mozilla/4.0 (compatible; Opera 7.0; Windows 6.0;"
+          + " funky values ; \"",
+          "0.0.0.7 - - [05/May/2017:00:00:00 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "333 536", Boolean.TRUE},
+        { "0.0.0.1 - - [07/Dec/2016:20:16:18 -1000] "
+          + "\"GET http://t3.torproject.org/?query=what HTTP/1.1\" "
+          + "200 777 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows 10;"
+          + " SLCC1; .NET CLR 2.0; Media Center PC 5.0; .NET CLR 3.5.2)\"",
+          "0.0.0.1 - - [08/Dec/2016:00:00:00 +0000] "
+          + "\"GET http://t3.torproject.org/?query=what HTTP/1.1\" 200 777",
+          Boolean.TRUE},
+        { "abcdefghijklmnop1234567890", "", Boolean.FALSE},
+        { "", "", Boolean.FALSE},
+        { "0.0.0.7 - - [06/May/2017:00:16:17 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "333 536 \"-\" \"Mozilla/4.0 (compatible; Opera 7.0; Windows 8.0;",
+          "0.0.0.7 - - [06/May/2017:00:00:00 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "333 536", Boolean.TRUE},
+        { "0.0.0.7 - - [06/May/2017:00:16:17 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "333 536 \"-\" \"Mozilla/4.0 (compatible; Opera 7.0; Windows XT;",
+          "0.0.0.7 - - [06/May/2017:00:00:00 +0000] "
+          + "\"GET http://metrics.torproject.org/favicon.ico HTTP/1.1\" "
+          + "333 536", Boolean.TRUE},
+        { "0.0.0.0 - - [08/May/2017:00:00:00 +0000] "
+          + "\"GET /server-status HTTP/1.1\" 200 1294",
+          "0.0.0.0 - - [08/May/2017:00:00:00 +0000] \"GET "
+          + "/server-status HTTP/1.1\" 200 1294", Boolean.TRUE}
+        });
+  }
+
+  @Parameter(0)
+  public String real;
+
+  @Parameter(1)
+  public String clean;
+
+  @Parameter(2)
+  public boolean valid;
+
+  @Test
+  public void testValidation() {
+    WebServerAccessLogLine line = WebServerAccessLogLine.makeLine(real);
+    assertEquals("Failed on line: " + real, valid, line.isValid());
+    assertEquals("Failed on line: " + real, clean, line.toLogString());
+    if (valid && !"".equals(clean)) { // A cleaned, accepted line is valid.
+      assertEquals("Failed on line: " + clean, clean,
+          WebServerAccessLogLine.makeLine(clean).toLogString());
+    }
+  }
+
+}
+
diff --git a/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java
new file mode 100644
index 0000000..3e98f13
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java
@@ -0,0 +1,94 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class WebServerAccessLogTest {
+
+  /** Test data structure: given line, cleaned line, valid, filename. */
+  @Parameters
+  public static Collection<Object[]> logData() {
+    return Arrays.asList(new Object[][] {
+        { "0.0.0.0 - - [20/Sep/2017:00:00:00 +0000] "
+          + "\"GET /fonts/WOFF/OTF/SourceSansPro-It.otf.woff HTTP/1.1\" "
+          + "200 50556 \"https://metrics.torproject.org/\" \"-\" -",
+          "0.0.0.0 - - [20/Sep/2017:00:00:00 +0000] \"GET "
+          + "/fonts/WOFF/OTF/SourceSansPro-It.otf.woff HTTP/1.1\" 200 50556\n",
+          Boolean.TRUE, "virt.host0_phys.host1a_access.log_20170920"},
+        { "127.0.0.1 qwer 123 [30/May/2017:06:07:08 +0000] "
+          + "\"GET /server-status?auto HTTP/1.1\" 333 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET /server-status"
+          + " HTTP/1.1\" 333 294\n", Boolean.TRUE,
+          "virt.host1_phys.host2a_access.log_20170530"},
+        { "0.0.0.3 abc 567 [30/May/2017:06:07:08 +0000] "
+          + "\"GET /server-status?auto HTTP/1.1\" 333 294 "
+          + "\"-\" \"munin/2.0.25-1+deb8u3 (libwww-perl/6.08)\"",
+          "0.0.0.3 - - [30/May/2017:00:00:00 +0000] \"GET /server-status"
+          + " HTTP/1.1\" 333 294\n", Boolean.TRUE,
+          "virt-host1_phys.host2a_access.log_20170530"},
+        { "11.22.33.44 - - [30/Jul/2017:15:16:17 +0000] "
+          + "\"GET http://www.torproject.org/favicon.ico HTTP/1.1\" "
+          + "100 536 \"-\" \"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;"
+          + " SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.5.2; "
+          + ".NET CLR 3.5.30729; .NET CLR 3.0.30618)\"",
+          "0.0.0.0 - - [30/Jul/2017:00:00:00 +0000] "
+          + "\"GET http://www.torproject.org/favicon.ico HTTP/1.1\" "
+          + "100 536\n", Boolean.TRUE,
+          "virt.host1_phys.host2b_access.log_20170730"},
+        { "abcdefghijklmnop1234567890", "", Boolean.FALSE,
+          "vhost1_phys.host2c_access.log_20170731.log"},
+        { "", "", Boolean.FALSE, "host2d_host1_access.log_20170731.log"},
+        { "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "
+          + "\"GET /server-status HTTP/1.1\" 200 1294 \"-\" \"-\" -",
+          "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+          + "/server-status HTTP/1.1\" 200 1294\n", Boolean.TRUE,
+          "some/other/path/virtual_physical_access.log_20170530.log"}
+        });
+  }
+
+  private String real;
+  private String clean;
+  private int count;
+  private boolean valid;
+  private String fn;
+  private File file;
+
+  /** Set the above test data. */
+  public WebServerAccessLogTest(String in, String out, boolean valid,
+      String filename) {
+    this.real = in;
+    this.clean = out;
+    this.valid = valid;
+    this.fn = filename;
+    this.file = new File(filename);
+  }
+
+  @Test
+  public void testValidation() throws Exception {
+    WebServerAccessLogImpl wsal
+        = new WebServerAccessLogImpl(real.getBytes(), file);
+    wsal.validate();
+    if (valid) {
+      assertEquals(0, wsal.getUnrecognizedLines().size());
+    } else {
+      if (!real.isEmpty()) {
+        assertEquals(real, wsal.getUnrecognizedLines().get(0));
+      }
+    }
+  }
+
+}
+
diff --git a/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java b/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java
new file mode 100644
index 0000000..a11bc30
--- /dev/null
+++ b/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java
@@ -0,0 +1,113 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.descriptor.log;
+
+import static org.junit.Assert.assertEquals;
+
+import org.torproject.descriptor.DescriptorParseException;
+
+import org.hamcrest.Matchers;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+/** This class contains various tests for the webstats module. */
+public class WebServerModuleTest {
+
+  @Rule
+  public ExpectedException thrown = ExpectedException.none();
+
+  @Test
+  public void testWrongFormat() throws Exception {
+    String filename = "h1_phys1_access.log_no-date.log";
+    thrown.expect(DescriptorParseException.class);
+    thrown.expectMessage(Matchers
+         .containsString("Cannot parse WebServerAccessLog file: "
+         + filename));
+    new WebServerAccessLogImpl(new byte[0],
+        new File(filename));
+  }
+
+  @Test
+  public void testDateFormat() throws Exception {
+    String filename = "h2_phys2_access.log_05001713";
+    thrown.expect(DescriptorParseException.class);
+    thrown.expectMessage(Matchers
+         .containsString("Cannot parse WebServerAccessLog file: "
+         + filename));
+    new WebServerAccessLogImpl(new byte[0],
+        new File(filename));
+  }
+
+  @Test
+  public void testNoParentPathRoot() throws Exception {
+    String filename = "h3_access.log_05001213";
+    thrown.expect(DescriptorParseException.class);
+    thrown.expectMessage(Matchers
+         .containsString("WebServerAccessLog "
+         + "file name doesn't comply to standard: " + filename));
+    new WebServerAccessLogImpl(new byte[0],
+        new File(filename));
+  }
+
+  @Test
+  public void testNoParentPathThis() throws Exception {
+    String filename = "_h3_access.log_05001213";
+    thrown.expect(DescriptorParseException.class);
+    thrown.expectMessage(Matchers
+         .containsString("WebServerAccessLog "
+         + "file name doesn't comply to standard: " + filename));
+    new WebServerAccessLogImpl(new byte[0],
+        new File(filename));
+  }
+
+  @Test
+  public void testNoParentPathParent() throws Exception {
+    String filename = "h3__access.log_05001213";
+    thrown.expect(DescriptorParseException.class);
+    thrown.expectMessage(Matchers
+         .containsString("WebServerAccessLog "
+         + "file name doesn't comply to standard: " + filename));
+    new WebServerAccessLogImpl(new byte[0],
+        new File(filename));
+  }
+
+  private static String[] logLines = {
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1205",
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1203",
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1207",
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1204",
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1202",
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1206",
+      "0.0.0.0 - - [30/May/2017:00:00:00 +0000] \"GET "
+      + "/server-status HTTP/1.1\" 200 1201"
+  };
+
+  private static String logText = Arrays.asList(logLines).stream()
+      .map((String line) -> line + (" some content"))
+      .collect(Collectors.joining("\n"));
+
+  @Test
+  public void testBasics() throws Exception {
+    WebServerAccessLogImpl wsal = new WebServerAccessLogImpl(logText.getBytes(),
+        new File("vhost_host7_access.log_20170530"));
+    assertEquals(0, wsal.getAnnotations().size()); // expected, then actual
+    assertEquals(logText, new String(wsal.getRawDescriptorBytes()));
+    assertEquals("host7", wsal.getPhysicalHost());
+    assertEquals("vhost", wsal.getVirtualHost());
+  }
+
+}
+
diff --git a/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz
new file mode 100644
index 0000000..b459742
Binary files /dev/null and b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz differ
diff --git a/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 b/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2
new file mode 100644
index 0000000..17f335d
Binary files /dev/null and b/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 differ
diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz
new file mode 100644
index 0000000..8c2333b
Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz differ
diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz
new file mode 100644
index 0000000..8c2333b
Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz differ
diff --git a/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log
new file mode 100644
index 0000000..eee478b
--- /dev/null
+++ b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log
@@ -0,0 +1,26 @@
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2873 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2877 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:39:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:42:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:49:50 +0000] "GET / HTTP/1.1" 200 868 "-" "Wget/1.15 (linux-gnu)"
+123.456.789.0 - - [30/May/2017:06:52:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2872 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:54:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:56:54 +0000] "-" 408 0 "-" "-"
+123.456.789.0 - - [30/May/2017:06:57:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2876 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+
diff --git a/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log
new file mode 100644
index 0000000..eee478b
--- /dev/null
+++ b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log
@@ -0,0 +1,26 @@
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2873 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2877 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:39:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:42:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:49:50 +0000] "GET / HTTP/1.1" 200 868 "-" "Wget/1.15 (linux-gnu)"
+123.456.789.0 - - [30/May/2017:06:52:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2872 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "apache-munin/1.0libwww-perl/6.08"
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+123.456.789.0 - - [30/May/2017:06:54:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)"
+123.456.789.0 - - [30/May/2017:06:56:54 +0000] "-" 408 0 "-" "-"
+123.456.789.0 - - [30/May/2017:06:57:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2876 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"
+



More information about the tor-commits mailing list