[tor-commits] [metrics-web/master] Adapt webstats to read logs from CollecTor.

karsten at torproject.org karsten at torproject.org
Sat Mar 24 09:32:22 UTC 2018


commit 7be397890d2ff66d7479b52aa245afdd3d487f9d
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Mar 22 21:37:28 2018 +0100

    Adapt webstats to read logs from CollecTor.
    
    In this change we:
     - update metrics-lib to 2.2.0,
     - start downloading and processing logs from CollecTor rather than
       from webstats.torproject.org,
     - change log line counts from int to long,
     - remove webstats tests, which are now contained in metrics-lib,
     - update the CollecTor page, and
     - take out the beta notice from the Tor web server logs page.
    
    Implements #25520.
---
 build.xml                                          |   2 +-
 .../metrics/stats/collectdescs/Main.java           |   3 +-
 .../torproject/metrics/stats/webstats/Main.java    | 223 ++++++---------------
 src/main/resources/spec/web-server-logs.xml        |   4 -
 src/main/resources/web/jsps/collector.jsp          |  38 ++++
 src/main/resources/web/jsps/web-server-logs.jsp    |  16 +-
 src/main/sql/webstats/init-webstats.sql            |   2 +-
 .../metrics/stats/webstats/MainTest.java           | 110 ----------
 8 files changed, 106 insertions(+), 292 deletions(-)

diff --git a/build.xml b/build.xml
index e98757e..57eab68 100644
--- a/build.xml
+++ b/build.xml
@@ -9,7 +9,7 @@
   <property name="javadoc-title" value="MetricsWeb API Documentation"/>
   <property name="implementation-title" value="metrics-web" />
   <property name="release.version" value="1.0.3-dev" />
-  <property name="metricslibversion" value="2.1.1" />
+  <property name="metricslibversion" value="2.2.0" />
   <property name="jetty.version" value="-9.2.21.v20170120" />
   <property name="warfile"
             value="metrics-web-${release.version}.war"/>
diff --git a/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java b/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java
index 04dc86d..4c64425 100644
--- a/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java
@@ -24,7 +24,8 @@ public class Main {
             "/recent/relay-descriptors/consensuses/",
             "/recent/relay-descriptors/extra-infos/",
             "/recent/relay-descriptors/server-descriptors/",
-            "/recent/torperf/"
+            "/recent/torperf/",
+            "/recent/webstats/"
         }, 0L, new File("../../shared/in"), true);
   }
 }
diff --git a/src/main/java/org/torproject/metrics/stats/webstats/Main.java b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
index f70963f..5d11114 100644
--- a/src/main/java/org/torproject/metrics/stats/webstats/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
@@ -3,14 +3,19 @@
 
 package org.torproject.metrics.stats.webstats;
 
-import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import static java.util.stream.Collectors.counting;
+import static java.util.stream.Collectors.groupingByConcurrent;
+
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorParseException;
+import org.torproject.descriptor.DescriptorSourceFactory;
+import org.torproject.descriptor.WebServerAccessLog;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.BufferedReader;
+import java.io.File;
 import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.URL;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -23,19 +28,17 @@ import java.sql.ResultSet;
 import java.sql.SQLException;
 import java.sql.Statement;
 import java.text.DateFormat;
-import java.text.ParseException;
 import java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.Calendar;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.SortedSet;
 import java.util.TimeZone;
 import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 /** Main class of the webstats module that downloads log files from the server,
  * imports them into a database, and exports aggregate statistics to a CSV
@@ -45,26 +48,6 @@ public class Main {
   /** Logger for this class. */
   private static Logger log = LoggerFactory.getLogger(Main.class);
 
-  /** Pattern for links contained in directory listings. */
-  static final Pattern URL_STRING_PATTERN =
-      Pattern.compile(".*<a href=\"([^\"]+)\">.*");
-
-  static final Pattern LOG_FILE_URL_PATTERN =
-      Pattern.compile("^.*/([^/]+)/([^/]+)-access.log-(\\d{8}).xz$");
-
-  private static DateFormat logDateFormat;
-
-  static {
-    logDateFormat = new SimpleDateFormat("yyyyMMdd");
-    logDateFormat.setLenient(false);
-    logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
-  }
-
-  static final Pattern LOG_LINE_PATTERN = Pattern.compile(
-      "^0.0.0.[01] - - \\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] "
-      + "\"(GET|HEAD) ([^ ]{1,2048}) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" "
-      + "-$");
-
   private static final String LOG_DATE = "log_date";
 
   private static final String REQUEST_TYPE = "request_type";
@@ -88,12 +71,10 @@ public class Main {
     log.info("Starting webstats module.");
     String dbUrlString = "jdbc:postgresql:webstats";
     Connection connection = connectToDatabase(dbUrlString);
-    SortedSet<String> previouslyImportedLogFileUrls =
-        queryImportedFiles(connection);
-    String baseUrl = "https://webstats.torproject.org/out/";
-    SortedSet<String> newLogFileUrls = downloadDirectoryListings(baseUrl,
-        previouslyImportedLogFileUrls);
-    importLogFiles(connection, newLogFileUrls);
+    SortedSet<String> skipFiles = queryImportedFileNames(connection);
+    importLogFiles(connection, skipFiles,
+        new File("../../shared/in/recent/webstats"),
+        new File("../../shared/in/archive/webstats"));
     SortedSet<String> statistics = queryWebstats(connection);
     writeStatistics(Paths.get("stats", "webstats.csv"), statistics);
     disconnectFromDatabase(connection);
@@ -109,79 +90,55 @@ public class Main {
     return connection;
   }
 
-  static SortedSet<String> queryImportedFiles(Connection connection)
+  static SortedSet<String> queryImportedFileNames(Connection connection)
       throws SQLException {
-    log.info("Querying URLs of previously imported log files.");
+    log.info("Querying previously imported log files.");
     SortedSet<String> importedLogFileUrls = new TreeSet<>();
     Statement st = connection.createStatement();
-    String queryString = "SELECT url FROM files";
+    String queryString = "SELECT server, site, log_date FROM files";
+    DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyyMMdd");
     try (ResultSet rs = st.executeQuery(queryString)) {
       while (rs.next()) {
-        importedLogFileUrls.add(rs.getString(1));
+        importedLogFileUrls.add(String.format("%s_%s_access.log_%s.xz",
+            rs.getString(1), rs.getString(2),
+            rs.getDate(3).toLocalDate().format(dateFormat)));
       }
     }
-    log.info("Found {} URLs of previously imported log files.",
+    log.info("Found {} previously imported log files.",
         importedLogFileUrls.size());
     return importedLogFileUrls;
   }
 
-  static SortedSet<String> downloadDirectoryListings(String baseUrl,
-      SortedSet<String> importedLogFileUrls) throws IOException {
-    log.info("Downloading directory listings from {}.", baseUrl);
-    List<String> directoryListings = new ArrayList<>();
-    directoryListings.add(baseUrl);
-    SortedSet<String> newLogFileUrls = new TreeSet<>();
-    while (!directoryListings.isEmpty()) {
-      String urlString = directoryListings.remove(0);
-      if (urlString.endsWith("/")) {
-        directoryListings.addAll(downloadDirectoryListing(urlString));
-      } else if (!urlString.endsWith(".xz")) {
-        log.debug("Skipping unrecognized URL {}.", urlString);
-      } else if (!importedLogFileUrls.contains(urlString)) {
-        newLogFileUrls.add(urlString);
+  static void importLogFiles(Connection connection, SortedSet<String> skipFiles,
+      File... inDirectories) {
+    for (Descriptor descriptor : DescriptorSourceFactory
+        .createDescriptorReader().readDescriptors(inDirectories)) {
+      if (!(descriptor instanceof WebServerAccessLog)) {
+        continue;
       }
-    }
-    log.info("Found {} URLs of log files that have not yet been imported.",
-        newLogFileUrls.size());
-    return newLogFileUrls;
-  }
-
-  static List<String> downloadDirectoryListing(String urlString)
-      throws IOException {
-    log.debug("Downloading directory listing from {}.", urlString);
-    List<String> urlStrings = new ArrayList<>();
-    try (BufferedReader br = new BufferedReader(new InputStreamReader(
-        new URL(urlString).openStream()))) {
-      String line;
-      while ((line = br.readLine()) != null) {
-        Matcher matcher = URL_STRING_PATTERN.matcher(line);
-        if (matcher.matches() && !matcher.group(1).startsWith("/")) {
-          urlStrings.add(urlString + matcher.group(1));
-        }
+      WebServerAccessLog logFile = (WebServerAccessLog) descriptor;
+      if (skipFiles.contains(logFile.getDescriptorFile().getName())) {
+        continue;
       }
-    }
-    return urlStrings;
-  }
-
-  static void importLogFiles(Connection connection,
-      SortedSet<String> newLogFileUrls) {
-    log.info("Downloading, parsing, and importing {} log files.",
-        newLogFileUrls.size());
-    for (String urlString : newLogFileUrls) {
       try {
-        Object[] metaData = parseMetaData(urlString);
-        if (metaData == null) {
-          continue;
-        }
-        Map<String, Integer> parsedLogLines = downloadAndParseLogFile(
-            urlString);
-        importLogLines(connection, urlString, metaData, parsedLogLines);
-      } catch (IOException | ParseException exc) {
-        log.warn("Cannot download or parse log file with URL {}.  Retrying "
-            + "in the next run.", urlString, exc);
+        Map<String, Long> parsedLogLines = logFile.logLines().parallel()
+            /* The following mapping can be removed with metrics-lib
+               version > 2.2.0 */
+            .map(line -> (WebServerAccessLog.Line) line)
+            .collect(groupingByConcurrent(line
+                -> String.format("%s %s %d", line.getMethod().name(),
+                truncateString(line.getRequest(), 2048), line.getResponse()),
+                counting()));
+        importLogLines(connection, logFile.getDescriptorFile().getName(),
+            logFile.getPhysicalHost(), logFile.getVirtualHost(),
+            logFile.getLogDate(), parsedLogLines);
+      } catch (DescriptorParseException exc) {
+        log.warn("Cannot parse log file with file name {}.  Retrying in the "
+            + "next run.", logFile.getDescriptorFile().getName(), exc);
       } catch (SQLException exc) {
-        log.warn("Cannot import log file with URL {} into the database.  "
-            + "Rolling back and retrying in the next run.", urlString, exc);
+        log.warn("Cannot import log file with file name {} into the database. "
+            + "Rolling back and retrying in the next run.",
+            logFile.getDescriptorFile().getName(), exc);
         try {
           connection.rollback();
         } catch (SQLException exceptionWhileRollingBack) {
@@ -191,68 +148,9 @@ public class Main {
     }
   }
 
-  private static Object[] parseMetaData(String urlString)
-      throws ParseException {
-    log.debug("Importing log file {}.", urlString);
-    if (urlString.contains("-ssl-access.log-")) {
-      log.debug("Skipping log file containing SSL requests with URL {}.",
-          urlString);
-      return null;
-    }
-    Matcher logFileUrlMatcher = LOG_FILE_URL_PATTERN.matcher(urlString);
-    if (!logFileUrlMatcher.matches()) {
-      log.debug("Skipping log file with unrecognized URL {}.", urlString);
-      return null;
-    }
-    String server = logFileUrlMatcher.group(1);
-    String site = logFileUrlMatcher.group(2);
-    long logDateMillis = logDateFormat.parse(logFileUrlMatcher.group(3))
-        .getTime();
-    return new Object[] { server, site, logDateMillis };
-  }
-
-  static Map<String, Integer> downloadAndParseLogFile(String urlString)
-      throws IOException {
-    int skippedLines = 0;
-    Map<String, Integer> parsedLogLines = new HashMap<>();
-    try (BufferedReader br = new BufferedReader(new InputStreamReader(
-        new XZCompressorInputStream(new URL(urlString).openStream())))) {
-      String line;
-      while ((line = br.readLine()) != null) {
-        if (!parseLogLine(line, parsedLogLines)) {
-          skippedLines++;
-        }
-      }
-    }
-    if (skippedLines > 0) {
-      log.debug("Skipped {} lines while parsing log file {}.", skippedLines,
-          urlString);
-    }
-    return parsedLogLines;
-  }
-
-  static boolean parseLogLine(String logLine,
-      Map<String, Integer> parsedLogLines) {
-    Matcher logLineMatcher = LOG_LINE_PATTERN.matcher(logLine);
-    if (!logLineMatcher.matches()) {
-      return false;
-    }
-    String method = logLineMatcher.group(1);
-    String resource = logLineMatcher.group(2);
-    int responseCode = Integer.parseInt(logLineMatcher.group(3));
-    String combined = String.format("%s %s %d", method, resource,
-        responseCode);
-    if (!parsedLogLines.containsKey(combined)) {
-      parsedLogLines.put(combined, 1);
-    } else {
-      parsedLogLines.put(combined, parsedLogLines.get(combined) + 1);
-    }
-    return true;
-  }
-
   private static void importLogLines(Connection connection, String urlString,
-      Object[] metaData, Map<String, Integer> parsedLogLines)
-      throws SQLException {
+      String server, String site, LocalDate logDate,
+      Map<String, Long> parsedLogLines) throws SQLException {
     PreparedStatement psFiles = connection.prepareStatement(
         "INSERT INTO files (url, server, site, " + LOG_DATE + ") "
         + "VALUES (?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS);
@@ -264,20 +162,17 @@ public class Main {
     PreparedStatement psRequests = connection.prepareStatement(
         "INSERT INTO requests (file_id, method, resource_id, response_code, "
         + COUNT + ") VALUES (?, CAST(? AS method), ?, ?, ?)");
-    String server = (String) metaData[0];
-    String site = (String) metaData[1];
-    long logDateMillis = (long) metaData[2];
-    int fileId = insertFile(psFiles, urlString, server, site, logDateMillis);
+    int fileId = insertFile(psFiles, urlString, server, site, logDate);
     if (fileId < 0) {
       log.debug("Skipping previously imported log file {}.", urlString);
       return;
     }
-    for (Map.Entry<String, Integer> requests : parsedLogLines.entrySet()) {
+    for (Map.Entry<String, Long> requests : parsedLogLines.entrySet()) {
       String[] keyParts = requests.getKey().split(" ");
       String method = keyParts[0];
       String resource = keyParts[1];
       int responseCode = Integer.parseInt(keyParts[2]);
-      int count = requests.getValue();
+      long count = requests.getValue();
       int resourceId = insertResource(psResourcesSelect, psResourcesInsert,
           resource);
       if (resourceId < 0) {
@@ -290,18 +185,18 @@ public class Main {
           count);
     }
     connection.commit();
-    log.debug("Finished importing log file with URL {} into database.",
+    log.debug("Finished importing log file with file name {} into database.",
         urlString);
   }
 
   private static int insertFile(PreparedStatement psFiles, String urlString,
-      String server, String site, long logDateMillis) throws SQLException {
+      String server, String site, LocalDate logDate) throws SQLException {
     int fileId = -1;
     psFiles.clearParameters();
     psFiles.setString(1, truncateString(urlString, 2048));
     psFiles.setString(2, truncateString(server, 32));
     psFiles.setString(3, truncateString(site, 128));
-    psFiles.setDate(4, new Date(logDateMillis));
+    psFiles.setDate(4, Date.valueOf(logDate));
     psFiles.execute();
     try (ResultSet rs = psFiles.getGeneratedKeys()) {
       if (rs.next()) {
@@ -312,14 +207,14 @@ public class Main {
   }
 
   private static void insertRequest(PreparedStatement psRequests, int fileId,
-      String method, int resourceId, int responseCode, int count)
+      String method, int resourceId, int responseCode, long count)
       throws SQLException {
     psRequests.clearParameters();
     psRequests.setInt(1, fileId);
     psRequests.setString(2, method);
     psRequests.setInt(3, resourceId);
     psRequests.setInt(4, responseCode);
-    psRequests.setInt(5, count);
+    psRequests.setLong(5, count);
     psRequests.execute();
   }
 
diff --git a/src/main/resources/spec/web-server-logs.xml b/src/main/resources/spec/web-server-logs.xml
index c180f8c..5c2011f 100644
--- a/src/main/resources/spec/web-server-logs.xml
+++ b/src/main/resources/spec/web-server-logs.xml
@@ -20,10 +20,6 @@
   </front>
   <middle>
     <section title="Purpose of this document">
-      <t>BETA: As of November 14, 2017, this document is still under
-      discussion and subject to change without prior notice. Feel free
-      to <eref target="/about.html#contact">contact us</eref> for questions or
-      concerns regarding this document.</t>
       <t>Tor's web servers, like most web servers, keep request logs for
       maintenance and informational purposes.</t>
       <t>However, unlike most other web servers, Tor's web servers use a
diff --git a/src/main/resources/web/jsps/collector.jsp b/src/main/resources/web/jsps/collector.jsp
index 33ae7dd..13865ba 100644
--- a/src/main/resources/web/jsps/collector.jsp
+++ b/src/main/resources/web/jsps/collector.jsp
@@ -168,6 +168,15 @@
   <td><a href="/collector/recent/torperf/" class="btn btn-primary btn-xs pull-left"><i class="fa fa-chevron-right" aria-hidden="true"></i> recent</a>
       <a href="/collector/archive/torperf/" class="btn btn-primary btn-xs pull-right"><i class="fa fa-chevron-right" aria-hidden="true"></i> archive</a></td>
 </tr>
+<tr class="tableHeadline">
+  <td colspan="3"><b><a href="#webstats">Tor web server logs</a></b></td>
+</tr>
+<tr>
+  <td><a href="#type-webstats">Tor web server logs</a></td>
+  <td></td>
+  <td><a href="/collector/recent/webstats/" class="btn btn-primary btn-xs pull-left"><i class="fa fa-chevron-right" aria-hidden="true"></i> recent</a>
+      <a href="/collector/archive/webstats/" class="btn btn-primary btn-xs pull-right"><i class="fa fa-chevron-right" aria-hidden="true"></i> archive</a></td>
+</tr>
 </tbody>
 </table>
 
@@ -694,6 +703,35 @@ measurement; optional.</li>
 <li><code>SOURCEADDRESS:</code> Public IP address of the OnionPerf host obtained by connecting to well-known servers and finding the IP address in the result, which may be <code>"unknown"</code> if OnionPerf was not able to find this information; optional.</li>
 </ul>
 
+
+
+<br>
+<h2 id="webstats" class="hover">Tor web server logs
+<a href="#webstats" class="anchor">#</a>
+</h2>
+
+<p>
+Tor's web servers, like most web servers, keep request logs for maintenance and
+informational purposes.
+However, unlike most other web servers, Tor's web servers use a privacy-aware
+log format that avoids logging too sensitive data about their users.
+Also unlike most other web server logs, Tor's logs are neither archived nor
+analyzed before performing a number of post-processing steps to further reduce
+any privacy-sensitive parts.
+</p>
+
+<h3 id="type-webstats" class="hover">Tor web server logs
+<a href="/collector/recent/webstats/" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> recent</a>
+<a href="/collector/archive/webstats/" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> archive</a>
+<a href="#type-webstats" class="anchor">#</a>
+</h3>
+
+<p>
+The data format and sanitizing steps for Tor web server logs are specified in
+detail on a separate <a href="web-server-logs.html">page</a>.
+</p>
+
+
     </div>
 
     <br>
diff --git a/src/main/resources/web/jsps/web-server-logs.jsp b/src/main/resources/web/jsps/web-server-logs.jsp
index 8832b2a..530b2ab 100644
--- a/src/main/resources/web/jsps/web-server-logs.jsp
+++ b/src/main/resources/web/jsps/web-server-logs.jsp
@@ -22,37 +22,31 @@
 "#rfc.section.1">1.</a> <a href=
 "#n-purpose-of-this-document">Purpose of this document</a></h2>
 <div id="rfc.section.1.p.1">
-<p>BETA: As of November 14, 2017, this document is still under
-discussion and subject to change without prior notice. Feel free to
-<a href="/about.html#contact">contact us</a> for questions or
-concerns regarding this document.</p>
-</div>
-<div id="rfc.section.1.p.2">
 <p>Tor's web servers, like most web servers, keep request logs for
 maintenance and informational purposes.</p>
 </div>
-<div id="rfc.section.1.p.3">
+<div id="rfc.section.1.p.2">
 <p>However, unlike most other web servers, Tor's web servers use a
 privacy-aware log format that avoids logging too sensitive data
 about their users.</p>
 </div>
-<div id="rfc.section.1.p.4">
+<div id="rfc.section.1.p.3">
 <p>Also unlike most other web server logs, Tor's logs are neither
 archived nor analyzed before performing a number of post-processing
 steps to further reduce any privacy-sensitive parts.</p>
 </div>
-<div id="rfc.section.1.p.5">
+<div id="rfc.section.1.p.4">
 <p>This document describes 1) meta-data contained in log file names
 written by Tor's web servers, 2) the privacy-aware log format used
 in these files, and 3) subsequent sanitizing steps that are applied
 before archiving and analyzing these log files.</p>
 </div>
-<div id="rfc.section.1.p.6">
+<div id="rfc.section.1.p.5">
 <p>As a basis for our current implementation this document also
 describes the naming conventions for the input log files, which is
 just a description of the current state and subject to change.</p>
 </div>
-<div id="rfc.section.1.p.7">
+<div id="rfc.section.1.p.6">
 <p>As a convention for this document, all format strings conform to
 the format strings used by <a href=
 "http://httpd.apache.org/docs/current/mod/mod_log_config.html">Apache's
diff --git a/src/main/sql/webstats/init-webstats.sql b/src/main/sql/webstats/init-webstats.sql
index e44205f..1396fa5 100644
--- a/src/main/sql/webstats/init-webstats.sql
+++ b/src/main/sql/webstats/init-webstats.sql
@@ -22,7 +22,7 @@ CREATE TABLE requests (
   method METHOD NOT NULL,
   resource_id INTEGER REFERENCES resources (resource_id) NOT NULL,
   response_code SMALLINT NOT NULL,
-  count INTEGER NOT NULL,
+  count BIGINT NOT NULL,
   UNIQUE (file_id, method, resource_id, response_code)
 );
 
diff --git a/src/test/java/org/torproject/metrics/stats/webstats/MainTest.java b/src/test/java/org/torproject/metrics/stats/webstats/MainTest.java
deleted file mode 100644
index a4e88d1..0000000
--- a/src/test/java/org/torproject/metrics/stats/webstats/MainTest.java
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2017--2018 The Tor Project
- * See LICENSE for licensing information */
-
-package org.torproject.metrics.stats.webstats;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import org.junit.Test;
-
-import java.util.regex.Matcher;
-
-public class MainTest {
-
-  static final String SAMPLE_LOG_FILE_NAME =
-      "metrics.torproject.org-access.log-20170117.xz";
-
-  static final String SAMPLE_SUBDIRECTORY_NAME = "meronense.torproject.org/";
-
-  static final String SAMPLE_LOG_FILE_URL =
-      "https://webstats.torproject.org/out/meronense.torproject.org/"
-      + "metrics.torproject.org-access.log-20170117.xz";
-
-  static final String[] SAMPLE_LOG_LINES = new String[] {
-      "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
-      + "\"GET / HTTP/1.0\" 200 10532 \"-\" \"-\" -",
-      "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
-      + "\"HEAD /bubbles.html HTTP/1.1\" 200 - \"-\" \"-\" -"
-  };
-
-  @Test
-  public void testUrlStringPatternComplete() {
-    Matcher matcher = Main.URL_STRING_PATTERN.matcher(
-        "<img src=\"/icons/unknown.gif\" alt=\"[   ]\"> "
-        + "<a href=\"" + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME
-        + "</a> 2017-01-19 19:43  5.6K  ");
-    assertTrue(matcher.matches());
-    assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1));
-  }
-
-  @Test
-  public void testUrlStringPatternOnlyATag() {
-    Matcher matcher = Main.URL_STRING_PATTERN.matcher("<a href=\""
-        + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME + "</a>");
-    assertTrue(matcher.matches());
-    assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1));
-  }
-
-  @Test
-  public void testUrlStringPatternSubdirectory() {
-    Matcher matcher = Main.URL_STRING_PATTERN.matcher(
-        "<a href=\"" + SAMPLE_SUBDIRECTORY_NAME + "\">"
-        + SAMPLE_SUBDIRECTORY_NAME + "/</a>");
-    assertTrue(matcher.matches());
-    assertEquals(SAMPLE_SUBDIRECTORY_NAME, matcher.group(1));
-  }
-
-  @Test
-  public void testUrlStringPatternAnythingBetweenDoubleQuotesHtml() {
-    Matcher matcher = Main.URL_STRING_PATTERN.matcher(
-        "<a href=\"anything-between-double-quotes.html\">Link/</a>");
-    assertTrue(matcher.matches());
-    assertEquals("anything-between-double-quotes.html", matcher.group(1));
-  }
-
-  @Test
-  public void testLogFileUrlPatternComplete() {
-    Matcher matcher = Main.LOG_FILE_URL_PATTERN.matcher(SAMPLE_LOG_FILE_URL);
-    assertTrue(matcher.matches());
-    assertEquals("meronense.torproject.org", matcher.group(1));
-    assertEquals("metrics.torproject.org", matcher.group(2));
-    assertEquals("20170117", matcher.group(3));
-  }
-
-  @Test
-  public void testLogLinePatternGetSlash() {
-    Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[0]);
-    assertTrue(matcher.matches());
-    assertEquals("GET", matcher.group(1));
-    assertEquals("/", matcher.group(2));
-    assertEquals("200", matcher.group(3));
-  }
-
-  @Test
-  public void testLogLinePatternHeadBubbles() {
-    Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[1]);
-    assertTrue(matcher.matches());
-    assertEquals("HEAD", matcher.group(1));
-    assertEquals("/bubbles.html", matcher.group(2));
-    assertEquals("200", matcher.group(3));
-  }
-
-  @Test
-  public void testLogLinePatternMaxLength() {
-    int maxLength = 2048;
-    String pre = "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] \"GET ";
-    String post = " HTTP/1.0\" 200 10532 \"-\" \"-\" -";
-    StringBuilder sb = new StringBuilder();
-    while (sb.length() <= maxLength) {
-      sb.append("/https://www.torproject.org");
-    }
-    String tooLongLogLine = pre + sb.toString() + post;
-    assertFalse(Main.LOG_LINE_PATTERN.matcher(tooLongLogLine).matches());
-    String notTooLongLogLine = pre + sb.toString().substring(0, maxLength)
-        + post;
-    assertTrue(Main.LOG_LINE_PATTERN.matcher(notTooLongLogLine).matches());
-  }
-}
-



More information about the tor-commits mailing list