[tor-commits] [metrics-web/master] Add webstats module and webstats-tb graph.

karsten at torproject.org
Mon Jan 23 19:11:32 UTC 2017


commit 1c0ec1e13a507baa9621156645d2dc28d85c8748
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Jan 23 20:09:38 2017 +0100

    Add webstats module and webstats-tb graph.
    
    Implements #21236.
---
 modules/webstats/.gitignore                        |   3 +
 modules/webstats/build.xml                         |  27 ++
 .../java/org/torproject/metrics/webstats/Main.java | 406 +++++++++++++++++++++
 .../webstats/src/main/resources/init-webstats.sql  | 164 +++++++++
 modules/webstats/src/main/resources/write-RData.R  |   9 +
 .../org/torproject/metrics/webstats/MainTest.java  |  93 +++++
 shared/bin/90-run-webstats-stats.sh                |  10 +
 shared/bin/99-copy-stats-files.sh                  |   2 +
 shared/build-base.xml                              |  42 +++
 shared/build.xml                                   |   2 +-
 website/etc/categories.json                        |   5 +-
 website/etc/metrics.json                           |  14 +
 website/etc/web.xml                                |   4 +
 website/rserve/graphs.R                            |  31 ++
 .../metrics/web/research/ResearchStatsServlet.java |   1 +
 website/web/WEB-INF/sources.jsp                    |   6 +
 website/web/WEB-INF/stats.jsp                      |  34 +-
 17 files changed, 849 insertions(+), 4 deletions(-)

diff --git a/modules/webstats/.gitignore b/modules/webstats/.gitignore
new file mode 100644
index 0000000..a8e4d02
--- /dev/null
+++ b/modules/webstats/.gitignore
@@ -0,0 +1,3 @@
+/stats/*.csv
+/RData/*.RData
+
diff --git a/modules/webstats/build.xml b/modules/webstats/build.xml
new file mode 100644
index 0000000..bcfe251
--- /dev/null
+++ b/modules/webstats/build.xml
@@ -0,0 +1,27 @@
+<project default="run" name="webstats" basedir=".">
+
+  <property name="sources" value="src/main/java"/>
+  <property name="testsources" value="src/test/java"/>
+
+  <include file="../../shared/build-base.xml" as="basetask"/>
+  <target name="clean" depends="basetask.clean"/>
+  <target name="compile" depends="basetask.compile"/>
+  <target name="test" depends="basetask.test"/>
+
+  <path id="classpath">
+    <pathelement path="${classes}"/>
+    <path refid="base.classpath" />
+    <fileset dir="${libs}">
+      <include name="postgresql-jdbc3-9.2.jar"/>
+    </fileset>
+  </path>
+
+  <target name="run" depends="compile">
+    <java fork="true"
+          maxmemory="1g"
+          classname="org.torproject.metrics.webstats.Main">
+      <classpath refid="classpath"/>
+    </java>
+  </target>
+</project>
+
diff --git a/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java
new file mode 100644
index 0000000..b6e2f96
--- /dev/null
+++ b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java
@@ -0,0 +1,406 @@
+/* Copyright 2016--2017 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.webstats;
+
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.Date;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TimeZone;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Main class of the webstats module that downloads log files from the server,
+ * imports them into a database, and exports aggregate statistics to a CSV
+ * file. */
+public class Main {
+
+  /** Logger for this class. */
+  private static Logger log = LoggerFactory.getLogger(Main.class);
+
+  /** Pattern for links contained in directory listings. */
+  static final Pattern URL_STRING_PATTERN =
+      Pattern.compile(".*<a href=\"([^\"]+)\">.*");
+
+  static final Pattern LOG_FILE_URL_PATTERN =
+      Pattern.compile("^.*/([^/]+)/([^/]+)-access.log-(\\d{8}).xz$");
+
+  private static DateFormat logDateFormat;
+
+  static {
+    logDateFormat = new SimpleDateFormat("yyyyMMdd");
+    logDateFormat.setLenient(false);
+    logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+  }
+
+  static final Pattern LOG_LINE_PATTERN = Pattern.compile(
+      "^0.0.0.[01] - - \\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] "
+      + "\"(GET|HEAD) ([^ ]+) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" -$");
+
+  private static final String LOG_DATE = "log_date";
+
+  private static final String REQUEST_TYPE = "request_type";
+
+  private static final String PLATFORM = "platform";
+
+  private static final String CHANNEL = "channel";
+
+  private static final String LOCALE = "locale";
+
+  private static final String INCREMENTAL = "incremental";
+
+  private static final String COUNT = "count";
+
+  private static final String ALL_COLUMNS = LOG_DATE + "," + REQUEST_TYPE + ","
+      + PLATFORM + "," + CHANNEL + "," + LOCALE + "," + INCREMENTAL + ","
+      + COUNT;
+
+  /** Executes this data-processing module. */
+  public static void main(String[] args) throws Exception {
+    log.info("Starting webstats module.");
+    String dbUrlString = "jdbc:postgresql:webstats";
+    Connection connection = connectToDatabase(dbUrlString);
+    SortedSet<String> previouslyImportedLogFileUrls =
+        queryImportedFiles(connection);
+    String baseUrl = "https://webstats.torproject.org/out/";
+    SortedSet<String> newLogFileUrls = downloadDirectoryListings(baseUrl,
+        previouslyImportedLogFileUrls);
+    importLogFiles(connection, newLogFileUrls);
+    SortedSet<String> statistics = queryWebstats(connection);
+    writeStatistics(Paths.get("stats", "webstats.csv"), statistics);
+    disconnectFromDatabase(connection);
+    log.info("Terminated webstats module.");
+  }
+
+  private static Connection connectToDatabase(String jdbcString)
+      throws SQLException {
+    log.info("Connecting to database.");
+    Connection connection = DriverManager.getConnection(jdbcString);
+    connection.setAutoCommit(false);
+    log.info("Successfully connected to database.");
+    return connection;
+  }
+
+  static SortedSet<String> queryImportedFiles(Connection connection)
+      throws SQLException {
+    log.info("Querying URLs of previously imported log files.");
+    SortedSet<String> importedLogFileUrls = new TreeSet<>();
+    Statement st = connection.createStatement();
+    String queryString = "SELECT url FROM files";
+    try (ResultSet rs = st.executeQuery(queryString)) {
+      while (rs.next()) {
+        importedLogFileUrls.add(rs.getString(1));
+      }
+    }
+    log.info("Found {} URLs of previously imported log files.",
+        importedLogFileUrls.size());
+    return importedLogFileUrls;
+  }
+
+  static SortedSet<String> downloadDirectoryListings(String baseUrl,
+      SortedSet<String> importedLogFileUrls) throws IOException {
+    log.info("Downloading directory listings from {}.", baseUrl);
+    List<String> directoryListings = new ArrayList<>();
+    directoryListings.add(baseUrl);
+    SortedSet<String> newLogFileUrls = new TreeSet<>();
+    while (!directoryListings.isEmpty()) {
+      String urlString = directoryListings.remove(0);
+      if (urlString.endsWith("/")) {
+        directoryListings.addAll(downloadDirectoryListing(urlString));
+      } else if (!urlString.endsWith(".xz")) {
+        log.debug("Skipping unrecognized URL {}.", urlString);
+      } else if (!importedLogFileUrls.contains(urlString)) {
+        newLogFileUrls.add(urlString);
+      }
+    }
+    log.info("Found {} URLs of log files that have not yet been imported.",
+        newLogFileUrls.size());
+    return newLogFileUrls;
+  }
+
+  static List<String> downloadDirectoryListing(String urlString)
+      throws IOException {
+    log.debug("Downloading directory listing from {}.", urlString);
+    List<String> urlStrings = new ArrayList<>();
+    try (BufferedReader br = new BufferedReader(new InputStreamReader(
+        new URL(urlString).openStream()))) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        Matcher matcher = URL_STRING_PATTERN.matcher(line);
+        if (matcher.matches() && !matcher.group(1).startsWith("/")) {
+          urlStrings.add(urlString + matcher.group(1));
+        }
+      }
+    }
+    return urlStrings;
+  }
+
+  static void importLogFiles(Connection connection,
+      SortedSet<String> newLogFileUrls) {
+    log.info("Downloading, parsing, and importing {} log files.",
+        newLogFileUrls.size());
+    for (String urlString : newLogFileUrls) {
+      try {
+        Object[] metaData = parseMetaData(urlString);
+        if (metaData == null) {
+          continue;
+        }
+        List<String> downloadedLogLines = downloadLogFile(urlString);
+        Map<String, Integer> parsedLogLines = parseLogLines(urlString,
+            downloadedLogLines);
+        importLogLines(connection, urlString, metaData, parsedLogLines);
+      } catch (IOException | ParseException exc) {
+        log.warn("Cannot download or parse log file with URL {}.  Retrying "
+            + "in the next run.", urlString, exc);
+      } catch (SQLException exc) {
+        log.warn("Cannot import log file with URL {} into the database.  "
+            + "Rolling back and retrying in the next run.", urlString, exc);
+        try {
+          connection.rollback();
+        } catch (SQLException exceptionWhileRollingBack) {
+          /* Ignore. */
+        }
+      }
+    }
+  }
+
+  private static Object[] parseMetaData(String urlString)
+      throws ParseException {
+    log.debug("Importing log file {}.", urlString);
+    if (urlString.contains("-ssl-access.log-")) {
+      log.debug("Skipping log file containing SSL requests with URL {}.",
+          urlString);
+      return null;
+    }
+    Matcher logFileUrlMatcher = LOG_FILE_URL_PATTERN.matcher(urlString);
+    if (!logFileUrlMatcher.matches()) {
+      log.debug("Skipping log file with unrecognized URL {}.", urlString);
+      return null;
+    }
+    String server = logFileUrlMatcher.group(1);
+    String site = logFileUrlMatcher.group(2);
+    long logDateMillis = logDateFormat.parse(logFileUrlMatcher.group(3))
+        .getTime();
+    return new Object[] { server, site, new Long(logDateMillis) };
+  }
+
+  static List<String> downloadLogFile(String urlString) throws IOException {
+    List<String> downloadedLogLines = new ArrayList<>();
+    try (BufferedReader br = new BufferedReader(new InputStreamReader(
+        new XZCompressorInputStream(new URL(urlString).openStream())))) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        downloadedLogLines.add(line);
+      }
+    }
+    return downloadedLogLines;
+  }
+
+  static Map<String, Integer> parseLogLines(String urlString,
+      List<String> logLines) {
+    int skippedLines = 0;
+    Map<String, Integer> parsedLogLines = new HashMap<>();
+    for (String logLine : logLines) {
+      Matcher logLineMatcher = LOG_LINE_PATTERN.matcher(logLine);
+      if (!logLineMatcher.matches()) {
+        skippedLines++;
+        continue;
+      }
+      String method = logLineMatcher.group(1);
+      String resource = logLineMatcher.group(2);
+      int responseCode = Integer.parseInt(logLineMatcher.group(3));
+      String combined = String.format("%s %s %d", method, resource,
+          responseCode);
+      if (!parsedLogLines.containsKey(combined)) {
+        parsedLogLines.put(combined, 1);
+      } else {
+        parsedLogLines.put(combined, parsedLogLines.get(combined) + 1);
+      }
+    }
+    if (skippedLines > 0) {
+      log.debug("Skipped {} lines while parsing log file {}.", skippedLines,
+          urlString);
+    }
+    return parsedLogLines;
+  }
+
+  private static void importLogLines(Connection connection, String urlString,
+      Object[] metaData, Map<String, Integer> parsedLogLines)
+      throws SQLException {
+    PreparedStatement psFiles = connection.prepareStatement(
+        "INSERT INTO files (url, server, site, " + LOG_DATE + ") "
+        + "VALUES (?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS);
+    PreparedStatement psResourcesSelect = connection.prepareStatement(
+        "SELECT resource_id FROM resources WHERE resource_string = ?");
+    PreparedStatement psResourcesInsert = connection.prepareStatement(
+        "INSERT INTO resources (resource_string) VALUES (?)",
+        Statement.RETURN_GENERATED_KEYS);
+    PreparedStatement psRequests = connection.prepareStatement(
+        "INSERT INTO requests (file_id, method, resource_id, response_code, "
+        + COUNT + ") VALUES (?, CAST(? AS method), ?, ?, ?)");
+    String server = (String) metaData[0];
+    String site = (String) metaData[1];
+    long logDateMillis = (long) metaData[2];
+    int fileId = insertFile(psFiles, urlString, server, site, logDateMillis);
+    if (fileId < 0) {
+      log.debug("Skipping previously imported log file {}.", urlString);
+      return;
+    }
+    for (Map.Entry<String, Integer> requests : parsedLogLines.entrySet()) {
+      String[] keyParts = requests.getKey().split(" ");
+      String method = keyParts[0];
+      String resource = keyParts[1];
+      int responseCode = Integer.parseInt(keyParts[2]);
+      int count = requests.getValue();
+      int resourceId = insertResource(psResourcesSelect, psResourcesInsert,
+          resource);
+      if (resourceId < 0) {
+        log.error("Could not retrieve auto-generated key for new resources "
+            + "entry.");
+        connection.rollback();
+        return;
+      }
+      insertRequest(psRequests, fileId, method, resourceId, responseCode,
+          count);
+    }
+    connection.commit();
+    log.debug("Finished importing log file with URL {} into database.",
+        urlString);
+  }
+
+  private static int insertFile(PreparedStatement psFiles, String urlString,
+      String server, String site, long logDateMillis) throws SQLException {
+    int fileId = -1;
+    psFiles.clearParameters();
+    psFiles.setString(1, truncateString(urlString, 2048));
+    psFiles.setString(2, truncateString(server, 32));
+    psFiles.setString(3, truncateString(site, 128));
+    psFiles.setDate(4, new Date(logDateMillis));
+    psFiles.execute();
+    try (ResultSet rs = psFiles.getGeneratedKeys()) {
+      if (rs.next()) {
+        fileId = rs.getInt(1);
+      }
+    }
+    return fileId;
+  }
+
+  private static void insertRequest(PreparedStatement psRequests, int fileId,
+      String method, int resourceId, int responseCode, int count)
+      throws SQLException {
+    psRequests.clearParameters();
+    psRequests.setInt(1, fileId);
+    psRequests.setString(2, method);
+    psRequests.setInt(3, resourceId);
+    psRequests.setInt(4, responseCode);
+    psRequests.setInt(5, count);
+    psRequests.execute();
+  }
+
+  private static int insertResource(PreparedStatement psResourcesSelect,
+      PreparedStatement psResourcesInsert, String resource)
+      throws SQLException {
+    int resourceId = -1;
+    String truncatedResource = truncateString(resource, 2048);
+    psResourcesSelect.clearParameters();
+    psResourcesSelect.setString(1, truncatedResource);
+    try (ResultSet rs = psResourcesSelect.executeQuery()) {
+      if (rs.next()) {
+        resourceId = rs.getInt(1);
+      }
+    }
+    if (resourceId < 0) {
+      /* There's a small potential for a race condition between the previous
+       * SELECT and this INSERT INTO, but that will be resolved by the UNIQUE
+       * constraint when committing the transaction. */
+      psResourcesInsert.clearParameters();
+      psResourcesInsert.setString(1, truncatedResource);
+      psResourcesInsert.execute();
+      try (ResultSet rs = psResourcesInsert.getGeneratedKeys()) {
+        if (rs.next()) {
+          resourceId = rs.getInt(1);
+        }
+      }
+    }
+    return resourceId;
+  }
+
+  private static String truncateString(String originalString,
+      int truncateAfter) {
+    if (originalString.length() > truncateAfter) {
+      originalString = originalString.substring(0, truncateAfter);
+    }
+    return originalString;
+  }
+
+  static SortedSet<String> queryWebstats(Connection connection)
+      throws SQLException {
+    log.info("Querying statistics from database.");
+    SortedSet<String> statistics = new TreeSet<>();
+    Statement st = connection.createStatement();
+    String queryString = "SELECT " + ALL_COLUMNS + " FROM webstats";
+    DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    try (ResultSet rs = st.executeQuery(queryString)) {
+      while (rs.next()) {
+        statistics.add(String.format("%s,%s,%s,%s,%s,%s,%d",
+            dateFormat.format(rs.getDate(LOG_DATE)),
+            emptyNull(rs.getString(REQUEST_TYPE)),
+            emptyNull(rs.getString(PLATFORM)),
+            emptyNull(rs.getString(CHANNEL)),
+            emptyNull(rs.getString(LOCALE)),
+            emptyNull(rs.getString(INCREMENTAL)),
+            rs.getLong(COUNT)));
+      }
+    }
+    return statistics;
+  }
+
+  private static String emptyNull(String text) {
+    return null == text ? "" : text;
+  }
+
+  static void writeStatistics(Path webstatsPath,
+      SortedSet<String> statistics) throws IOException {
+    webstatsPath.toFile().getParentFile().mkdirs();
+    List<String> lines = new ArrayList<String>();
+    lines.add(ALL_COLUMNS);
+    lines.addAll(statistics);
+    log.info("Writing {} lines to {}.", lines.size(),
+        webstatsPath.toFile().getAbsolutePath());
+    Files.write(webstatsPath, lines, StandardCharsets.UTF_8);
+  }
+
+  private static void disconnectFromDatabase(Connection connection)
+      throws SQLException {
+    log.info("Disconnecting from database.");
+    connection.close();
+  }
+}
+
diff --git a/modules/webstats/src/main/resources/init-webstats.sql b/modules/webstats/src/main/resources/init-webstats.sql
new file mode 100644
index 0000000..98bb758
--- /dev/null
+++ b/modules/webstats/src/main/resources/init-webstats.sql
@@ -0,0 +1,164 @@
+-- Copyright 2016--2017 The Tor Project
+-- See LICENSE for licensing information
+
+CREATE TYPE method AS ENUM ('GET', 'HEAD');
+
+CREATE TABLE files (
+  file_id SERIAL PRIMARY KEY,
+  url CHARACTER VARYING(2048) UNIQUE NOT NULL,
+  server CHARACTER VARYING(32) NOT NULL,
+  site CHARACTER VARYING(128) NOT NULL,
+  log_date DATE NOT NULL,
+  UNIQUE (server, site, log_date)
+);
+
+CREATE TABLE resources (
+  resource_id SERIAL PRIMARY KEY,
+  resource_string CHARACTER VARYING(2048) UNIQUE NOT NULL
+);
+
+CREATE TABLE requests (
+  file_id INTEGER REFERENCES files (file_id) NOT NULL,
+  method METHOD NOT NULL,
+  resource_id INTEGER REFERENCES resources (resource_id) NOT NULL,
+  response_code SMALLINT NOT NULL,
+  count INTEGER NOT NULL,
+  UNIQUE (file_id, method, resource_id, response_code)
+);
+
+CREATE OR REPLACE VIEW webstats AS
+  SELECT log_date,
+    CASE WHEN resource_string LIKE '%.asc' THEN 'tbsd'
+      ELSE 'tbid' END AS request_type,
+    CASE WHEN resource_string LIKE '%.exe%' THEN 'w'
+      WHEN resource_string LIKE '%.dmg%' THEN 'm'
+      WHEN resource_string LIKE '%.tar.xz%' THEN 'l'
+      ELSE 'o' END AS platform,
+    CASE WHEN resource_string LIKE '%-hardened%' THEN 'h'
+      WHEN resource_string LIKE '%/%.%a%/%' THEN 'a'
+      ELSE 'r' END AS channel,
+    COALESCE(SUBSTRING(resource_string
+      FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[\._-].*'), '??') AS locale,
+    NULL::BOOLEAN AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE (resource_string LIKE '%/torbrowser/%.exe'
+    OR resource_string LIKE '%/torbrowser/%.dmg'
+    OR resource_string LIKE '%/torbrowser/%.tar.xz'
+    OR resource_string LIKE '%/torbrowser/%.exe.asc'
+    OR resource_string LIKE '%/torbrowser/%.dmg.asc'
+    OR resource_string LIKE '%/torbrowser/%.tar.xz.asc')
+  AND response_code = 200
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental
+  UNION
+  SELECT log_date,
+    'tbup' AS request_type,
+    CASE WHEN resource_string LIKE '%/WINNT%' THEN 'w'
+      WHEN resource_string LIKE '%/Darwin%' THEN 'm'
+      ELSE 'l' END AS platform,
+    CASE WHEN resource_string LIKE '%/hardened/%' THEN 'h'
+      WHEN resource_string LIKE '%/alpha/%' THEN 'a'
+      WHEN resource_string LIKE '%/release/%' THEN 'r'
+      ELSE 'o' END AS channel,
+    COALESCE(SUBSTRING(resource_string
+      FROM '.*/([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})\??$'), '??') AS locale,
+    NULL::BOOLEAN AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE resource_string LIKE '%/torbrowser/update_2/%'
+  AND resource_string NOT LIKE '%.xml'
+  AND response_code = 200
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental
+  UNION
+  SELECT log_date,
+    'tbur' AS request_type,
+    CASE WHEN resource_string LIKE '%-win32-%' THEN 'w'
+      WHEN resource_string LIKE '%-osx%' THEN 'm'
+      ELSE 'l' END AS platform,
+    CASE WHEN resource_string LIKE '%-hardened%' THEN 'h'
+      WHEN resource_string LIKE '%/%.%a%/%' THEN 'a'
+      ELSE 'r' END AS channel,
+    COALESCE(SUBSTRING(resource_string
+      FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[\._-].*'), '??') AS locale,
+    CASE WHEN resource_string LIKE '%.incremental.%' THEN TRUE
+      ELSE FALSE END AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE resource_string LIKE '%/torbrowser/%.mar'
+  AND response_code = 302
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental
+  UNION
+  SELECT log_date,
+    'tmid' AS request_type,
+    CASE WHEN resource_string LIKE '%.exe' THEN 'w'
+      WHEN resource_string LIKE '%.dmg' THEN 'm'
+      WHEN resource_string LIKE '%.tar.xz' THEN 'l'
+      ELSE 'o' END AS platform,
+    NULL AS channel,
+    COALESCE(SUBSTRING(resource_string
+      FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[\._-].*'), '??') AS locale,
+    NULL::BOOLEAN AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE (resource_string LIKE '%/tormessenger/%.exe'
+    OR resource_string LIKE '%/tormessenger/%.dmg'
+    OR resource_string LIKE '%/tormessenger/%.tar.xz')
+  AND response_code = 200
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental
+  UNION
+  SELECT log_date,
+    'tmup' AS request_type,
+    CASE WHEN resource_string LIKE '%/WINNT%' THEN 'w'
+      WHEN resource_string LIKE '%/Darwin%' THEN 'm'
+      WHEN resource_string LIKE '%/Linux%' THEN 'l'
+      ELSE 'o' END AS platform,
+    NULL AS channel,
+    COALESCE(SUBSTRING(resource_string
+      FROM '.*/([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})\??$'), '??') AS locale,
+    NULL::BOOLEAN AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE resource_string LIKE '%/tormessenger/update_2/%'
+  AND resource_string NOT LIKE '%.xml'
+  AND resource_string NOT LIKE '%/'
+  AND resource_string NOT LIKE '%/?'
+  AND response_code = 200
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental
+  UNION
+  SELECT log_date,
+    'twhph' AS request_type,
+    NULL AS platform,
+    NULL AS channel,
+    NULL AS locale,
+    NULL::BOOLEAN AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE (resource_string = '/'
+    OR resource_string LIKE '/index%')
+  AND response_code = 200
+  AND (site = 'torproject.org'
+    OR site = 'www.torproject.org')
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental
+  UNION
+  SELECT log_date,
+    'twdph' AS request_type,
+    NULL AS platform,
+    NULL AS channel,
+    NULL AS locale,
+    NULL::BOOLEAN AS incremental,
+    SUM(count) AS count
+  FROM files NATURAL JOIN requests NATURAL JOIN resources
+  WHERE (resource_string LIKE '/download/download%'
+    OR resource_string LIKE '/projects/torbrowser.html%')
+  AND response_code = 200
+  AND (site = 'torproject.org'
+    OR site = 'www.torproject.org')
+  AND method = 'GET'
+  GROUP BY log_date, request_type, platform, channel, locale, incremental;
+
diff --git a/modules/webstats/src/main/resources/write-RData.R b/modules/webstats/src/main/resources/write-RData.R
new file mode 100644
index 0000000..2cb8917
--- /dev/null
+++ b/modules/webstats/src/main/resources/write-RData.R
@@ -0,0 +1,9 @@
+dir.create("RData", showWarnings = FALSE)
+
+d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE)
+d <- d[d$request_type %in% c('tbid', 'tbsd', 'tbup', 'tbur'), ]
+data <- aggregate(list(count = d$count),
+    by = list(log_date = as.Date(d$log_date), request_type = d$request_type),
+    FUN = sum)
+save(data, file = "RData/webstats-tb.RData")
+
diff --git a/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java
new file mode 100644
index 0000000..1c4f0bc
--- /dev/null
+++ b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java
@@ -0,0 +1,93 @@
+/* Copyright 2017 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.webstats;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+import java.util.regex.Matcher;
+
+public class MainTest {
+
+  static final String SAMPLE_LOG_FILE_NAME =
+      "metrics.torproject.org-access.log-20170117.xz";
+
+  static final String SAMPLE_SUBDIRECTORY_NAME = "meronense.torproject.org/";
+
+  static final String SAMPLE_LOG_FILE_URL =
+      "https://webstats.torproject.org/out/meronense.torproject.org/"
+      + "metrics.torproject.org-access.log-20170117.xz";
+
+  static final String[] SAMPLE_LOG_LINES = new String[] {
+      "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
+      + "\"GET / HTTP/1.0\" 200 10532 \"-\" \"-\" -",
+      "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
+      + "\"HEAD /bubbles.html HTTP/1.1\" 200 - \"-\" \"-\" -"
+  };
+
+  @Test
+  public void testUrlStringPatternComplete() {
+    Matcher matcher = Main.URL_STRING_PATTERN.matcher(
+        "<img src=\"/icons/unknown.gif\" alt=\"[   ]\"> "
+        + "<a href=\"" + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME
+        + "</a> 2017-01-19 19:43  5.6K  ");
+    assertTrue(matcher.matches());
+    assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1));
+  }
+
+  @Test
+  public void testUrlStringPatternOnlyATag() {
+    Matcher matcher = Main.URL_STRING_PATTERN.matcher("<a href=\""
+        + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME + "</a>");
+    assertTrue(matcher.matches());
+    assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1));
+  }
+
+  @Test
+  public void testUrlStringPatternSubdirectory() {
+    Matcher matcher = Main.URL_STRING_PATTERN.matcher(
+        "<a href=\"" + SAMPLE_SUBDIRECTORY_NAME + "\">"
+        + SAMPLE_SUBDIRECTORY_NAME + "/</a>");
+    assertTrue(matcher.matches());
+    assertEquals(SAMPLE_SUBDIRECTORY_NAME, matcher.group(1));
+  }
+
+  @Test
+  public void testUrlStringPatternAnythingBetweenDoubleQuotesHtml() {
+    Matcher matcher = Main.URL_STRING_PATTERN.matcher(
+        "<a href=\"anything-between-double-quotes.html\">Link/</a>");
+    assertTrue(matcher.matches());
+    assertEquals("anything-between-double-quotes.html", matcher.group(1));
+  }
+
+  @Test
+  public void testLogFileUrlPatternComplete() {
+    Matcher matcher = Main.LOG_FILE_URL_PATTERN.matcher(SAMPLE_LOG_FILE_URL);
+    assertTrue(matcher.matches());
+    assertEquals("meronense.torproject.org", matcher.group(1));
+    assertEquals("metrics.torproject.org", matcher.group(2));
+    assertEquals("20170117", matcher.group(3));
+  }
+
+  @Test
+  public void testLogLinePatternGetSlash() {
+    Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[0]);
+    assertTrue(matcher.matches());
+    assertEquals("GET", matcher.group(1));
+    assertEquals("/", matcher.group(2));
+    assertEquals("200", matcher.group(3));
+  }
+
+  @Test
+  public void testLogLinePatternHeadBubbles() {
+    Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[1]);
+    assertTrue(matcher.matches());
+    assertEquals("HEAD", matcher.group(1));
+    assertEquals("/bubbles.html", matcher.group(2));
+    assertEquals("200", matcher.group(3));
+  }
+}
+
diff --git a/shared/bin/90-run-webstats-stats.sh b/shared/bin/90-run-webstats-stats.sh
new file mode 100755
index 0000000..37091b4
--- /dev/null
+++ b/shared/bin/90-run-webstats-stats.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+cd modules/webstats/
+
+ant run | grep "\[java\]"
+
+R --slave -f src/main/resources/write-RData.R > /dev/null 2>&1
+
+cd ../../
+
diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh
index d236630..a828686 100755
--- a/shared/bin/99-copy-stats-files.sh
+++ b/shared/bin/99-copy-stats-files.sh
@@ -6,7 +6,9 @@ cp -a modules/advbwdist/stats/advbwdist.csv shared/stats/
 cp -a modules/hidserv/stats/hidserv.csv shared/stats/
 cp -a modules/clients/stats/clients*.csv shared/stats/
 cp -a modules/clients/stats/userstats-combined.csv shared/stats/
+cp -a modules/webstats/stats/webstats.csv shared/stats/
 
 mkdir -p shared/RData
 cp -a modules/clients/RData/*.RData shared/RData/
+cp -a modules/webstats/RData/*.RData shared/RData/
 
diff --git a/shared/build-base.xml b/shared/build-base.xml
index 7051f49..759e1d0 100644
--- a/shared/build-base.xml
+++ b/shared/build-base.xml
@@ -1,9 +1,11 @@
 <project basedir=".">
 
   <property name="sources" value="src"/>
+  <property name="testsources" value="src/test/java"/>
   <property name="libs" value="../../shared/lib"/>
   <property name="generated" value="generated"/>
   <property name="classes" value="${generated}/classes/"/>
+  <property name="testclasses" value="${generated}/test-classes/"/>
   <property name="source-and-target-java-version" value="1.7" />
   <property name="descriptorversion" value="1.5.0" />
 
@@ -21,6 +23,21 @@
     </fileset>
   </path>
 
+  <path id="base.testclasspath">
+    <pathelement path="${base.testclasses}"/>
+    <pathelement path="base.classpath"/>
+    <fileset dir="${libs}">
+      <include name="hamcrest-core-1.3.jar"/>
+      <include name="junit4-4.11.jar"/>
+    </fileset>
+  </path>
+
+  <path id="testclasspath">
+    <pathelement path="${testclasses}"/>
+    <path refid="base.testclasspath" />
+    <path refid="base.classpath" />
+  </path>
+
   <target name="clean">
     <delete includeEmptyDirs="true" quiet="true">
       <fileset dir="${generated}" defaultexcludes="false" includes="**" />
@@ -29,6 +46,7 @@
 
   <target name="init">
     <mkdir dir="${classes}"/>
+    <mkdir dir="${testclasses}"/>
   </target>
 
   <target name="compile" depends="init" >
@@ -45,6 +63,30 @@
     </javac>
   </target>
 
+  <target name="testcompile" depends="compile" >
+    <javac destdir="${testclasses}"
+           srcdir="${testsources}"
+           source="${source-and-target-java-version}"
+           target="${source-and-target-java-version}"
+           debug="true" debuglevel="lines,source"
+           deprecation="true"
+           optimize="false"
+           failonerror="true"
+           includeantruntime="false">
+      <classpath refid="testclasspath"/>
+    </javac>
+  </target>
+
+  <target name="test" depends="testcompile">
+    <junit fork="true" haltonfailure="true" printsummary="off">
+      <classpath refid="testclasspath"/>
+      <formatter type="plain" usefile="false"/>
+      <batchtest>
+        <fileset dir="${testclasses}"
+                 includes="**/*Test.class"/>
+      </batchtest>
+    </junit>
+  </target>
 
 </project>
 
diff --git a/shared/build.xml b/shared/build.xml
index 13a09f7..cb51d5f 100644
--- a/shared/build.xml
+++ b/shared/build.xml
@@ -26,9 +26,9 @@
       <fileset dir="../modules/clients/src" includes="**/*.java"/>
       <fileset dir="../modules/collectdescs/src" includes="**/*.java"/>
       <fileset dir="../modules/connbidirect/src" includes="**/*.java"/>
-      <fileset dir="../modules/disagreement/src" includes="**/*.java"/>
       <fileset dir="../modules/hidserv/src" includes="**/*.java"/>
       <fileset dir="../modules/legacy/src" includes="**/*.java"/>
+      <fileset dir="../modules/webstats/src" includes="**/*.java"/>
       <classpath>
         <path refid="checkstyle.classpath" />
       </classpath>
diff --git a/website/etc/categories.json b/website/etc/categories.json
index 8b4ea77..6825634 100644
--- a/website/etc/categories.json
+++ b/website/etc/categories.json
@@ -78,6 +78,9 @@
     "icon": "fa-download",
     "header": "Applications",
     "summary": "How many Tor applications, like Tor Browser, have been downloaded or updated.",
-    "metrics": []
+    "description": "The following application statistics are based on the analysis of requests to <code>torproject.org</code> web servers.",
+    "metrics": [
+      "webstats-tb"
+    ]
   }
 ]
diff --git a/website/etc/metrics.json b/website/etc/metrics.json
index f7666be..4a97ca1 100644
--- a/website/etc/metrics.json
+++ b/website/etc/metrics.json
@@ -402,5 +402,19 @@
     "title": "Network churn rate by relay flag",
     "type": "Link",
     "description": "<p>This image shows the churn rate of the Tor network by <a href=\"glossary.html#relay-flag\">relay flag</a> in a given month.  The churn rate, a value in the interval <b>[0,1]</b>, captures the rate of <a href=\"glossary.html#relay\">relays</a> joining and leaving the network from one <a href=\"glossary.html#consensus\">consensus</a> to the next (that is, within one hour).  The complete image gallery can be found on <a href=\"https://nymity.ch/sybilhunting/churn-values/\">Philipp Winter's homepage</a>.</p><p><a href=\"https://nymity.ch/sybilhunting/churn-values/\"><img src=\"images/networkchurn.png\" alt=\"Network churn rate by relay flag\"></a></p>"
+  },
+  {
+    "id": "webstats-tb",
+    "title": "Tor Browser downloads and updates",
+    "type": "Graph",
+    "description": "<p>This graph shows absolute numbers of requests to Tor's web servers by request type.  It is based on data from <a href=\"https://webstats.torproject.org/\" target=\"_blank\"><code>webstats.torproject.org</code></a> which collects logs from <code>torproject.org</code> web servers and provides them as a stripped-down version of Apache's "combined" log format without IP addresses, log times, HTTP parameters, referers, and user agent strings.  <em>Initial downloads</em> and <em>signature downloads</em> are requests made by the user to download a Tor Browser executable or a corresponding signature file from the Tor website.  <em>Update pings</em> and <em>update requests</em> are requests made by Tor Browser to check whether a newer version is available or to download a newer version.</p>",
+    "function": "plot_webstats_tb",
+    "parameters": [
+      "start",
+      "end"
+    ],
+    "data": [
+      "webstats"
+    ]
   }
 ]
diff --git a/website/etc/web.xml b/website/etc/web.xml
index 7444cf5..916984c 100644
--- a/website/etc/web.xml
+++ b/website/etc/web.xml
@@ -46,6 +46,7 @@
     <url-pattern>/hidserv-dir-onions-seen.html</url-pattern>
     <url-pattern>/hidserv-rend-relayed-cells.html</url-pattern>
     <url-pattern>/hidserv-frac-reporting.html</url-pattern>
+    <url-pattern>/webstats-tb.html</url-pattern>
   </servlet-mapping>
 
   <servlet>
@@ -177,6 +178,9 @@
     <url-pattern>/hidserv-frac-reporting.png</url-pattern>
     <url-pattern>/hidserv-frac-reporting.pdf</url-pattern>
     <url-pattern>/hidserv-frac-reporting.svg</url-pattern>
+    <url-pattern>/webstats-tb.png</url-pattern>
+    <url-pattern>/webstats-tb.pdf</url-pattern>
+    <url-pattern>/webstats-tb.svg</url-pattern>
   </servlet-mapping>
 
   <servlet>
diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R
index 9f8daa7..fc6cfb6 100644
--- a/website/rserve/graphs.R
+++ b/website/rserve/graphs.R
@@ -1095,3 +1095,34 @@ plot_hidserv_frac_reporting <- function(start, end, path) {
   ggsave(filename = path, width = 8, height = 5, dpi = 72)
 }
 
+plot_webstats_tb <- function(start, end, path) {
+  end <- min(end, as.character(Sys.Date() - 2))
+  load("/srv/metrics.torproject.org/metrics/shared/RData/webstats-tb.RData")
+  d <- data
+  d <- d[d$log_date >= start & d$log_date <= end, ]
+  date_breaks <- date_breaks(as.numeric(max(d$log_date) - min(d$log_date)))
+  d$request_type <- factor(d$request_type)
+  levels(d$request_type) <- list(
+      'Initial downloads' = 'tbid',
+      'Signature downloads' = 'tbsd',
+      'Update pings' = 'tbup',
+      'Update requests' = 'tbur')
+  formatter <- function(x, ...) {
+    format(x, ..., scientific = FALSE, big.mark = ' ') }
+  ggplot(d, aes(x = log_date, y = count)) +
+    geom_point() +
+    geom_line() +
+    expand_limits(y = 0) +
+    facet_grid(request_type ~ ., scales = "free_y") +
+    scale_x_date(name = paste("\nThe Tor Project - ",
+        "https://metrics.torproject.org/", sep = ""),
+        labels = date_format(date_breaks$format),
+        breaks = date_breaks$major,
+        minor_breaks = date_breaks$minor) +
+    scale_y_continuous(name = 'Requests per day\n', labels = formatter) +
+    theme(strip.text.y = element_text(angle = 0, hjust = 0, size = rel(1.5)),
+          strip.background = element_rect(fill = NA)) +
+    ggtitle("Tor Browser downloads and updates\n")
+  ggsave(filename = path, width = 8, height = 5, dpi = 72)
+}
+
diff --git a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
index 8f5c399..de5715e 100644
--- a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
+++ b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
@@ -39,6 +39,7 @@ public class ResearchStatsServlet extends HttpServlet {
     this.availableStatisticsFiles.add("advbwdist");
     this.availableStatisticsFiles.add("hidserv");
     this.availableStatisticsFiles.add("disagreement");
+    this.availableStatisticsFiles.add("webstats");
   }
 
   @Override
diff --git a/website/web/WEB-INF/sources.jsp b/website/web/WEB-INF/sources.jsp
index 64ca85b..b36d43f 100644
--- a/website/web/WEB-INF/sources.jsp
+++ b/website/web/WEB-INF/sources.jsp
@@ -39,6 +39,12 @@
     </div>
 
     <div class="container">
+      <ul>
+        <li><a href="https://webstats.torproject.org/" target="_blank"><code>webstats.torproject.org</code></a> collects logs from <code>torproject.org</code> web servers and provides them as a stripped-down version of Apache's "combined" log format without IP addresses, log times, HTTP parameters, referers, and user agent strings.</li>
+      </ul>
+    </div>
+
+    <div class="container">
       <h2>Measurement tools <a href="#measurement" name="measurement" class="anchor">#</a></h2>
       <p>The following tools perform active measurements in the Tor network.</p>
       <ul>
diff --git a/website/web/WEB-INF/stats.jsp b/website/web/WEB-INF/stats.jsp
index fc676ba..fbecc0f 100644
--- a/website/web/WEB-INF/stats.jsp
+++ b/website/web/WEB-INF/stats.jsp
@@ -483,8 +483,38 @@ given attribute.</li>
 
 </ul>
 
-    </div>
-  </div>
+</div>
+
+<div class="container">
+<h2>Requests to <code>torproject.org</code> web servers <a href="#webstats" name="webstats" class="anchor">#</a></h2>
+
+<p>The following data file contains aggregate statistics on requests to <code>torproject.org</code> web servers.</p>
+
+<p><b>Download as <a href="stats/webstats.csv">CSV file</a>.</b></p>
+
+<p>The statistics file contains the following columns:</p>
+<ul>
+<li><b>log_date:</b> UTC date (YYYY-MM-DD) when requests to <code>torproject.org</code> web servers were logged.</li>
+<li><b>request_type:</b> Request type with fixed identifiers as follows:
+<ul>
+<li><b>"tbid":</b> Tor Browser initial downloads: GET requests to all sites with resource strings <code>'%/torbrowser/%.exe'</code>, <code>'%/torbrowser/%.dmg'</code>, and <code>'%/torbrowser/%.tar.xz'</code> and response code 200.</li>
+<li><b>"tbsd":</b> Tor Browser signature downloads: GET requests to all sites with resource strings <code>'%/torbrowser/%.exe.asc'</code>, <code>'%/torbrowser/%.dmg.asc'</code>, and <code>'%/torbrowser/%.tar.xz.asc'</code> and response code 200.</li>
+<li><b>"tbup":</b> Tor Browser update pings: GET requests to all sites with resource strings <code>'%/torbrowser/update_2/%'</code> and response code 200.</li>
+<li><b>"tbur":</b> Tor Browser update requests: GET requests to all sites with resource strings <code>'%/torbrowser/%.mar'</code> and response code 302.</li>
+<li><b>"tmid":</b> Tor Messenger initial downloads: GET requests to all sites with resource strings <code>'%/tormessenger/%.exe'</code>, <code>'%/tormessenger/%.dmg'</code>, and <code>'%/tormessenger/%.tar.xz'</code> and response code 200.</li>
+<li><b>"tmup":</b> Tor Messenger update pings: GET requests to all sites with resource strings <code>'%/tormessenger/update_2/%'</code> and response code 200.</li>
+<li><b>"twhph":</b> Tor website home page hits: GET requests to sites <code>'torproject.org'</code> and <code>'www.torproject.org'</code> with resource strings <code>'/'</code> and <code>'/index%'</code> and response code 200.</li>
+<li><b>"twdph":</b> Tor website download page hits: GET requests to sites <code>'torproject.org'</code> and <code>'www.torproject.org'</code> with resource strings <code>'/download/download%'</code> and <code>'/projects/torbrowser.html%'</code> and response code 200.</li>
+</ul>
+</li>
+<li><b>platform:</b> Platform string, like <b>"w"</b> for Windows, <b>"m"</b> for macOS, <b>"l"</b> for Linux, or <b>"o"</b> for other platforms, and the empty string for all platforms.</li>
+<li><b>channel:</b> Release channel, like <b>"r"</b> for stable releases, <b>"a"</b> for alpha releases, <b>"h"</b> for hardened releases, and the empty string for all channels.</li>
+<li><b>locale:</b> Locale, like <b>"en-US"</b> for English (United States), <b>"de"</b> for German, etc., <b>"??"</b> for unrecognized locales, and the empty string for all locales.</li>
+<li><b>incremental:</b> Incremental update, with <b>"t"</b> for incremental updates, <b>"f"</b> for non-incremental (full) updates, and the empty string for all update types.</li>
+<li><b>count:</b> Number of requests for the given request type, platform, etc.</li>
+</ul>
+
+</div>
 
 <jsp:include page="bottom.jsp"/>
 


