[tor-commits] [metrics-web/master] Add two BridgeDB request graphs.

karsten at torproject.org karsten at torproject.org
Wed Dec 18 19:53:00 UTC 2019


commit 1e056fc82a4d22fbeefb99c2a1cc96246de0afa5
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Dec 16 17:34:31 2019 +0100

    Add two BridgeDB request graphs.
    
    Implements #32135.
---
 src/main/R/rserver/rserve-init.R                   |  56 ++++++++
 .../torproject/metrics/stats/bridgedb/Main.java    | 148 +++++++++++++++++++++
 .../metrics/stats/collectdescs/Main.java           |   1 +
 .../org/torproject/metrics/stats/main/Main.java    |   6 +-
 src/main/resources/web.xml                         |   8 ++
 src/main/resources/web/json/categories.json        |   2 +
 src/main/resources/web/json/metrics.json           |  22 +++
 .../resources/web/jsps/reproducible-metrics.jsp    |  42 ++++++
 src/main/resources/web/jsps/stats.jsp              |  41 ++++++
 9 files changed, 324 insertions(+), 2 deletions(-)

diff --git a/src/main/R/rserver/rserve-init.R b/src/main/R/rserver/rserve-init.R
index 5eac4d7..c4f1b7f 100644
--- a/src/main/R/rserver/rserve-init.R
+++ b/src/main/R/rserver/rserve-init.R
@@ -1721,3 +1721,59 @@ write_userstats_censorship_events <- function(start, end, path) {
   write.csv(r, path, quote = FALSE, row.names = FALSE)
 }
 
+# Reads bridgedb-stats.csv from stats_dir, optionally restricts it to the
+# dates between start_p and end_p (both inclusive, either may be NULL), and
+# returns the summed number of requests per date and transport, sorted by
+# date and transport.
+prepare_bridgedb_transport <- function(start_p = NULL, end_p = NULL) {
+  csv_path <- paste(stats_dir, "bridgedb-stats.csv", sep = "")
+  bridgedb_stats <- read_csv(file = csv_path,
+      col_types = cols(
+        date = col_date(format = ""),
+        distributor = col_skip(),
+        transport = col_character(),
+        requests = col_double()))
+  # Only apply a date bound if the caller provided one.
+  if (!is.null(start_p)) {
+    bridgedb_stats <- filter(bridgedb_stats, date >= as.Date(start_p))
+  }
+  if (!is.null(end_p)) {
+    bridgedb_stats <- filter(bridgedb_stats, date <= as.Date(end_p))
+  }
+  # The distributor column is skipped above, so summing per date and transport
+  # adds up requests over all distributors.
+  bridgedb_stats %>%
+    group_by(date, transport) %>%
+    summarize(requests = sum(requests)) %>%
+    arrange(date, transport)
+}
+
+# Plots the number of BridgeDB requests per day and requested transport for
+# the dates between start_p and end_p and saves the graph to path_p.
+plot_bridgedb_transport <- function(start_p, end_p, path_p) {
+  prepare_bridgedb_transport(start_p, end_p) %>%
+    # summarize() in the prepare step leaves the data grouped by date. Drop
+    # that grouping so that complete() fills gaps over the whole date range
+    # rather than within each single-date group, where there is nothing to
+    # fill.
+    ungroup() %>%
+    # Insert rows with NA requests for missing days, so that lines are
+    # interrupted instead of drawn straight across gaps in the data.
+    complete(date = full_seq(date, period = 1), nesting(transport)) %>%
+    ggplot(aes(x = date, y = requests, colour = transport)) +
+    geom_line(na.rm = TRUE) +
+    scale_x_date(name = "", breaks = custom_breaks,
+      labels = custom_labels, minor_breaks = custom_minor_breaks) +
+    scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+    scale_colour_hue(name = "") +
+    ggtitle("BridgeDB requests by requested transport") +
+    labs(caption = copyright_notice)
+  # ggsave() without a plot argument saves the most recently created plot.
+  ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+# Reads bridgedb-stats.csv from stats_dir, optionally restricts it to the
+# dates between start_p and end_p (both inclusive, either may be NULL), and
+# returns the summed number of requests per date and distributor, sorted by
+# date and distributor.
+prepare_bridgedb_distributor <- function(start_p = NULL, end_p = NULL) {
+  csv_path <- paste(stats_dir, "bridgedb-stats.csv", sep = "")
+  bridgedb_stats <- read_csv(file = csv_path,
+      col_types = cols(
+        date = col_date(format = ""),
+        distributor = col_character(),
+        transport = col_skip(),
+        requests = col_double()))
+  # Only apply a date bound if the caller provided one.
+  if (!is.null(start_p)) {
+    bridgedb_stats <- filter(bridgedb_stats, date >= as.Date(start_p))
+  }
+  if (!is.null(end_p)) {
+    bridgedb_stats <- filter(bridgedb_stats, date <= as.Date(end_p))
+  }
+  # The transport column is skipped above, so summing per date and distributor
+  # adds up requests over all transports.
+  bridgedb_stats %>%
+    group_by(date, distributor) %>%
+    summarize(requests = sum(requests)) %>%
+    arrange(date, distributor)
+}
+
+# Plots the number of BridgeDB requests per day and distributor for the dates
+# between start_p and end_p and saves the graph to path_p.
+plot_bridgedb_distributor <- function(start_p, end_p, path_p) {
+  prepare_bridgedb_distributor(start_p, end_p) %>%
+    # summarize() in the prepare step leaves the data grouped by date. Drop
+    # that grouping so that complete() fills gaps over the whole date range
+    # rather than within each single-date group, where there is nothing to
+    # fill.
+    ungroup() %>%
+    # Insert rows with NA requests for missing days, so that lines are
+    # interrupted instead of drawn straight across gaps in the data.
+    complete(date = full_seq(date, period = 1), nesting(distributor)) %>%
+    ggplot(aes(x = date, y = requests, colour = distributor)) +
+    geom_line(na.rm = TRUE) +
+    scale_x_date(name = "", breaks = custom_breaks,
+      labels = custom_labels, minor_breaks = custom_minor_breaks) +
+    scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+    scale_colour_hue(name = "") +
+    ggtitle("BridgeDB requests by distributor") +
+    labs(caption = copyright_notice)
+  # ggsave() without a plot argument saves the most recently created plot.
+  ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
diff --git a/src/main/java/org/torproject/metrics/stats/bridgedb/Main.java b/src/main/java/org/torproject/metrics/stats/bridgedb/Main.java
new file mode 100644
index 0000000..16c3c21
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/bridgedb/Main.java
@@ -0,0 +1,148 @@
+/* Copyright 2019 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.bridgedb;
+
+import org.torproject.descriptor.BridgedbMetrics;
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorSourceFactory;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * Data-processing module that aggregates BridgeDB request numbers by date,
+ * distributor, and transport and stores them in a CSV file.
+ *
+ * <p>Statistics are held in sorted maps with keys of the form
+ * {@code "date,distributor,transport"} and request numbers as values, which
+ * is also the order of columns in the CSV file.</p>
+ */
+public class Main {
+
+  private static final Logger log = LoggerFactory.getLogger(Main.class);
+
+  private static final Path bridgedbStatsCsvFile
+      = org.torproject.metrics.stats.main.Main.modulesDir.toPath()
+      .resolve("bridgedb/stats/bridgedb-stats.csv");
+
+  private static final File[] descriptorsDirectories = new File[] {
+      new File(org.torproject.metrics.stats.main.Main.descriptorsDir,
+          "archive/bridgedb-metrics"),
+      new File(org.torproject.metrics.stats.main.Main.descriptorsDir,
+          "recent/bridgedb-metrics") };
+
+  /** Executes this data-processing module. */
+  public static void main(String[] args) throws IOException {
+    SortedMap<String, Long> previousStatistics
+        = readBridgedbStatsFile(bridgedbStatsCsvFile);
+    SortedMap<String, Long> currentStatistics = parseStatistics(
+        previousStatistics, descriptorsDirectories);
+    writeBridgedbStatsFile(bridgedbStatsCsvFile, currentStatistics);
+  }
+
+  /**
+   * Reads previously written statistics from the given CSV file.
+   *
+   * <p>The header line is skipped. Lines that do not consist of exactly four
+   * comma-separated parts or that do not end in a parseable number are
+   * skipped with a warning.</p>
+   *
+   * @param bridgedbStatsCsvFile CSV file to read, which may not exist yet.
+   * @return Statistics read from the file, or an empty map if the file does
+   *     not exist.
+   * @throws IOException Thrown if the file exists but cannot be read.
+   */
+  static SortedMap<String, Long>
+      readBridgedbStatsFile(Path bridgedbStatsCsvFile) throws IOException {
+    SortedMap<String, Long> readStatistics = new TreeMap<>();
+    if (Files.exists(bridgedbStatsCsvFile)) {
+      for (String line : Files.readAllLines(bridgedbStatsCsvFile)) {
+        if (line.startsWith("date")) {
+          continue;
+        }
+        String[] lineParts = line.split(",");
+        if (lineParts.length != 4) {
+          log.warn("Skipping unrecognized line '{}' in {}.", line,
+              bridgedbStatsCsvFile.toAbsolutePath());
+          continue;
+        }
+        String key = String.format("%s,%s,%s", lineParts[0], lineParts[1],
+            lineParts[2]);
+        long value;
+        try {
+          value = Long.parseLong(lineParts[3]);
+        } catch (NumberFormatException e) {
+          /* Treat a malformed number like any other unrecognized line rather
+           * than aborting the whole module run. */
+          log.warn("Skipping line '{}' with unparseable request number in "
+              + "{}.", line, bridgedbStatsCsvFile.toAbsolutePath());
+          continue;
+        }
+        readStatistics.put(key, value);
+      }
+      log.debug("Read {} containing {} non-header lines.", bridgedbStatsCsvFile,
+          readStatistics.size());
+    }
+    return readStatistics;
+  }
+
+  /**
+   * Parses BridgeDB metrics descriptors from the given directories and adds
+   * their aggregated request numbers to a copy of the previous statistics.
+   *
+   * <p>Descriptors that are not version 1 BridgeDB metrics or that do not
+   * contain metric counts are ignored. A descriptor is also ignored as a
+   * whole if any of its aggregated keys is already contained in the
+   * statistics, so that the first descriptor processed for a date wins.</p>
+   *
+   * @param previousStatistics Previously written statistics, which are not
+   *     modified.
+   * @param descriptorsDirectories Directories to read descriptors from.
+   * @return Previous statistics plus newly parsed statistics.
+   */
+  static SortedMap<String, Long> parseStatistics(
+      SortedMap<String, Long> previousStatistics,
+      File[] descriptorsDirectories) {
+    SortedMap<String, Long> currentStatistics
+        = new TreeMap<>(previousStatistics);
+    for (Descriptor descriptor : DescriptorSourceFactory
+        .createDescriptorReader().readDescriptors(descriptorsDirectories)) {
+      if (!(descriptor instanceof BridgedbMetrics)) {
+        continue;
+      }
+      BridgedbMetrics bridgedbMetrics = (BridgedbMetrics) descriptor;
+      if (!"1".equals(bridgedbMetrics.bridgedbMetricsVersion())) {
+        log.warn("Unable to process BridgeDB metrics version {} != 1.",
+            bridgedbMetrics.bridgedbMetricsVersion());
+        continue;
+      }
+      if (!bridgedbMetrics.bridgedbMetricCounts().isPresent()) {
+        continue;
+      }
+      /* Use the date of the metrics interval end as date of all requests. */
+      String bridgedbMetricsEndDate = bridgedbMetrics.bridgedbMetricsEnd()
+          .toLocalDate().toString();
+      SortedMap<String, Long> parsedStatistics = new TreeMap<>();
+      for (Map.Entry<String, Long> bridgedbMetricCount
+          : bridgedbMetrics.bridgedbMetricCounts().get().entrySet()) {
+        /* Metric keys are split at dots; only the first three parts
+         * (distributor, transport, country code or email provider) are used
+         * here. */
+        String[] keyParts = bridgedbMetricCount.getKey().split("\\.");
+        if (keyParts.length < 3) {
+          /* Unable to extract relevant key parts. */
+          continue;
+        }
+        /* NOTE(review): 10 is presumably the bin size used by BridgeDB, in
+         * which case smaller counts cannot occur in valid descriptors;
+         * confirm against the BridgeDB metrics specification. */
+        if (bridgedbMetricCount.getValue() < 10) {
+          log.warn("Skipping too small BridgeDB metric count {} < 10 in {}.",
+              bridgedbMetricCount.getValue(),
+              descriptor.getDescriptorFile().getAbsolutePath());
+          continue;
+        }
+        String distributor = keyParts[0];
+        String transport = keyParts[1];
+        String ccOrEmail = keyParts[2];
+        if (ccOrEmail.equals("zz")) {
+          /* Skip requests coming in over Tor exits. */
+          continue;
+        }
+        String key = String.format("%s,%s,%s", bridgedbMetricsEndDate,
+            distributor, transport);
+        /* Subtract 5 (presumably half the bin size) from each count to
+         * better approximate the count before binning, then add it to the
+         * sum for this date, distributor, and transport. */
+        parsedStatistics.merge(key, bridgedbMetricCount.getValue() - 5L,
+            Long::sum);
+      }
+      if (!Collections.disjoint(currentStatistics.keySet(),
+          parsedStatistics.keySet())) {
+        /* Statistics for this date (and any combination of distributor and
+         * transport) are already contained. */
+        continue;
+      }
+      currentStatistics.putAll(parsedStatistics);
+    }
+    return currentStatistics;
+  }
+
+  /**
+   * Writes the given statistics to the given CSV file, creating parent
+   * directories as needed and replacing any previous file content.
+   *
+   * @param bridgedbStatsCsvFile CSV file to write.
+   * @param currentStatistics Statistics to write, one line per entry.
+   * @throws IOException Thrown if directories or the file cannot be written.
+   */
+  static void writeBridgedbStatsFile(Path bridgedbStatsCsvFile,
+      SortedMap<String, Long> currentStatistics) throws IOException {
+    Path parentDirectory = bridgedbStatsCsvFile.getParent();
+    if (null != parentDirectory && !Files.exists(parentDirectory)) {
+      Files.createDirectories(parentDirectory);
+    }
+    List<String> lines = new ArrayList<>();
+    lines.add("date,distributor,transport,requests");
+    for (Map.Entry<String, Long> statistic : currentStatistics.entrySet()) {
+      lines.add(String.format("%s,%d", statistic.getKey(),
+          statistic.getValue()));
+    }
+    /* Truncate explicitly: with CREATE alone an existing file is overwritten
+     * from the start but keeps any trailing bytes beyond the new content. */
+    Files.write(bridgedbStatsCsvFile, lines, StandardOpenOption.CREATE,
+        StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
+    log.debug("Wrote {} containing {} non-header lines.", bridgedbStatsCsvFile,
+        lines.size() - 1);
+  }
+}
+
diff --git a/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java b/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java
index 1d0d840..26911ab 100644
--- a/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java
@@ -20,6 +20,7 @@ public class Main {
         DescriptorSourceFactory.createDescriptorCollector();
     collector.collectDescriptors(
         "https://collector.torproject.org", new String[] {
+            "/recent/bridgedb-metrics/",
             "/recent/bridge-descriptors/extra-infos/",
             "/recent/bridge-descriptors/server-descriptors/",
             "/recent/bridge-descriptors/statuses/",
diff --git a/src/main/java/org/torproject/metrics/stats/main/Main.java b/src/main/java/org/torproject/metrics/stats/main/Main.java
index 1dec6a3..4ea15b4 100644
--- a/src/main/java/org/torproject/metrics/stats/main/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/main/Main.java
@@ -56,7 +56,8 @@ public class Main {
         org.torproject.metrics.stats.clients.Main.class,
         org.torproject.metrics.stats.servers.Main.class,
         org.torproject.metrics.stats.webstats.Main.class,
-        org.torproject.metrics.stats.totalcw.Main.class
+        org.torproject.metrics.stats.totalcw.Main.class,
+        org.torproject.metrics.stats.bridgedb.Main.class
     };
 
     for (Class<?> module : modules) {
@@ -84,7 +85,8 @@ public class Main {
         new File(modulesDir, "clients/stats/userstats-combined.csv"),
         new File(modulesDir, "servers/stats"),
         new File(modulesDir, "webstats/stats"),
-        new File(modulesDir, "totalcw/stats")
+        new File(modulesDir, "totalcw/stats"),
+        new File(modulesDir, "bridgedb/stats")
     };
     List<String> copiedFiles = new ArrayList<>();
     for (File moduleStatsDir : moduleStatsDirs) {
diff --git a/src/main/resources/web.xml b/src/main/resources/web.xml
index 10d12ee..08a3bcd 100644
--- a/src/main/resources/web.xml
+++ b/src/main/resources/web.xml
@@ -57,6 +57,8 @@
     <url-pattern>/bridges-ipv6.html</url-pattern>
     <url-pattern>/advbw-ipv6.html</url-pattern>
     <url-pattern>/totalcw.html</url-pattern>
+    <url-pattern>/bridgedb-transport.html</url-pattern>
+    <url-pattern>/bridgedb-distributor.html</url-pattern>
   </servlet-mapping>
 
   <servlet>
@@ -201,6 +203,12 @@
     <url-pattern>/totalcw.png</url-pattern>
     <url-pattern>/totalcw.pdf</url-pattern>
     <url-pattern>/totalcw.csv</url-pattern>
+    <url-pattern>/bridgedb-transport.png</url-pattern>
+    <url-pattern>/bridgedb-transport.pdf</url-pattern>
+    <url-pattern>/bridgedb-transport.csv</url-pattern>
+    <url-pattern>/bridgedb-distributor.png</url-pattern>
+    <url-pattern>/bridgedb-distributor.pdf</url-pattern>
+    <url-pattern>/bridgedb-distributor.csv</url-pattern>
   </servlet-mapping>
 
   <servlet>
diff --git a/src/main/resources/web/json/categories.json b/src/main/resources/web/json/categories.json
index 3771631..d091ed7 100644
--- a/src/main/resources/web/json/categories.json
+++ b/src/main/resources/web/json/categories.json
@@ -11,6 +11,8 @@
       "userstats-bridge-transport",
       "userstats-bridge-combined",
       "userstats-bridge-version",
+      "bridgedb-transport",
+      "bridgedb-distributor",
       "userstats-relay-table",
       "userstats-censorship-events",
       "userstats-bridge-table",
diff --git a/src/main/resources/web/json/metrics.json b/src/main/resources/web/json/metrics.json
index 54ce78b..b8921c7 100644
--- a/src/main/resources/web/json/metrics.json
+++ b/src/main/resources/web/json/metrics.json
@@ -447,5 +447,27 @@
       "start",
       "end"
     ]
+  },
+  {
+    "id": "bridgedb-transport",
+    "title": "BridgeDB requests by requested transport",
+    "type": "Graph",
+    "description": "<p>This graph shows the number of BridgeDB requests for each requested transport. BridgeDB requests over Tor are not included in these numbers.</p>",
+    "function": "bridgedb_transport",
+    "parameters": [
+      "start",
+      "end"
+    ]
+  },
+  {
+    "id": "bridgedb-distributor",
+    "title": "BridgeDB requests by distributor",
+    "type": "Graph",
+    "description": "<p>This graph shows the number of BridgeDB requests for each distributor. HTTPS requests over Tor are not included in these numbers.</p>",
+    "function": "bridgedb_distributor",
+    "parameters": [
+      "start",
+      "end"
+    ]
   }
 ]
diff --git a/src/main/resources/web/jsps/reproducible-metrics.jsp b/src/main/resources/web/jsps/reproducible-metrics.jsp
index 9b21aa7..ed3ad4c 100644
--- a/src/main/resources/web/jsps/reproducible-metrics.jsp
+++ b/src/main/resources/web/jsps/reproducible-metrics.jsp
@@ -230,6 +230,48 @@ We therefore refer to Step 4 of the <a href="#relay-users">Relay users</a> descr
 </div>
 
 <div class="container">
+<h3 id="bridgedb-requests" class="hover">BridgeDB requests
+<a href="#bridgedb-requests" class="anchor">#</a>
+</h3>
+
+<p>BridgeDB metrics contain aggregated information about requests to the BridgeDB service.
+BridgeDB keeps track of each request per distribution method (HTTPS, moat, email), per bridge type (e.g., <code>vanilla</code> or <code>obfs4</code>), per country code or email provider (e.g., <code>"ru"</code> or <code>"gmail"</code>), and per request success (<code>"success"</code> or <code>"fail"</code>).
+Every 24 hours, BridgeDB writes these metrics to disk and then begins a new measurement interval.</p>
+
+<p>The following description applies to the following graphs:</p>
+
+<ul>
+<li>BridgeDB requests by requested transport <a href="/bridgedb-transport.html" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> graph</a></li>
+<li>BridgeDB requests by distributor <a href="/bridgedb-distributor.html" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> graph</a></li>
+</ul>
+
+<h4>Step 1: Parse BridgeDB metrics to obtain reported request numbers</h4>
+
+<p>Obtain BridgeDB metrics from <a href="/collector.html#type-bridgedb-metrics">CollecTor</a>.
+Refer to the <a href="https://gitweb.torproject.org/bridgedb.git/tree/doc/bridgedb-metrics-spec.txt">BridgeDB metrics specification</a> for details on the descriptor format.</p>
+
+<h4>Step 2: Skip requests coming in over Tor exits</h4>
+
+<p>Skip any request counts with <code>"zz"</code> as their <code>CC/EMAIL</code> metrics key part.
+We use the <code>"zz"</code> pseudo country code for requests originating from Tor exit relays.
+We're discarding these requests because <a href="https://bugs.torproject.org/32117">bots use the Tor network to crawl BridgeDB</a>, and including bot requests would provide a
+false sense of how users interact with BridgeDB.
+Note that BridgeDB maintains a separate distribution pool for requests coming from Tor exit relays.</p>
+
+<h4>Step 3: Aggregate requests by date, distributor, and transport</h4>
+
+<p>BridgeDB metrics contain request numbers broken down by distributor, bridge type, and a few more dimensions.
+For our purposes we only care about total request numbers by date and either distributor or transport.
+Our total request number includes both successful (i.e., the user ended up getting bridge lines)
+and unsuccessful (e.g., the user failed to solve the CAPTCHA) requests.
+We're using request sums by these three dimensions as aggregates and we are subtracting <code>bin_size/2</code>
+from each count to better approximate the count before binning.
+As date we're using the date of the BridgeDB metrics interval end.
+If we encounter more than one BridgeDB metrics interval end on the same UTC date (which shouldn't be possible with an interval length of 24 hours), we arbitrarily keep whichever we process first.</p>
+
+</div>
+
+<div class="container">
 <h2><i class="fa fa-server fa-fw" aria-hidden="true"></i>
 Servers <a href="#servers" name="servers" class="anchor">#</a></h2>
 
diff --git a/src/main/resources/web/jsps/stats.jsp b/src/main/resources/web/jsps/stats.jsp
index 3ce295d..25eb239 100644
--- a/src/main/resources/web/jsps/stats.jsp
+++ b/src/main/resources/web/jsps/stats.jsp
@@ -54,6 +54,7 @@ https://metrics.torproject.org/identifier.csv
 <li><b>June 2, 2019:</b> Added <a href="#onionperf-throughput">Throughput</a> graph.</li>
 <li><b>August 5, 2019:</b> Re-added the <a href="#bandwidth">Total relay bandwidth</a> graph due to popular demand.</li>
 <li><b>October 2, 2019:</b> Added <a href="#webstats-tb-channel">Tor Browser updates by release channel</a> graph.</li>
+<li><b>December 18, 2019:</b> Added <a href="#bridgedb-transport">BridgeDB requests by requested transport</a> and <a href="#bridgedb-distributor">BridgeDB requests by distributor</a> graphs.</li>
 </ul>
 
 </div>
@@ -178,6 +179,46 @@ using bridges, which can be either <b>"v4"</b> or <b>"v6"</b>.</li>
 <li><b>frac:</b> Fraction of bridges in percent that the estimate is based on.</li>
 </ul>
 
+<h3>BridgeDB requests by requested transport
+<a href="/bridgedb-transport.html" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> graph</a>
+<a href="/bridgedb-transport.csv" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> data</a>
+<a href="#bridgedb-transport" name="bridgedb-transport" class="anchor">#</a></h3>
+
+<h4>Parameters</h4>
+
+<ul>
+<li><b>start:</b> First UTC date (YYYY-MM-DD) to include in the file.</li>
+<li><b>end:</b> Last UTC date (YYYY-MM-DD) to include in the file.</li>
+</ul>
+
+<h4>Columns</h4>
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) when requests were sent to BridgeDB.</li>
+<li><b>transport:</b> Name of the pluggable transport protocol, which includes <code>"obfs2"</code>, <code>"obfs3"</code>, <code>"obfs4"</code>, <code>"scramblesuit"</code>, and <code>"fte"</code>, and which will change in the future.</li>
+<li><b>requests:</b> Approximate number of requests for the given transport.</li>
+</ul>
+
+<h3>BridgeDB requests by distributor
+<a href="/bridgedb-distributor.html" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> graph</a>
+<a href="/bridgedb-distributor.csv" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> data</a>
+<a href="#bridgedb-distributor" name="bridgedb-distributor" class="anchor">#</a></h3>
+
+<h4>Parameters</h4>
+
+<ul>
+<li><b>start:</b> First UTC date (YYYY-MM-DD) to include in the file.</li>
+<li><b>end:</b> Last UTC date (YYYY-MM-DD) to include in the file.</li>
+</ul>
+
+<h4>Columns</h4>
+
+<ul>
+<li><b>date:</b> UTC date (YYYY-MM-DD) when requests were sent to BridgeDB.</li>
+<li><b>distributor:</b> Name of BridgeDB's distributor, which includes <code>"https"</code>, <code>"email"</code>, and <code>"moat"</code>, and which may change in the future.</li>
+<li><b>requests:</b> Approximate number of requests for the given distributor.</li>
+</ul>
+
 </div>
 
 <div class="container">



More information about the tor-commits mailing list