[tor-commits] [metrics-tasks/master] Add code behind bridge user counting report (#5807).

karsten at torproject.org karsten at torproject.org
Wed Oct 24 15:35:00 UTC 2012


commit d301479e3fdd8c1d17f02f6d72a8cf6eaa87e17e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Wed Oct 24 11:34:10 2012 -0400

    Add code behind bridge user counting report (#5807).
---
 task-5807/bridge-dirreq-stats.R          |  222 +++++++++++
 task-5807/run.sh                         |    3 +
 task-5807/src/EvalBridgeDirreqStats.java |  603 ++++++++++++++++++++++++++++++
 3 files changed, 828 insertions(+), 0 deletions(-)

diff --git a/task-5807/bridge-dirreq-stats.R b/task-5807/bridge-dirreq-stats.R
new file mode 100644
index 0000000..2a07a3a
--- /dev/null
+++ b/task-5807/bridge-dirreq-stats.R
@@ -0,0 +1,222 @@
+library(ggplot2)
+library(reshape)
+library(scales)
+
+# Commented out, because this graph takes a while to draw...
+#d <- read.csv("out/dirreq-responses", stringsAsFactors = FALSE,
+#  header = FALSE)
+#d <- data.frame(date = as.Date(d$V1), requests = d$V4,
+#  asrelay = ifelse(d$V3, "also seen as\nnon-bridge relays",
+#  "only seen as\nbridges"))
+#ggplot(d, aes(x = date, y = requests)) +
+#geom_point() +
+#facet_grid(asrelay ~ .) +
+#scale_x_date(name = "",
+#  labels = date_format("%b %Y"),
+#  minor_breaks = date_breaks("1 month")) +
+#scale_y_continuous(name = "", labels = comma_format(digits = 1))
+#ggsave("graphs/responses-single-bridges.png", width = 6, height = 3.5,
+#  dpi = 100)
+
+# ALTERNATIVE: out/bridge-dirreq-stats-no-relays
+# Read per-day bridge dirreq statistics (all bridges) and restrict them
+# to the report's evaluation period.  Later plots reuse this `b`.
+b <- read.csv("out/bridge-dirreq-stats-all-bridges",
+  stringsAsFactors = FALSE)
+b <- b[b$date >= "2011-07-01" & b$date <= "2012-09-30", ]
+
+# Byte-weighted estimate of the fraction of bridges reporting directory
+# requests.  NOTE(review): exact semantics of the n*/h* columns come
+# from EvalBridgeDirreqStats.java -- confirm there.
+x <- data.frame(date = b$date,
+  value = (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) /
+          ((b$ha + b$hc) * b$nabcd))
+x <- melt(x, id = "date")
+ggplot(x, aes(x = as.Date(date), y = value)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+# Spell out `limits` in full; the original `limit =` only worked via
+# R's partial argument matching, which is fragile.
+scale_y_continuous(name = "", limits = c(0, 1), labels = percent)
+ggsave("graphs/fraction.pdf", width = 6, height = 3, dpi = 100)
+
+# Reported directory request responses (ra + rb) scaled by 1/86400 to
+# a per-day mean, plotted over time.
+responses <- data.frame(date = as.Date(b$date),
+  value = (b$ra + b$rb) / 86400)
+ggplot(responses, aes(x = date, y = value)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/responses.pdf", width = 6, height = 3, dpi = 72)
+
+# Three-facet overview: reported directory requests (daily mean), the
+# byte-weighted fraction of bridges reporting them, and the
+# extrapolated network-wide total (reported requests divided by that
+# fraction).
+x <- data.frame(
+  date = as.Date(b$date),
+  responses = (b$ra + b$rb) / 86400,
+  fraction = (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) /
+    ((b$ha + b$hc) * b$nabcd),
+  totalresponses = ((b$ra + b$rb) * (b$ha + b$hc) *
+    b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 86400)
+x <- melt(x, id = "date")
+# Replace the melted variable names with numbered, line-broken facet
+# titles; the numeric prefixes fix the facet ordering.
+x <- data.frame(date = x$date, value = x$value, variable =
+  ifelse(x$variable == "responses",
+    "1. Reported directory\nrequests",
+  ifelse(x$variable == "fraction", paste("2. Estimated fraction\n",
+    "of bridges reporting\ndirectory requests", sep = ""),
+    "3. Estimated directory\nrequests in the\nnetwork")))
+ggplot(x, aes(x = as.Date(date), y = value)) +
+geom_line() +
+facet_grid(variable ~ ., scales = "free_y") +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/extrapolated-responses.pdf", width = 6, height = 5,
+  dpi = 72)
+
+# Unweighted fraction of bridges reporting directory requests, for
+# comparison with the byte-weighted estimate above.
+ggplot(b, aes(x = as.Date(date), y = (na + nb) / nabcd)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+# Spell out `limits` in full; the original `limit =` only worked via
+# R's partial argument matching, which is fragile.
+scale_y_continuous(name = "", limits = c(0, 1), labels = percent)
+ggsave("graphs/fraction-unweighted.pdf", width = 6, height = 3, dpi = 72)
+
+# Extrapolated total directory request responses in the network,
+# plotted as a daily mean (x2 / 86400).  The simpler unweighted
+# estimator x1 is kept commented out for reference.
+x <- data.frame(date = b$date,
+  #x1 = (b$ra + b$rb) * b$nabcd / (b$na + b$nb),
+  x2 = ((b$ra + b$rb) * (b$ha + b$hc) *
+  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb))
+#x <- melt(x, id = "date")
+ggplot(x, aes(x = as.Date(date), y = x2 / 86400)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/totalresponses.pdf", width = 6, height = 3, dpi = 72)
+
+# Daily means of the four bridge categories: na, nb, nc as reported,
+# and nd derived as the remainder of nabcd.  One facet per category.
+counts <- melt(
+  data.frame(date = b$date,
+    na = b$na / 86400,
+    nb = b$nb / 86400,
+    nc = b$nc / 86400,
+    nd = (b$nabcd - b$na - b$nb - b$nc) / 86400),
+  id = "date")
+ggplot(counts, aes(x = as.Date(date), y = value)) +
+geom_line() +
+facet_grid(variable ~ .) +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/n.pdf", width = 6, height = 7, dpi = 100)
+
+# Total written directory bytes (ha + hc) scaled by 1/86400 to a
+# per-day mean.
+hist_bytes <- data.frame(date = as.Date(b$date),
+  value = (b$ha + b$hc) / 86400)
+ggplot(hist_bytes, aes(x = date, y = value)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/history-bytes.pdf", width = 6, height = 3, dpi = 100)
+
+# Written directory bytes split into the ha and hc components, one
+# facet each, scaled by 1/86400.
+hw <- melt(
+  data.frame(date = b$date, ha = b$ha / 86400, hc = b$hc / 86400),
+  id = "date")
+ggplot(hw, aes(x = as.Date(date), y = value)) +
+geom_line() +
+facet_grid(variable ~ .) +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/h.pdf", width = 6, height = 5, dpi = 100)
+
+# Reported responses split into the ra and rb components, one facet
+# each, scaled by 1/86400.
+resp_hist <- melt(
+  data.frame(date = b$date, ra = b$ra / 86400, rb = b$rb / 86400),
+  id = "date")
+ggplot(resp_hist, aes(x = as.Date(date), y = value)) +
+geom_line() +
+facet_grid(variable ~ .) +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/r.pdf", width = 6, height = 5, dpi = 100)
+
+# Estimated total bridge users: extrapolated network-wide responses
+# divided by 864000 (= 86400 seconds * 10).  NOTE(review): the extra
+# factor of 10 presumably reflects an assumed 10 directory requests
+# per client per day -- confirm against the report text.
+x <- data.frame(date = b$date,
+  value = ((b$ra + b$rb) * (b$ha + b$hc) *
+  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 864000,
+  stringsAsFactors = FALSE)
+x <- melt(x, id = "date")
+ggplot(x, aes(x = as.Date(date), y = value)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/totalusers.pdf", width = 6, height = 3, dpi = 100)
+# Zoomed view of Q3 2012 only.
+x <- x[x$date >= '2012-07-01', ]
+# NOTE(review): if the line below is ever re-enabled, use
+# na.rm = TRUE (na.omit is not a max() argument) and drop the extra
+# / 864000 -- value is already scaled above.
+#max_y <- max(x$value / 864000, na.omit = FALSE)
+ggplot(x, aes(x = as.Date(date), y = value)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  breaks = date_breaks("1 month"),
+  minor_breaks = date_breaks("1 week")) +
+scale_y_continuous(name = "", #limit = c(0, max_y),
+  labels = comma_format(digits = 1))
+ggsave("graphs/totalusers-q3-2012.pdf", width = 6, height = 3, dpi = 100)
+
+# Number of relay consensuses counted per day, with a dashed
+# reference line at 19.5.
+consensus_counts <- data.frame(date = as.Date(b$date),
+  consensuses = b$consensuses)
+ggplot(consensus_counts, aes(x = date, y = consensuses)) +
+geom_point() +
+geom_hline(yintercept = 19.5, linetype = 2) +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/consensuses.pdf", width = 6, height = 3, dpi = 100)
+
+# Same extrapolation as above, scaled by the sy column before the
+# / 864000 user conversion in the aes() call.  NOTE(review): sy
+# appears to be the Syrian share derived from bridge stats -- confirm
+# against EvalBridgeDirreqStats.java.
+x <- data.frame(date = b$date,
+  value = (b$sy * (b$ra + b$rb) * (b$ha + b$hc) *
+  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb))
+x <- melt(x, id = "date")
+ggplot(x, aes(x = as.Date(date), y = value / 864000)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/syusers.pdf", width = 6, height = 3, dpi = 100)
+
+# Old-approach estimate of daily bridge users, taken from the "all"
+# column of the existing bridge-users.csv, same evaluation period.
+old_users <- read.csv("bridge-users.csv", stringsAsFactors = FALSE)
+old_users <- data.frame(date = old_users$date, all = old_users$all)
+old_users <- old_users[old_users$date >= "2011-07-01" &
+  old_users$date <= "2012-09-30", ]
+ggplot(old_users, aes(x = as.Date(date), y = all)) +
+geom_line() +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/totalusers-oldapproach.pdf", width = 6, height = 3,
+  dpi = 100)
+
+# Compare the old unique-IP-based estimate with the new
+# directory-request-based estimate in one faceted figure.  Both frames
+# use the same (date, value, variable) layout so they can be stacked
+# with rbind; the variable strings double as facet titles.
+u <- read.csv("bridge-users.csv", stringsAsFactors = FALSE)
+u <- u[u$date >= "2011-07-01" & u$date <= "2012-09-30", ]
+u <- data.frame(date = u$date, value = u$all,
+  variable = "old approach based on\nunique IP addresses",
+  stringsAsFactors = FALSE)
+# New estimate: same formula as the totalusers plot above, including
+# the / 864000 user conversion.
+x <- data.frame(date = b$date,
+  value = ((b$ra + b$rb) * (b$ha + b$hc) *
+  b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 864000,
+  variable = "new approach based on\ndirectory requests",
+  stringsAsFactors = FALSE)
+u <- rbind(u, x)
+ggplot(u, aes(x = as.Date(date), y = value)) +
+geom_line() +
+facet_grid(variable ~ ., scales = "free_y") +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  minor_breaks = date_breaks("1 month")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/compare-totalusers.pdf", width = 6, height = 4,
+  dpi = 100)
+# Zoomed comparison for Q3 2012 only.
+u <- u[u$date >= '2012-07-01', ]
+ggplot(u, aes(x = as.Date(date), y = value)) +
+geom_line() +
+facet_grid(variable ~ ., scales = "free_y") +
+scale_x_date(name = "",
+  labels = date_format("%b %Y"),
+  breaks = date_breaks("1 month"),
+  minor_breaks = date_breaks("1 week")) +
+scale_y_continuous(name = "", labels = comma_format(digits = 1))
+ggsave("graphs/compare-totalusers-q3-2012.pdf", width = 6, height = 4,
+  dpi = 100)
+
diff --git a/task-5807/run.sh b/task-5807/run.sh
new file mode 100755
index 0000000..52d1ee7
--- /dev/null
+++ b/task-5807/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Compile and run the descriptor parsing/aggregation step behind the
+# bridge user counting report.  Requires lib/commons-codec-1.4.jar,
+# lib/commons-compress-1.3.jar, and lib/descriptor.jar next to src/;
+# -Xmx6g because parsing a full descriptor archive is memory-hungry.
+javac -d bin/ -cp lib/commons-codec-1.4.jar:lib/commons-compress-1.3.jar:lib/descriptor.jar src/EvalBridgeDirreqStats.java && time java -Xmx6g -cp bin/:lib/commons-codec-1.4.jar:lib/commons-compress-1.3.jar:lib/descriptor.jar EvalBridgeDirreqStats
+
diff --git a/task-5807/src/EvalBridgeDirreqStats.java b/task-5807/src/EvalBridgeDirreqStats.java
new file mode 100644
index 0000000..c996a26
--- /dev/null
+++ b/task-5807/src/EvalBridgeDirreqStats.java
@@ -0,0 +1,603 @@
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.TimeZone;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.torproject.descriptor.BridgeNetworkStatus;
+import org.torproject.descriptor.Descriptor;
+import org.torproject.descriptor.DescriptorFile;
+import org.torproject.descriptor.DescriptorReader;
+import org.torproject.descriptor.DescriptorSourceFactory;
+import org.torproject.descriptor.ExtraInfoDescriptor;
+import org.torproject.descriptor.NetworkStatusEntry;
+import org.torproject.descriptor.RelayNetworkStatusConsensus;
+
+/* Extract relevant pieces of information from relay consensuses and
+ * bridge descriptors to estimate daily bridge users.  See README for
+ * usage instructions. */
+public class EvalBridgeDirreqStats {
+  public static void main(String[] args) throws Exception {
+
+    /* Parse relay consensuses from in/relay-descriptors/.  Skip this step
+     * if in/relay-descriptors/ does not exist. */
+    File consensusesDirectory = new File("in/relay-descriptors");
+    File hashedFingerprintsFile = new File("out/hashed-fingerprints");
+    File consensusesPerDayFile = new File("out/consensuses-per-day");
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    if (consensusesDirectory.exists()) {
+      SortedSet<String> hashedFingerprints = new TreeSet<String>();
+      SortedMap<String, Integer> consensusesPerDay =
+          new TreeMap<String, Integer>();
+      DescriptorReader descriptorReader =
+          DescriptorSourceFactory.createDescriptorReader();
+      descriptorReader.addDirectory(consensusesDirectory);
+      Iterator<DescriptorFile> descriptorFiles =
+          descriptorReader.readDescriptors();
+      while (descriptorFiles.hasNext()) {
+        DescriptorFile descriptorFile = descriptorFiles.next();
+        for (Descriptor descriptor : descriptorFile.getDescriptors()) {
+          if (!(descriptor instanceof RelayNetworkStatusConsensus)) {
+            continue;
+          }
+          RelayNetworkStatusConsensus consensus =
+              (RelayNetworkStatusConsensus) descriptor;
+
+          /* Extract hashed fingerprints of all known relays to remove
+           * those fingerprints from bridge usage statistics later on. */
+          for (NetworkStatusEntry statusEntry :
+              consensus.getStatusEntries().values()) {
+            hashedFingerprints.add(Hex.encodeHexString(DigestUtils.sha(
+                Hex.decodeHex(statusEntry.getFingerprint().
+                toCharArray()))).toUpperCase());
+          }
+
+          /* Count the number of consensuses per day. */
+          String date = dateFormat.format(
+              consensus.getValidAfterMillis());
+          int consensuses = 1;
+          if (consensusesPerDay.containsKey(date)) {
+            consensuses += consensusesPerDay.get(date);
+          }
+          consensusesPerDay.put(date, consensuses);
+        }
+      }
+      hashedFingerprintsFile.getParentFile().mkdirs();
+      BufferedWriter bw = new BufferedWriter(new FileWriter(
+          hashedFingerprintsFile));
+      for (String hashedFingerprint : hashedFingerprints) {
+        bw.write(hashedFingerprint + "\n");
+      }
+      bw.close();
+      consensusesPerDayFile.getParentFile().mkdirs();
+      bw = new BufferedWriter(new FileWriter(consensusesPerDayFile));
+      for (Map.Entry<String, Integer> e : consensusesPerDay.entrySet()) {
+        bw.write(e.getKey() + "," + e.getValue() + "\n");
+      }
+      bw.close();
+    }
+
+    /* Parse bridge network statuses from in/bridge-descriptors/.  Skip
+     * this step if in/bridge-descriptors/ does not exist. */
+    File bridgeDescriptorsDirectory = new File("in/bridge-descriptors");
+    File bridgesPerDayFile = new File("out/bridges-per-day");
+    File dirreqResponsesFile = new File("out/dirreq-responses");
+    File dirreqWriteHistoryFile = new File("out/dirreq-write-history");
+    File bridgeStatsUsersFile = new File("out/bridge-stats-users");
+    SimpleDateFormat dateTimeFormat = new SimpleDateFormat(
+        "yyyy-MM-dd HH:mm:ss");
+    dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    if (bridgeDescriptorsDirectory.exists()) {
+
+      /* Read hashed fingerprints from disk, so that we can include in the
+       * intermediate files whether a bridge was running as non-bridge
+       * relay before. */
+      SortedSet<String> hashedFingerprints = new TreeSet<String>();
+      String line;
+      BufferedReader br = new BufferedReader(new FileReader(
+          hashedFingerprintsFile));
+      while ((line = br.readLine()) != null) {
+        hashedFingerprints.add(line.toUpperCase());
+      }
+      br.close();
+
+      /* Prepare data structures for first collecting everything we parse.
+       * There may be duplicates which we can best remove in memory. */
+      SortedMap<String, List<Integer>> bridgesPerDay =
+          new TreeMap<String, List<Integer>>();
+      SortedSet<String> dirreqResponses = new TreeSet<String>();
+      SortedMap<String, SortedMap<Long, Long>> dirreqWriteHistory =
+          new TreeMap<String, SortedMap<Long, Long>>();
+      SortedSet<String> bridgeIps = new TreeSet<String>();
+
+      /* Parse everything in in/bridge-descriptors/. */
+      DescriptorReader descriptorReader =
+          DescriptorSourceFactory.createDescriptorReader();
+      descriptorReader.addDirectory(bridgeDescriptorsDirectory);
+      Iterator<DescriptorFile> descriptorFiles =
+          descriptorReader.readDescriptors();
+      while (descriptorFiles.hasNext()) {
+        DescriptorFile descriptorFile = descriptorFiles.next();
+        for (Descriptor descriptor : descriptorFile.getDescriptors()) {
+          if (descriptor instanceof BridgeNetworkStatus) {
+            BridgeNetworkStatus status = (BridgeNetworkStatus) descriptor;
+
+            /* Extract number of running bridges to calculate daily means.
+             * Skip network statuses where less than 1% of bridges have
+             * the Running flag. */
+            String date = dateFormat.format(status.getPublishedMillis());
+            int totalBridges = 0, runningBridges = 0;
+            for (NetworkStatusEntry statusEntry :
+                status.getStatusEntries().values()) {
+              totalBridges++;
+              if (statusEntry.getFlags().contains("Running")) {
+                runningBridges++;
+              }
+            }
+            if (runningBridges * 100 > totalBridges) {
+              if (!bridgesPerDay.containsKey(date)) {
+                bridgesPerDay.put(date, new ArrayList<Integer>());
+              }
+              bridgesPerDay.get(date).add(runningBridges);
+            }
+          } else if (descriptor instanceof ExtraInfoDescriptor) {
+            ExtraInfoDescriptor extraInfoDescriptor =
+                (ExtraInfoDescriptor) descriptor;
+            String fingerprint = extraInfoDescriptor.getFingerprint().
+                toUpperCase();
+            String wasSeenAsRelay = hashedFingerprints.contains(
+                fingerprint) ? "TRUE" : "FALSE";
+
+            /* Extract v3 directory request response numbers from dirreq
+             * stats, if available. */
+            if (extraInfoDescriptor.getDirreqStatsEndMillis() >= 0 &&
+                extraInfoDescriptor.getDirreqStatsIntervalLength()
+                  == 86400 &&
+                extraInfoDescriptor.getDirreqV3Resp() != null &&
+                extraInfoDescriptor.getDirreqV3Resp().containsKey("ok")) {
+              String dirreqStatsEnd = dateTimeFormat.format(
+                  extraInfoDescriptor.getDirreqStatsEndMillis());
+              SortedMap<String, Integer> resp =
+                  extraInfoDescriptor.getDirreqV3Resp();
+              String ok = String.valueOf(resp.get("ok"));
+              String notEnoughSigs = resp.containsKey("not-enough-sigs")
+                  ? String.valueOf(resp.get("not-enough-sigs")) : "NA";
+              String unavailable = resp.containsKey("unavailable")
+                  ? String.valueOf(resp.get("unavailable")) : "NA";
+              String notFound = resp.containsKey("not-found")
+                  ? String.valueOf(resp.get("not-found")) : "NA";
+              String notModified = resp.containsKey("not-modified")
+                  ? String.valueOf(resp.get("not-modified")) : "NA";
+              String busy = resp.containsKey("busy")
+                  ? String.valueOf(resp.get("busy")) : "NA";
+              dirreqResponses.add(String.format(
+                  "%s,%s,%s,%s,%s,%s,%s,%s%n", dirreqStatsEnd,
+                  fingerprint, wasSeenAsRelay, ok, notEnoughSigs,
+                  unavailable, notFound, notModified, busy));
+            }
+
+            /* Extract written directory bytes, if available. */
+            if (extraInfoDescriptor.getDirreqWriteHistory() != null &&
+                extraInfoDescriptor.getDirreqWriteHistory().
+                getIntervalLength() == 900) {
+              if (!dirreqWriteHistory.containsKey(fingerprint)) {
+                dirreqWriteHistory.put(fingerprint,
+                    new TreeMap<Long, Long>());
+              }
+              dirreqWriteHistory.get(fingerprint).putAll(
+                  extraInfoDescriptor.getDirreqWriteHistory().
+                  getBandwidthValues());
+            }
+
+            /* Sum up unique IP address counts from .sy and from all
+             * countries from bridge stats, if available. */
+            if (extraInfoDescriptor.getBridgeStatsEndMillis() >= 0 &&
+                extraInfoDescriptor.getBridgeStatsIntervalLength()
+                  == 86400 &&
+                extraInfoDescriptor.getBridgeIps() != null) {
+              String bridgeStatsEnd = dateTimeFormat.format(
+                  extraInfoDescriptor.getBridgeStatsEndMillis());
+              int sy = 0, all = 0;
+              for (Map.Entry<String, Integer> e :
+                  extraInfoDescriptor.getBridgeIps().entrySet()) {
+                String country = e.getKey();
+                int adjustedIps = e.getValue() - 4;
+                if (country.equals("sy")) {
+                  sy = adjustedIps;
+                }
+                all += adjustedIps;
+              }
+              bridgeIps.add(String.format("%s,%s,%s,%d,%d%n",
+                  bridgeStatsEnd, fingerprint, wasSeenAsRelay, sy, all));
+            }
+          }
+        }
+      }
+
+      /* Write to disk what we learned while parsing bridge extra-info
+       * descriptors. */
+      bridgesPerDayFile.getParentFile().mkdirs();
+      BufferedWriter bw = new BufferedWriter(new FileWriter(
+          bridgesPerDayFile));
+      for (Map.Entry<String, List<Integer>> e :
+          bridgesPerDay.entrySet()) {
+        String date = e.getKey();
+        List<Integer> bridges = e.getValue();
+        int sum = 0;
+        for (int b : bridges) {
+          sum += b;
+        }
+        bw.write(String.format("%s,%d%n", date, sum / bridges.size()));
+      }
+      bw.close();
+      dirreqResponsesFile.getParentFile().mkdirs();
+      bw = new BufferedWriter(new FileWriter(dirreqResponsesFile));
+      for (String resp : dirreqResponses) {
+        bw.write(resp);
+      }
+      bw.close();
+      bridgeStatsUsersFile.getParentFile().mkdirs();
+      bw = new BufferedWriter(new FileWriter(bridgeStatsUsersFile));
+      for (String ips : bridgeIps) {
+        bw.write(ips);
+      }
+      bw.close();
+      bw = new BufferedWriter(new FileWriter(dirreqWriteHistoryFile));
+      for (Map.Entry<String, SortedMap<Long, Long>> e :
+          dirreqWriteHistory.entrySet()) {
+        String fingerprint = e.getKey();
+        String wasSeenAsRelay = hashedFingerprints.contains(
+            fingerprint) ? "TRUE" : "FALSE";
+        for (Map.Entry<Long, Long> f : e.getValue().entrySet()) {
+          String historyIntervalEnd = dateTimeFormat.format(f.getKey());
+          bw.write(String.format("%s,%s,%d,%s%n", fingerprint,
+              historyIntervalEnd, f.getValue(), wasSeenAsRelay));
+        }
+      }
+      bw.close();
+    }
+
+    /* Aggregate the parse results from above and write relevant data for
+     * estimating daily bridge users to disk.  Write results to
+     * out/bridge-dirreq-stats.  This step is distinct from the parsing
+     * steps, so that the parsing only has to be done once, whereas the
+     * aggregation can be tweaked and re-run easily. */
+    File bridgeDirreqStatsNoRelaysFile =
+        new File("out/bridge-dirreq-stats-no-relays");
+    File bridgeDirreqStatsAllBridgesFile =
+        new File("out/bridge-dirreq-stats-all-bridges");
+    if (bridgesPerDayFile.exists() &&
+        dirreqResponsesFile.exists() &&
+        bridgeStatsUsersFile.exists() &&
+        dirreqWriteHistoryFile.exists() &&
+        consensusesPerDayFile.exists()) {
+
+      /* Run the aggregation twice, once for all bridges and once for only
+       * bridges which haven't been seen as non-bridge relays before. */
+      boolean[] exclude = new boolean[] { true, false };
+      File[] outFiles = new File[] { bridgeDirreqStatsNoRelaysFile,
+          bridgeDirreqStatsAllBridgesFile };
+      for (int r = 0; r < 2; r++) {
+        boolean excludeHashedFingerprints = exclude[r];
+        File outFile = outFiles[r];
+
+        /* Read parse results back to memory. */
+        SortedMap<String, Integer> bridgesPerDay =
+            new TreeMap<String, Integer>();
+        BufferedReader br = new BufferedReader(new FileReader(
+            bridgesPerDayFile));
+        String line;
+        while ((line = br.readLine()) != null) {
+          String[] parts = line.split(",");
+          bridgesPerDay.put(parts[0], Integer.parseInt(parts[1]));
+        }
+        br.close();
+        SortedMap<String, SortedMap<Long, Long>> dirreqOkResponses =
+            new TreeMap<String, SortedMap<Long, Long>>();
+        br = new BufferedReader(new FileReader(dirreqResponsesFile));
+        while ((line = br.readLine()) != null) {
+          String[] parts = line.split(",");
+          if (excludeHashedFingerprints && parts[2].equals("TRUE")) {
+            /* Skip, because this bridge has been seen as relay before. */
+            continue;
+          }
+          String fingerprint = parts[1].toUpperCase();
+          long dirreqStatsEndMillis = dateTimeFormat.parse(parts[0]).
+              getTime();
+          long ok = Long.parseLong(parts[3]);
+          if (!dirreqOkResponses.containsKey(fingerprint)) {
+            dirreqOkResponses.put(fingerprint, new TreeMap<Long, Long>());
+          }
+          dirreqOkResponses.get(fingerprint).put(dirreqStatsEndMillis,
+              ok);
+        }
+        br.close();
+        SortedMap<String, long[]> ipsPerDay =
+            new TreeMap<String, long[]>();
+        br = new BufferedReader(new FileReader(bridgeStatsUsersFile));
+        while ((line = br.readLine()) != null) {
+          String[] parts = line.split(",");
+          if (excludeHashedFingerprints && parts[2].equals("TRUE")) {
+            /* Skip, because this bridge has been seen as relay before. */
+            continue;
+          }
+          long bridgeStatsEndMillis = dateTimeFormat.parse(parts[0]).
+              getTime();
+          long bridgeStatsStartMillis = bridgeStatsEndMillis - 86400000L;
+          long currentStartMillis = bridgeStatsStartMillis;
+
+          /* Find UTC date break in the interval and make sure that we
+           * distribute IPs to the two days correctly. */
+          String[] dates = new String[] {
+            dateFormat.format(bridgeStatsStartMillis),
+            dateFormat.format(bridgeStatsEndMillis) };
+          long[] seconds = new long[2];
+          if (!dates[0].equals(dates[1])) {
+            long dateBreakMillis = (bridgeStatsEndMillis / 86400000L)
+                * 86400000L;
+            seconds[0] = (dateBreakMillis - bridgeStatsStartMillis)
+                / 1000L;
+            bridgeStatsStartMillis = dateBreakMillis;
+          }
+          seconds[1] = (bridgeStatsEndMillis - bridgeStatsStartMillis)
+              / 1000L;
+
+          /* Update per-day counters. */
+          for (int i = 0; i < dates.length; i++) {
+            String date = dates[i];
+            long sy = seconds[i] * Long.parseLong(parts[3]);
+            long all = seconds[i] * Long.parseLong(parts[4]);
+            if (!ipsPerDay.containsKey(date)) {
+              ipsPerDay.put(date, new long[] { 0L, 0L });
+            }
+            ipsPerDay.get(date)[0] += sy;
+            ipsPerDay.get(date)[1] += all;
+          }
+        }
+        br.close();
+        SortedMap<String, Integer> consensusesPerDay =
+            new TreeMap<String, Integer>();
+        br = new BufferedReader(new FileReader(consensusesPerDayFile));
+        while ((line = br.readLine()) != null) {
+          String[] parts = line.split(",");
+          consensusesPerDay.put(parts[0], Integer.parseInt(parts[1]));
+        }
+        br.close();
+        br = new BufferedReader(new FileReader(dirreqWriteHistoryFile));
+        SortedMap<String, SortedMap<Long, Long>> dirreqWriteHistory =
+            new TreeMap<String, SortedMap<Long, Long>>();
+        while ((line = br.readLine()) != null) {
+          String[] parts = line.split(",");
+          if (excludeHashedFingerprints && parts[3].equals("TRUE")) {
+            /* Skip, because this bridge has been seen as relay before. */
+            continue;
+          }
+          String fingerprint = parts[0].toUpperCase();
+          long historyIntervalEndMillis = dateTimeFormat.parse(parts[1]).
+              getTime();
+          long writtenBytes = Long.parseLong(parts[2]);
+          if (!dirreqWriteHistory.containsKey(fingerprint)) {
+            dirreqWriteHistory.put(fingerprint, new TreeMap<Long, Long>());
+          }
+          dirreqWriteHistory.get(fingerprint).put(historyIntervalEndMillis,
+              writtenBytes);
+        }
+        br.close();
+
+        /* For every day, count reported v3 directory request responses,
+         * reported written directory bytes, and reporting bridges.
+         * Distinguish between bridges reporting both responses and bytes,
+         * bridges reporting only responses, and bridges reporting.  Map
+         * keys are dates, map values are the number of responses, bytes,
+         * or bridges. */
+        SortedMap<String, Long>
+            responsesReportingBoth = new TreeMap<String, Long>(),
+            responsesNotReportingBytes = new TreeMap<String, Long>(),
+            bytesReportingBoth = new TreeMap<String, Long>(),
+            bytesNotReportingResponses = new TreeMap<String, Long>(),
+            bridgesReportingBoth = new TreeMap<String, Long>(),
+            bridgesNotReportingBytes = new TreeMap<String, Long>(),
+            bridgesNotReportingResponses = new TreeMap<String, Long>();
+
+        /* Consider each bridge separately. */
+        SortedSet<String> allFingerprints = new TreeSet<String>();
+        allFingerprints.addAll(dirreqOkResponses.keySet());
+        allFingerprints.addAll(dirreqWriteHistory.keySet());
+        for (String fingerprint : allFingerprints) {
+
+          /* Obtain iterators over dirreq stats intervals and dirreq write
+           * history intervals, from oldest to newest.  Either iterator
+           * may contain zero elements if the bridge did not report any
+           * values, but not both. */
+          SortedMap<Long, Long> bridgeDirreqOkResponses =
+              dirreqOkResponses.containsKey(fingerprint) ?
+              dirreqOkResponses.get(fingerprint) :
+              new TreeMap<Long, Long>();
+          SortedMap<Long, Long> bridgeDirreqWriteHistory =
+              dirreqWriteHistory.containsKey(fingerprint) ?
+              dirreqWriteHistory.get(fingerprint) :
+              new TreeMap<Long, Long>();
+          Iterator<Long> responsesIterator =
+              bridgeDirreqOkResponses.keySet().iterator();
+          Iterator<Long> historyIterator =
+              bridgeDirreqWriteHistory.keySet().iterator();
+
+          /* Keep references to the currently considered intervals. */
+          long responseEndMillis = responsesIterator.hasNext() ?
+              responsesIterator.next() : Long.MAX_VALUE;
+          long historyEndMillis = historyIterator.hasNext() ?
+              historyIterator.next() : Long.MAX_VALUE;
+
+          /* Keep the time until when we have processed statistics. */
+          long currentStartMillis = 0L;
+
+          /* Iterate over both responses and byte histories until we set
+           * both to Long.MAX_VALUE, indicating that there are no further
+           * values. */
+          while (responseEndMillis < Long.MAX_VALUE ||
+              historyEndMillis < Long.MAX_VALUE) {
+
+            /* Dirreq-stats intervals are guaranteed to be 24 hours long,
+             * and dirreq-write-history intervals are 15 minutes long.
+             * This is guaranteed in the parsing code above.  It allows us
+             * to calculate interval starts.  Also, if we have already
+             * processed part of an interval, move the considered interval
+             * start accordingly. */
+            long historyStartMillis = Math.max(currentStartMillis,
+                historyEndMillis - 900000L);
+            long responseStartMillis = Math.max(currentStartMillis,
+                responseEndMillis - 86400000L);
+
+            /* Determine start and end time of the next interval, and
+             * whether the bridge reported dirreq-stats in that interval,
+             * or dirreq histories, or both. */
+            long currentEndMillis;
+            boolean addHistory = false, addResponses = false;
+            if (historyStartMillis < responseStartMillis) {
+              currentStartMillis = historyStartMillis;
+              currentEndMillis = Math.min(historyEndMillis,
+                  responseStartMillis);
+              addHistory = true;
+            } else if (responseStartMillis < historyStartMillis) {
+              currentStartMillis = responseStartMillis;
+              currentEndMillis = Math.min(historyStartMillis,
+                  responseEndMillis);
+              addResponses = true;
+            } else {
+              currentStartMillis = historyStartMillis;
+              currentEndMillis = Math.min(historyEndMillis,
+                  responseEndMillis);
+              addHistory = true;
+              addResponses = true;
+            }
+
+            /* Depending on which statistics the bridge reported in the
+             * determined interval, obtain the number of bytes or
+             * responses to add. */
+            long bytesInInterval = 0L, responsesInInterval = 0L;
+            if (addHistory) {
+              bytesInInterval = bridgeDirreqWriteHistory.
+                  get(historyEndMillis);
+            }
+            if (addResponses) {
+              responsesInInterval = bridgeDirreqOkResponses.
+                  get(responseEndMillis);
+            }
+
+            /* Find out if there is a UTC date break in the interval to be
+             * added.  If there is, make sure that we distribute responses
+             * and bytes to the two days correctly. */
+            String[] dates = new String[] {
+              dateFormat.format(currentStartMillis),
+              dateFormat.format(currentEndMillis) };
+            long[] seconds = new long[2];
+            if (!dates[0].equals(dates[1])) {
+              long dateBreakMillis = (currentEndMillis / 86400000L)
+                  * 86400000L;
+              seconds[0] = (dateBreakMillis - currentStartMillis) / 1000L;
+              currentStartMillis = dateBreakMillis;
+            }
+            seconds[1] = (currentEndMillis - currentStartMillis) / 1000L;
+
+            /* Update per-day counters. */
+            for (int i = 0; i < dates.length; i++) {
+              String date = dates[i];
+              long bytes = seconds[i] * bytesInInterval;
+              long responses = seconds[i] * responsesInInterval;
+              if (!bytesReportingBoth.containsKey(date)) {
+                bytesReportingBoth.put(date, 0L);
+                bytesNotReportingResponses.put(date, 0L);
+                responsesReportingBoth.put(date, 0L);
+                responsesNotReportingBytes.put(date, 0L);
+                bridgesReportingBoth.put(date, 0L);
+                bridgesNotReportingBytes.put(date, 0L);
+                bridgesNotReportingResponses.put(date, 0L);
+              }
+              if (addHistory) {
+                if (addResponses) {
+                  bytesReportingBoth.put(date,
+                      bytesReportingBoth.get(date) + bytes);
+                  responsesReportingBoth.put(date,
+                      responsesReportingBoth.get(date) + responses);
+                  bridgesReportingBoth.put(date,
+                      bridgesReportingBoth.get(date) + seconds[i]);
+                } else {
+                  bytesNotReportingResponses.put(date,
+                      bytesNotReportingResponses.get(date) + bytes);
+                  bridgesNotReportingResponses.put(date,
+                      bridgesNotReportingResponses.get(date)
+                      + seconds[i]);
+                }
+              } else if (addResponses) {
+                responsesNotReportingBytes.put(date,
+                    responsesNotReportingBytes.get(date) + responses);
+                bridgesNotReportingBytes.put(date,
+                    bridgesNotReportingBytes.get(date) + seconds[i]);
+              }
+            }
+
+            /* Move next interval start to the current interval end, and
+             * possibly move to the next stats intervals.  If we have run
+             * out of intervals in either or both of the sets, change the
+             * reference to Long.MAX_VALUE to add the other intervals and
+             * finally exit the loop. */
+            currentStartMillis = currentEndMillis;
+            if (historyEndMillis <= currentStartMillis) {
+              historyEndMillis = historyIterator.hasNext() ?
+                  historyIterator.next() : Long.MAX_VALUE;
+            }
+            if (responseEndMillis <= currentStartMillis) {
+              responseEndMillis = responsesIterator.hasNext() ?
+                  responsesIterator.next() : Long.MAX_VALUE;
+            }
+          }
+        }
+
+        /* Put together what we learned about bridge usage per day. */
+        outFile.getParentFile().mkdirs();
+        BufferedWriter bw = new BufferedWriter(new FileWriter(outFile));
+        bw.write("date,nabcd,sy,consensuses,ha,hc,ra,rb,na,nb,nc\n");
+        for (String date : bytesReportingBoth.keySet()) {
+          String bridges = "NA";
+          if (bridgesPerDay.containsKey(date)) {
+            bridges = String.valueOf(bridgesPerDay.get(date) * 86400L);
+          }
+          String sy = "NA";
+          if (ipsPerDay.containsKey(date)) {
+            long[] ips = ipsPerDay.get(date);
+            sy = String.format("%.5f", ((double) ips[0])
+                / ((double) ips[1]));
+          }
+          String consensuses = "NA";
+          if (consensusesPerDay.containsKey(date)) {
+            consensuses = String.valueOf(consensusesPerDay.get(date));
+          }
+          bw.write(String.format("%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d%n",
+              date, bridges, sy, consensuses,
+              bytesReportingBoth.get(date),
+              bytesNotReportingResponses.get(date),
+              responsesReportingBoth.get(date),
+              responsesNotReportingBytes.get(date),
+              bridgesReportingBoth.get(date),
+              bridgesNotReportingBytes.get(date),
+              bridgesNotReportingResponses.get(date)));
+        }
+        bw.close();
+      }
+    }
+  }
+}
+



More information about the tor-commits mailing list