[tor-commits] [metrics-tasks/master] Add #4499 sources and tech report draft.

karsten at torproject.org karsten at torproject.org
Mon Feb 6 14:55:06 UTC 2012


commit 9b1e4846c3d602f8097f2700cf228c3df3765e98
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Feb 6 15:54:16 2012 +0100

    Add #4499 sources and tech report draft.
---
 task-4499/.gitignore                               |    9 +
 .../GenerateSampleBridgeDescriptorTarballs.java    |  266 ++++++++++++++++++++
 task-4499/README                                   |   56 ++++
 task-4499/bridge-scaling.R                         |   23 ++
 task-4499/bridge-scaling.csv                       |   24 ++
 task-4499/bridge-scaling.tex                       |  117 +++++++++
 6 files changed, 495 insertions(+), 0 deletions(-)

diff --git a/task-4499/.gitignore b/task-4499/.gitignore
new file mode 100644
index 0000000..44c09e8
--- /dev/null
+++ b/task-4499/.gitignore
@@ -0,0 +1,9 @@
+*.class
+*.jar
+in/
+out/
+*.png
+*.aux
+*.log
+*.pdf
+
diff --git a/task-4499/GenerateSampleBridgeDescriptorTarballs.java b/task-4499/GenerateSampleBridgeDescriptorTarballs.java
new file mode 100755
index 0000000..a43fd73
--- /dev/null
+++ b/task-4499/GenerateSampleBridgeDescriptorTarballs.java
@@ -0,0 +1,266 @@
+import java.io.*;
+import java.util.*;
+import org.apache.commons.codec.*;
+import org.apache.commons.codec.digest.*;
+import org.apache.commons.codec.binary.*;
+
+/* Generate sample bridge descriptor tarball contents for metrics-db and
+ * BridgeDB load tests.  Accept an extracted, non-sanitized bridge
+ * descriptor tarball as input and generate sample tarball contents with
+ * multiples of bridges up to a given maximum multiplier as output.
+ * Descriptors are multiplied by overwriting the first four hex characters
+ * of bridge fingerprints with 0000, 0001, etc., keeping references
+ * between descriptors intact.
+ *
+ * NOTE THAT THE OUTPUT TARBALL CONTENTS ARE NOT SANITIZED!
+ *
+ * The changes are only sufficient to trick metrics-db and BridgeDB into
+ * thinking that bridges are distinct.  Descriptors may still contain
+ * original IP
+ * addresses in exit policies and other contact information.  Sanitized
+ * descriptors could not be used as input, because they may have skewed
+ * results too much. */
+public class GenerateSampleBridgeDescriptorTarballs {
+  public static void main(String[] args) throws Exception {
+    if (args.length != 3) {
+      System.err.println("Usage: java "
+          + GenerateSampleBridgeDescriptorTarballs.class.getName()
+          + " in-directory out-directory max-multiplier");
+      System.exit(1);
+    }
+    File inDirectory = new File(args[0]);
+    File outDirectory = new File(args[1]);
+    int maxMultiplier = Integer.parseInt(args[2]);
+    readDescriptors(inDirectory);
+    for (int multiplier = 1; multiplier <= maxMultiplier;
+        multiplier *= 2) {
+      writeDescriptors(new File(outDirectory, String.format("%04d",
+          multiplier)), multiplier);
+    }
+  }
+
+  private static void readDescriptors(File inDirectory) throws Exception {
+    readNetworkstatusBridges(new File(inDirectory,
+        "networkstatus-bridges"));
+    readBridgeDescriptors(new File(inDirectory, "bridge-descriptors"));
+    readCachedExtrainfos(new File(inDirectory, "cached-extrainfo"));
+    readCachedExtrainfos(new File(inDirectory, "cached-extrainfo.new"));
+  }
+
+  private static SortedMap<String, String> networkstatusEntries =
+      new TreeMap<String, String>();
+  private static void readNetworkstatusBridges(
+      File networkstatusBridgesFile) throws Exception {
+    BufferedReader br = new BufferedReader(new FileReader(
+        networkstatusBridgesFile));
+    String line, fingerprint = null, published = null;
+    StringBuilder sb = null;
+    while ((line = br.readLine()) != null) {
+      if (line.startsWith("r ")) {
+        if (sb != null) {
+          networkstatusEntries.put(fingerprint + " " + published,
+              sb.toString());
+        }
+        sb = new StringBuilder();
+        String[] parts = line.split(" ");
+        fingerprint = Hex.encodeHexString(Base64.decodeBase64(
+            parts[2] + "=")).toUpperCase();
+        published = parts[4] + " " + parts[5];
+      }
+      sb.append(line + "\n");
+    }
+    if (sb != null) {
+      networkstatusEntries.put(fingerprint + " " + published,
+          sb.toString());
+    }
+    br.close();
+  }
+
+  private static SortedMap<String, String> bridgeDescriptors =
+      new TreeMap<String, String>();
+  private static void readBridgeDescriptors(File bridgeDescriptorsFile)
+      throws Exception {
+    BufferedReader br = new BufferedReader(new FileReader(
+        bridgeDescriptorsFile));
+    String line, fingerprint = null, published = null;
+    StringBuilder sb = null;
+    while ((line = br.readLine()) != null) {
+      if (line.startsWith("@purpose ")) {
+        if (sb != null) {
+          bridgeDescriptors.put(fingerprint + " " + published,
+              sb.toString());
+        }
+        sb = new StringBuilder();
+      } else if (line.startsWith("published ")) {
+        published = line.substring("published ".length());
+      } else if (line.startsWith("opt fingerprint ")) {
+        fingerprint = line.substring("opt fingerprint ".length()).
+            replaceAll(" ", "");
+      }
+      sb.append(line + "\n");
+    }
+    if (sb != null) {
+      bridgeDescriptors.put(fingerprint + " " + published, sb.toString());
+    }
+    br.close();
+
+  }
+
+  private static SortedMap<String, String> cachedExtrainfos =
+      new TreeMap<String, String>();
+  private static void readCachedExtrainfos(File cachedExtrainfoFile)
+      throws Exception {
+    BufferedReader br = new BufferedReader(new FileReader(
+        cachedExtrainfoFile));
+    String line, fingerprint = null, published = null;
+    StringBuilder sb = null;
+    while ((line = br.readLine()) != null) {
+      if (line.startsWith("extra-info ")) {
+        if (sb != null) {
+          cachedExtrainfos.put(fingerprint + " " + published,
+              sb.toString());
+        }
+        sb = new StringBuilder();
+        fingerprint = line.split(" ")[2];
+      } else if (line.startsWith("published ")) {
+        published = line.substring("published ".length());
+      }
+      sb.append(line + "\n");
+    }
+    if (sb != null) {
+      cachedExtrainfos.put(fingerprint + " " + published, sb.toString());
+    }
+    br.close();
+  }
+
+  private static void writeDescriptors(File outDirectory, int multiplier)
+      throws Exception {
+    outDirectory.mkdirs();
+    for (File file : outDirectory.listFiles()) {
+      file.delete();
+    }
+    for (int i = 0; i < multiplier; i++) {
+      String fingerprintPrefix = String.format("%04x", i);
+      SortedMap<String, String> extraInfoDigests = writeCachedExtrainfos(
+          outDirectory, fingerprintPrefix);
+      SortedMap<String, String> descriptorDigests =
+          writeBridgeDescriptors(outDirectory, extraInfoDigests,
+          fingerprintPrefix);
+      writeNetworkstatusBridges(outDirectory, descriptorDigests,
+          fingerprintPrefix);
+    }
+  }
+
+  private static SortedMap<String, String> writeCachedExtrainfos(
+      File outDirectory, String fingerprintPrefix) throws Exception {
+    SortedMap<String, String> extraInfoDigests =
+        new TreeMap<String, String>();
+    BufferedWriter bw = new BufferedWriter(new FileWriter(new File(
+        outDirectory, "cached-extrainfo"), true));
+    for (Map.Entry<String, String> e : cachedExtrainfos.entrySet()) {
+      String fingerprintPublished = e.getKey();
+      String cachedExtrainfo = e.getValue();
+      BufferedReader br = new BufferedReader(new StringReader(
+          cachedExtrainfo));
+      String line;
+      StringBuilder sb = new StringBuilder();
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("extra-info ")) {
+          String[] parts = line.split(" ");
+          sb.append(parts[0] + " " + parts[1] + " " + fingerprintPrefix
+              + parts[2].substring(4) + "\n");
+        } else if (line.equals("router-signature")) {
+          sb.append(line + "\n");
+          String digest = DigestUtils.shaHex(sb.toString()).toUpperCase();
+          extraInfoDigests.put(fingerprintPublished, digest);
+        } else {
+          sb.append(line + "\n");
+        }
+      }
+      bw.write(sb.toString());
+    }
+    bw.close();
+    return extraInfoDigests;
+  }
+
+  private static SortedMap<String, String> writeBridgeDescriptors(
+      File outDirectory, SortedMap<String, String> extraInfoDigests,
+      String fingerprintPrefix) throws Exception {
+    SortedMap<String, String> descriptorDigests =
+        new TreeMap<String, String>();
+    BufferedWriter bw = new BufferedWriter(new FileWriter(new File(
+        outDirectory, "bridge-descriptors"), true));
+    for (Map.Entry<String, String> e : bridgeDescriptors.entrySet()) {
+      String fingerprintPublished = e.getKey();
+      String bridgeDescriptor = e.getValue();
+      BufferedReader br = new BufferedReader(new StringReader(
+          bridgeDescriptor));
+      String line;
+      StringBuilder sb = new StringBuilder();
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("@purpose ")) {
+        } else if (line.startsWith("opt fingerprint ")) {
+          sb.append("opt fingerprint " + fingerprintPrefix
+              + line.substring("opt fingerprint 0000".length()) + "\n");
+        } else if (line.startsWith("opt extra-info-digest ")) {
+          String extraInfoDigest = null;
+          if (extraInfoDigests.containsKey(fingerprintPublished)) {
+            extraInfoDigest = extraInfoDigests.get(fingerprintPublished);
+          } else {
+            extraInfoDigest = fingerprintPrefix
+                + line.split(" ")[2].substring(4);
+          }
+          sb.append("opt extra-info-digest " + extraInfoDigest + "\n");
+        } else if (line.equals("router-signature")) {
+          sb.append(line + "\n");
+          String digest = DigestUtils.shaHex(sb.toString()).toUpperCase();
+          descriptorDigests.put(fingerprintPublished, digest);
+        } else {
+          sb.append(line + "\n");
+        }
+      }
+      bw.write("@purpose bridge\n" + sb.toString());
+    }
+    bw.close();
+    return descriptorDigests;
+  }
+
+  private static void writeNetworkstatusBridges(File outDirectory,
+      SortedMap<String, String> descriptorDigests,
+      String fingerprintPrefix) throws Exception {
+    BufferedWriter bw = new BufferedWriter(new FileWriter(new File(
+        outDirectory, "networkstatus-bridges"), true));
+    for (Map.Entry<String, String> e : networkstatusEntries.entrySet()) {
+      String fingerprintPublished = e.getKey();
+      String networkstatusEntry = e.getValue();
+      BufferedReader br = new BufferedReader(new StringReader(
+          networkstatusEntry));
+      String line;
+      StringBuilder sb = new StringBuilder();
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("r ")) {
+          String[] parts = line.split(" ");
+          String fingerprint = parts[2], descriptorDigest = parts[3];
+          String newFingerprint = Base64.encodeBase64String(Hex.decodeHex(
+              (fingerprintPrefix + fingerprintPublished.split(" ")[0].
+              substring(4)).toCharArray())).substring(0, 27);
+          String newDescriptorDigest = null;
+          if (descriptorDigests.containsKey(fingerprintPublished)) {
+            newDescriptorDigest = Base64.encodeBase64String(Hex.decodeHex(
+                descriptorDigests.get(fingerprintPublished).
+                toCharArray())).substring(0, 27);
+          } else {
+            newDescriptorDigest = "AA" + descriptorDigest.substring(2);
+          }
+          sb.append("r " + parts[1] + " " + newFingerprint + " "
+              + newDescriptorDigest + " " + parts[4] + " " + parts[5]
+              + " " + parts[6] + " " + parts[7] + " " + parts[8] + "\n");
+        } else {
+          sb.append(line + "\n");
+        }
+      }
+      bw.write(sb.toString());
+    }
+    bw.close();
+  }
+}
+
diff --git a/task-4499/README b/task-4499/README
new file mode 100644
index 0000000..4bf9264
--- /dev/null
+++ b/task-4499/README
@@ -0,0 +1,56 @@
+1  Generating sample bridge descriptors
+=======================================
+
+This is a simple Java class to generate sample bridge descriptors for
+metrics-db and BridgeDB load tests.
+
+==========================================================================
+======== NOTE THAT THE OUTPUT TARBALL CONTENTS ARE NOT SANITIZED! ========
+==========================================================================
+
+The changes are only sufficient to trick metrics-db and BridgeDB into
+thinking that bridges are distinct.  Descriptors may still contain
+original IP addresses
+in exit policies and other contact information.  Sanitized descriptors
+could not be used as input, because they may have skewed results too much.
+
+Here's how you generate sample bridge descriptors from original
+descriptors.
+
+Extract a non-sanitized bridge descriptor tarball to in/, so that there
+are four files:
+
+    in/bridge-descriptors
+    in/cached-extrainfo.new
+    in/cached-extrainfo
+    in/networkstatus-bridges
+
+Download the Apache Commons Codec .jar file and put in the root directory,
+e.g.,
+
+    commons-codec-1.4.jar
+
+Compile the Java class:
+
+    $ javac -cp commons-codec-1.4.jar \
+      GenerateSampleBridgeDescriptorTarballs.java
+
+Run the Java class to generate sample data up to a factor of 256 times the
+descriptors in the in/ directory:
+
+    $ java -cp .:commons-codec-1.4.jar \
+      GenerateSampleBridgeDescriptorTarballs in out 256
+
+Find the generated sample data in the out/ directory.
+
+
+2  Building the tech report
+===========================
+
+Generate the graph:
+
+  $ R --slave -f bridge-scaling.R
+
+Build the PDF:
+
+  $ pdflatex bridge-scaling.tex
+
diff --git a/task-4499/bridge-scaling.R b/task-4499/bridge-scaling.R
new file mode 100644
index 0000000..972f240
--- /dev/null
+++ b/task-4499/bridge-scaling.R
@@ -0,0 +1,23 @@
library(ggplot2)

# Load the raw measurements and relabel each series with the
# human-readable facet title used in the plot.
raw <- read.csv("bridge-scaling.csv", header = TRUE)
relabel <- function(key, title) {
  series <- raw[raw$variable == key, ]
  data.frame(x = series$x, y = series$y, colour = series$colour,
    variable = title)
}
d <- rbind(
  relabel("1tarball", "Tarball size in GiB/day"),
  relabel("2bridgedb", "BridgeDB time in min"),
  relabel("3metricsdb", "metrics-db time in min"))
# One vertically stacked facet per series, free y scales; the current
# network size (838 bridges) is highlighted in red.
ggplot(d, aes(x = x, y = y, colour = colour)) +
geom_line(colour = "black") +
geom_point() +
facet_grid(variable ~ ., scales = "free_y") +
scale_x_continuous(name = "\nRunning bridges (2012-01-31 = 838, red)") +
scale_y_continuous(name = "") +
scale_colour_manual(name = "", value = c("black", "red")) +
opts(legend.position = "none",
  title = "Scalability of Tor's bridge infrastructure\n")
ggsave("bridge-scaling.png", width = 7, height = 6, dpi = 100)
+
diff --git a/task-4499/bridge-scaling.csv b/task-4499/bridge-scaling.csv
new file mode 100644
index 0000000..24bbbf3
--- /dev/null
+++ b/task-4499/bridge-scaling.csv
@@ -0,0 +1,24 @@
+x,y,variable,colour
+NA,0,1tarball,black
+838,0.103,1tarball,red
+1676,0.206,1tarball,black
+3352,0.412,1tarball,black
+6704,0.843,1tarball,black
+13408,1.64,1tarball,black
+26816,3.281,1tarball,black
+53632,6.562,1tarball,black
+NA,0,2bridgedb,black
+838,0.0833,2bridgedb,red
+1676,0.1833,2bridgedb,black
+3352,0.3833,2bridgedb,black
+6704,0.7666,2bridgedb,black
+13408,1.5833,2bridgedb,black
+26816,3.3,2bridgedb,black
+53632,6.283,2bridgedb,black
+NA,0,3metricsdb,black
+838,0.583,3metricsdb,red
+1676,1.366,3metricsdb,black
+3352,3.816,3metricsdb,black
+6704,7.9,3metricsdb,black
+13408,20.216,3metricsdb,black
+26816,44.75,3metricsdb,black
diff --git a/task-4499/bridge-scaling.tex b/task-4499/bridge-scaling.tex
new file mode 100644
index 0000000..14dae1a
--- /dev/null
+++ b/task-4499/bridge-scaling.tex
@@ -0,0 +1,117 @@
+\documentclass{article}
+\usepackage{url}
+\usepackage[pdftex]{graphicx}
+\usepackage{graphics}
+\usepackage{color}
+\begin{document}
+\title{Investigating scaling points to handle more bridges}
+\author{Karsten Loesing\\{\tt karsten at torproject.org}}
+
+\maketitle
+
+\section{Introduction}
+
+The current bridge infrastructure relies on a central bridge authority to
+collect, distribute, and publish bridge relay descriptors.
+We believe the current infrastructure can handle up to 10,000 bridges.
+
+The scaling points involve the database of descriptors, the metrics portal
+and its ability to handle this many descriptors for analysis, and the
+reachability testing part of the code for the bridge authority.
+We should investigate scaling points to handle more than 10,000 bridge
+descriptors.
+
+\section{Early results}
+
+We started this analysis by writing a small tool to generate sample data
+for BridgeDB and metrics-db.
+This tool takes the contents from one of Tonga's bridge tarball as input,
+copies them a given number of times, and overwrites the first two bytes of
+relay fingerprints in every copy with 0000, 0001, etc.
+The tool also fixes references between network statuses, server
+descriptors, and extra-info descriptors.
+This is sufficient to trick BridgeDB and metrics-db into thinking that
+relays in the copies are distinct relays.
+We used the tool to generate tarballs with 2, 4, 8, 16, 32, and 64 times
+as many bridge descriptors in them.
+
+In the next step we fed the tarballs into BridgeDB and metrics-db.
+BridgeDB reads the network statuses and server descriptors from the latest
+tarball and writes them to a local database.
+metrics-db sanitizes two half-hourly created tarballs every hour,
+establishes an internal mapping between descriptors, and writes sanitized
+descriptors with fixed references to disk.
+
+Figure~\ref{fig:bridgescaling} shows the results.
+
+\begin{figure}[t]
+\includegraphics[width=\textwidth]{bridge-scaling.png}
+\caption{Scalability of Tor's bridge infrastructure.}
+\label{fig:bridgescaling}
+\end{figure}
+
+The upper graph shows how the tarballs grow in size with more bridge
+descriptors in them.
+This growth is, unsurprisingly, linear.
+One thing to keep in mind here is that bandwidth and storage requirements
+to the hosts transferring and storing bridge tarballs are growing with the
+tarballs.
+We'll want to pay extra attention to disk space running out on those
+hosts.
+
+The middle graph shows how long BridgeDB takes to load descriptors from a
+tarball.
+This graph is linear, too, which indicates that BridgeDB can handle an
+increase in the number of bridges pretty well.
+One thing we couldn't check is whether BridgeDB's ability to serve client
+requests is in any way affected during the descriptor import.
+We assume it'll be fine.
+We should ask Aaron if there are other things in BridgeDB that we
+overlooked that may not scale.
+
+The lower graph shows how metrics-db can or cannot handle more bridges.
+The growth is slightly worse than linear.
+In any case, the absolute time required to handle 25K bridges is worrisome
+(we didn't try 50K).
+metrics-db runs in an hourly cronjob, and if that cronjob doesn't finish
+within 1 hour, we cannot start the next run and will be missing some data.
+We might have to sanitize bridge descriptors in a different thread or
+process than the one that fetches all the other metrics data.
+We can also look into other Java libraries to handle .gz-compressed files
+that are faster than the one we're using.
+So, we can probably handle 25K bridges somehow, and maybe even 50K.
+Somehow.
+
+Finally, note that we left out the most important part of this analysis:
+can Tonga, or more generally, a single bridge authority handle this
+increase in bridges?
+We're not sure how to test such a setting, or at least without running 50K
+bridges in a private network.
+We could imagine this requires some more sophisticated sample data
+generation including getting the crypto right and then talking to Tonga's
+DirPort.
+If there's an easy way to test this, we'll do it.
+If not, we can always hope for the best.
+What could possibly go wrong?
+
+\section{Work left to do}
+
+If we end up with way too many bridges, here are a few things we'll want
+to look at updating:
+
+\begin{itemize}
+\item Tonga still does a reachability test on each bridge every 21 minutes
+or so.
+Eventually the number of TLS handshakes it's doing will overwhelm its cpu.
+\item The tarballs we make every half hour have substantial overlap.
+If we have tens of thousands of descriptors, we would want to get smarter
+at sending diffs over to BridgeDB.
+\item Somebody should check whether BridgeDB's interaction with users
+freezes while it's reading a new set of data.
+\end{itemize}
+
+%\bibliography{bridge-scaling}
+%\bibliographystyle{plain}
+
+\end{document}
+



More information about the tor-commits mailing list