[metrics-tasks/master] Add #4499 sources and tech report draft.

commit 9b1e4846c3d602f8097f2700cf228c3df3765e98 Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Mon Feb 6 15:54:16 2012 +0100 Add #4499 sources and tech report draft. --- task-4499/.gitignore | 9 + .../GenerateSampleBridgeDescriptorTarballs.java | 266 ++++++++++++++++++++ task-4499/README | 56 ++++ task-4499/bridge-scaling.R | 23 ++ task-4499/bridge-scaling.csv | 24 ++ task-4499/bridge-scaling.tex | 117 +++++++++ 6 files changed, 495 insertions(+), 0 deletions(-) diff --git a/task-4499/.gitignore b/task-4499/.gitignore new file mode 100644 index 0000000..44c09e8 --- /dev/null +++ b/task-4499/.gitignore @@ -0,0 +1,9 @@ +*.class +*.jar +in/ +out/ +*.png +*.aux +*.log +*.pdf + diff --git a/task-4499/GenerateSampleBridgeDescriptorTarballs.java b/task-4499/GenerateSampleBridgeDescriptorTarballs.java new file mode 100755 index 0000000..a43fd73 --- /dev/null +++ b/task-4499/GenerateSampleBridgeDescriptorTarballs.java @@ -0,0 +1,266 @@ +import java.io.*; +import java.util.*; +import org.apache.commons.codec.*; +import org.apache.commons.codec.digest.*; +import org.apache.commons.codec.binary.*; + +/* Generate sample bridge descriptor tarball contents for metrics-db and + * BridgeDB load tests. Accept an extracted, non-sanitized bridge + * descriptor tarball as input and generate sample tarball contents with + * multiples of bridges up to a given maximum multiplier as output. + * Descriptors are multiplied by overwriting the first four hex characters + * of bridge fingerprints with 0000, 0001, etc., keeping references + * between descriptors intact. + * + * NOTE THAT THE OUTPUT TARBALL CONTENTS ARE NOT SANITIZED! + * + * The changes are only sufficient to trick metrics-db and BridgeDB that + * bridges are distinct. Descriptors may still contain original IP + * addresses in exit policies and other contact information. Sanitized + * descriptors could not be used as input, because they may have skewed + * results too much. 
*/ +public class GenerateSampleBridgeDescriptorTarballs { + public static void main(String[] args) throws Exception { + if (args.length != 3) { + System.err.println("Usage: java " + + GenerateSampleBridgeDescriptorTarballs.class.getName() + + " in-directory out-directory max-multiplier"); + System.exit(1); + } + File inDirectory = new File(args[0]); + File outDirectory = new File(args[1]); + int maxMultiplier = Integer.parseInt(args[2]); + readDescriptors(inDirectory); + for (int multiplier = 1; multiplier <= maxMultiplier; + multiplier *= 2) { + writeDescriptors(new File(outDirectory, String.format("%04d", + multiplier)), multiplier); + } + } + + private static void readDescriptors(File inDirectory) throws Exception { + readNetworkstatusBridges(new File(inDirectory, + "networkstatus-bridges")); + readBridgeDescriptors(new File(inDirectory, "bridge-descriptors")); + readCachedExtrainfos(new File(inDirectory, "cached-extrainfo")); + readCachedExtrainfos(new File(inDirectory, "cached-extrainfo.new")); + } + + private static SortedMap<String, String> networkstatusEntries = + new TreeMap<String, String>(); + private static void readNetworkstatusBridges( + File networkstatusBridgesFile) throws Exception { + BufferedReader br = new BufferedReader(new FileReader( + networkstatusBridgesFile)); + String line, fingerprint = null, published = null; + StringBuilder sb = null; + while ((line = br.readLine()) != null) { + if (line.startsWith("r ")) { + if (sb != null) { + networkstatusEntries.put(fingerprint + " " + published, + sb.toString()); + } + sb = new StringBuilder(); + String[] parts = line.split(" "); + fingerprint = Hex.encodeHexString(Base64.decodeBase64( + parts[2] + "=")).toUpperCase(); + published = parts[4] + " " + parts[5]; + } + sb.append(line + "\n"); + } + if (sb != null) { + networkstatusEntries.put(fingerprint + " " + published, + sb.toString()); + } + br.close(); + } + + private static SortedMap<String, String> bridgeDescriptors = + new TreeMap<String, 
String>(); + private static void readBridgeDescriptors(File bridgeDescriptorsFile) + throws Exception { + BufferedReader br = new BufferedReader(new FileReader( + bridgeDescriptorsFile)); + String line, fingerprint = null, published = null; + StringBuilder sb = null; + while ((line = br.readLine()) != null) { + if (line.startsWith("@purpose ")) { + if (sb != null) { + bridgeDescriptors.put(fingerprint + " " + published, + sb.toString()); + } + sb = new StringBuilder(); + } else if (line.startsWith("published ")) { + published = line.substring("published ".length()); + } else if (line.startsWith("opt fingerprint ")) { + fingerprint = line.substring("opt fingerprint ".length()). + replaceAll(" ", ""); + } + sb.append(line + "\n"); + } + if (sb != null) { + bridgeDescriptors.put(fingerprint + " " + published, sb.toString()); + } + br.close(); + + } + + private static SortedMap<String, String> cachedExtrainfos = + new TreeMap<String, String>(); + private static void readCachedExtrainfos(File cachedExtrainfoFile) + throws Exception { + BufferedReader br = new BufferedReader(new FileReader( + cachedExtrainfoFile)); + String line, fingerprint = null, published = null; + StringBuilder sb = null; + while ((line = br.readLine()) != null) { + if (line.startsWith("extra-info ")) { + if (sb != null) { + cachedExtrainfos.put(fingerprint + " " + published, + sb.toString()); + } + sb = new StringBuilder(); + fingerprint = line.split(" ")[2]; + } else if (line.startsWith("published ")) { + published = line.substring("published ".length()); + } + sb.append(line + "\n"); + } + if (sb != null) { + cachedExtrainfos.put(fingerprint + " " + published, sb.toString()); + } + br.close(); + } + + private static void writeDescriptors(File outDirectory, int multiplier) + throws Exception { + outDirectory.mkdirs(); + for (File file : outDirectory.listFiles()) { + file.delete(); + } + for (int i = 0; i < multiplier; i++) { + String fingerprintPrefix = String.format("%04x", i); + 
SortedMap<String, String> extraInfoDigests = writeCachedExtrainfos( + outDirectory, fingerprintPrefix); + SortedMap<String, String> descriptorDigests = + writeBridgeDescriptors(outDirectory, extraInfoDigests, + fingerprintPrefix); + writeNetworkstatusBridges(outDirectory, descriptorDigests, + fingerprintPrefix); + } + } + + private static SortedMap<String, String> writeCachedExtrainfos( + File outDirectory, String fingerprintPrefix) throws Exception { + SortedMap<String, String> extraInfoDigests = + new TreeMap<String, String>(); + BufferedWriter bw = new BufferedWriter(new FileWriter(new File( + outDirectory, "cached-extrainfo"), true)); + for (Map.Entry<String, String> e : cachedExtrainfos.entrySet()) { + String fingerprintPublished = e.getKey(); + String cachedExtrainfo = e.getValue(); + BufferedReader br = new BufferedReader(new StringReader( + cachedExtrainfo)); + String line; + StringBuilder sb = new StringBuilder(); + while ((line = br.readLine()) != null) { + if (line.startsWith("extra-info ")) { + String[] parts = line.split(" "); + sb.append(parts[0] + " " + parts[1] + " " + fingerprintPrefix + + parts[2].substring(4) + "\n"); + } else if (line.equals("router-signature")) { + sb.append(line + "\n"); + String digest = DigestUtils.shaHex(sb.toString()).toUpperCase(); + extraInfoDigests.put(fingerprintPublished, digest); + } else { + sb.append(line + "\n"); + } + } + bw.write(sb.toString()); + } + bw.close(); + return extraInfoDigests; + } + + private static SortedMap<String, String> writeBridgeDescriptors( + File outDirectory, SortedMap<String, String> extraInfoDigests, + String fingerprintPrefix) throws Exception { + SortedMap<String, String> descriptorDigests = + new TreeMap<String, String>(); + BufferedWriter bw = new BufferedWriter(new FileWriter(new File( + outDirectory, "bridge-descriptors"), true)); + for (Map.Entry<String, String> e : bridgeDescriptors.entrySet()) { + String fingerprintPublished = e.getKey(); + String bridgeDescriptor = 
e.getValue(); + BufferedReader br = new BufferedReader(new StringReader( + bridgeDescriptor)); + String line; + StringBuilder sb = new StringBuilder(); + while ((line = br.readLine()) != null) { + if (line.startsWith("@purpose ")) { + } else if (line.startsWith("opt fingerprint ")) { + sb.append("opt fingerprint " + fingerprintPrefix + + line.substring("opt fingerprint 0000".length()) + "\n"); + } else if (line.startsWith("opt extra-info-digest ")) { + String extraInfoDigest = null; + if (extraInfoDigests.containsKey(fingerprintPublished)) { + extraInfoDigest = extraInfoDigests.get(fingerprintPublished); + } else { + extraInfoDigest = fingerprintPrefix + + line.split(" ")[2].substring(4); + } + sb.append("opt extra-info-digest " + extraInfoDigest + "\n"); + } else if (line.equals("router-signature")) { + sb.append(line + "\n"); + String digest = DigestUtils.shaHex(sb.toString()).toUpperCase(); + descriptorDigests.put(fingerprintPublished, digest); + } else { + sb.append(line + "\n"); + } + } + bw.write("@purpose bridge\n" + sb.toString()); + } + bw.close(); + return descriptorDigests; + } + + private static void writeNetworkstatusBridges(File outDirectory, + SortedMap<String, String> descriptorDigests, + String fingerprintPrefix) throws Exception { + BufferedWriter bw = new BufferedWriter(new FileWriter(new File( + outDirectory, "networkstatus-bridges"), true)); + for (Map.Entry<String, String> e : networkstatusEntries.entrySet()) { + String fingerprintPublished = e.getKey(); + String networkstatusEntry = e.getValue(); + BufferedReader br = new BufferedReader(new StringReader( + networkstatusEntry)); + String line; + StringBuilder sb = new StringBuilder(); + while ((line = br.readLine()) != null) { + if (line.startsWith("r ")) { + String[] parts = line.split(" "); + String fingerprint = parts[2], descriptorDigest = parts[3]; + String newFingerprint = Base64.encodeBase64String(Hex.decodeHex( + (fingerprintPrefix + fingerprintPublished.split(" ")[0]. 
+ substring(4)).toCharArray())).substring(0, 27); + String newDescriptorDigest = null; + if (descriptorDigests.containsKey(fingerprintPublished)) { + newDescriptorDigest = Base64.encodeBase64String(Hex.decodeHex( + descriptorDigests.get(fingerprintPublished). + toCharArray())).substring(0, 27); + } else { + newDescriptorDigest = "AA" + descriptorDigest.substring(2); + } + sb.append("r " + parts[1] + " " + newFingerprint + " " + + newDescriptorDigest + " " + parts[4] + " " + parts[5] + + " " + parts[6] + " " + parts[7] + " " + parts[8] + "\n"); + } else { + sb.append(line + "\n"); + } + } + bw.write(sb.toString()); + } + bw.close(); + } +} + diff --git a/task-4499/README b/task-4499/README new file mode 100644 index 0000000..4bf9264 --- /dev/null +++ b/task-4499/README @@ -0,0 +1,56 @@ +1 Generating sample bridge descriptors +======================================= + +This is a simple Java class to generate sample bridge descriptors for +metrics-db and BridgeDB load tests. + +========================================================================== +======== NOTE THAT THE OUTPUT TARBALL CONTENTS ARE NOT SANITIZED! ======== +========================================================================== + +The changes are only sufficient to trick metrics-db and BridgeDB that +bridges are distinct. Descriptors may still contain original IP addresses +in exit policies and other contact information. Sanitized descriptors +could not be used as input, because they may have skewed results too much. + +Here's how you generate sample bridge descriptors from original +descriptors. 
+ +Extract a non-sanitized bridge descriptor tarball to in/, so that there +are four files: + + in/bridge-descriptors + in/cached-extrainfo.new + in/cached-extrainfo + in/networkstatus-bridges + +Download the Apache Commons Codec .jar file and put in the root directory, +e.g., + + commons-codec-1.4.jar + +Compile the Java class: + + $ javac -cp commons-codec-1.4.jar \ + GenerateSampleBridgeDescriptorTarballs.java + +Run the Java class to generate sample data up to a factor of 256 times the +descriptors in the in/ directory: + + $ java -cp .:commons-codec-1.4.jar \ + GenerateSampleBridgeDescriptorTarballs in out 256 + +Find the generated sample data in the out/ directory. + + +2 Building the tech report +=========================== + +Generate the graph: + + $ R --slave -f bridge-scaling.R + +Build the PDF: + + $ pdflatex bridge-scaling.tex + diff --git a/task-4499/bridge-scaling.R b/task-4499/bridge-scaling.R new file mode 100644 index 0000000..972f240 --- /dev/null +++ b/task-4499/bridge-scaling.R @@ -0,0 +1,23 @@ +library(ggplot2) +d <- read.csv("bridge-scaling.csv", header = TRUE) +t <- d[d$variable == "1tarball", ] +b <- d[d$variable == "2bridgedb", ] +m <- d[d$variable == "3metricsdb", ] +d <- rbind( + data.frame(x = t$x, y = t$y, colour = t$colour, + variable = "Tarball size in GiB/day"), + data.frame(x = b$x, y = b$y, colour = b$colour, + variable = "BridgeDB time in min"), + data.frame(x = m$x, y = m$y, colour = m$colour, + variable = "metrics-db time in min")) +ggplot(d, aes(x = x, y = y, colour = colour)) + +geom_line(colour = "black") + +geom_point() + +facet_grid(variable ~ ., scales = "free_y") + +scale_x_continuous(name = "\nRunning bridges (2012-01-31 = 838, red)") + +scale_y_continuous(name = "") + +scale_colour_manual(name = "", value = c("black", "red")) + +opts(legend.position = "none", + title = "Scalability of Tor's bridge infrastructure\n") +ggsave("bridge-scaling.png", width = 7, height = 6, dpi = 100) + diff --git 
a/task-4499/bridge-scaling.csv b/task-4499/bridge-scaling.csv new file mode 100644 index 0000000..24bbbf3 --- /dev/null +++ b/task-4499/bridge-scaling.csv @@ -0,0 +1,24 @@ +x,y,variable,colour +NA,0,1tarball,black +838,0.103,1tarball,red +1676,0.206,1tarball,black +3352,0.412,1tarball,black +6704,0.843,1tarball,black +13408,1.64,1tarball,black +26816,3.281,1tarball,black +53632,6.562,1tarball,black +NA,0,2bridgedb,black +838,0.0833,2bridgedb,red +1676,0.1833,2bridgedb,black +3352,0.3833,2bridgedb,black +6704,0.7666,2bridgedb,black +13408,1.5833,2bridgedb,black +26816,3.3,2bridgedb,black +53632,6.283,2bridgedb,black +NA,0,3metricsdb,black +838,0.583,3metricsdb,red +1676,1.366,3metricsdb,black +3352,3.816,3metricsdb,black +6704,7.9,3metricsdb,black +13408,20.216,3metricsdb,black +26816,44.75,3metricsdb,black diff --git a/task-4499/bridge-scaling.tex b/task-4499/bridge-scaling.tex new file mode 100644 index 0000000..14dae1a --- /dev/null +++ b/task-4499/bridge-scaling.tex @@ -0,0 +1,117 @@ +\documentclass{article} +\usepackage{url} +\usepackage[pdftex]{graphicx} +\usepackage{graphics} +\usepackage{color} +\begin{document} +\title{Investigating scaling points to handle more bridges} +\author{Karsten Loesing\\{\tt karsten@torproject.org}} + +\maketitle + +\section{Introduction} + +The current bridge infrastructure relies on a central bridge authority to +collect, distribute, and publish bridge relay descriptors. +We believe the current infrastructure can handle up to 10,000 bridges. + +The scaling points involve the database of descriptors, the metrics portal +and its ability to handle this many descriptors for analysis, and the +reachability testing part of the code for the bridge authority. +We should investigate scaling points to handle more than 10,000 bridge +descriptors. + +\section{Early results} + +We started this analysis by writing a small tool to generate sample data +for BridgeDB and metrics-db. 
+This tool takes the contents from one of Tonga's bridge tarballs as input, +copies them a given number of times, and overwrites the first two bytes of +relay fingerprints in every copy with 0000, 0001, etc. +The tool also fixes references between network statuses, server +descriptors, and extra-info descriptors. +This is sufficient to trick BridgeDB and metrics-db into thinking that +relays in the copies are distinct relays. +We used the tool to generate tarballs with 2, 4, 8, 16, 32, and 64 times +as many bridge descriptors in them. + +In the next step we fed the tarballs into BridgeDB and metrics-db. +BridgeDB reads the network statuses and server descriptors from the latest +tarball and writes them to a local database. +metrics-db sanitizes two half-hourly created tarballs every hour, +establishes an internal mapping between descriptors, and writes sanitized +descriptors with fixed references to disk. + +Figure~\ref{fig:bridgescaling} shows the results. + +\begin{figure}[t] +\includegraphics[width=\textwidth]{bridge-scaling.png} +%\caption{} +\label{fig:bridgescaling} +\end{figure} + +The upper graph shows how the tarballs grow in size with more bridge +descriptors in them. +This growth is, unsurprisingly, linear. +One thing to keep in mind here is that bandwidth and storage requirements +for the hosts transferring and storing bridge tarballs are growing with the +tarballs. +We'll want to pay extra attention to disk space running out on those +hosts. + +The middle graph shows how long BridgeDB takes to load descriptors from a +tarball. +This graph is linear, too, which indicates that BridgeDB can handle an +increase in the number of bridges pretty well. +One thing we couldn't check is whether BridgeDB's ability to serve client +requests is in any way affected during the descriptor import. +We assume it'll be fine. +We should ask Aaron if there are other things in BridgeDB that we +overlooked that may not scale. 
+ +The lower graph shows how metrics-db can or cannot handle more bridges. +The growth is slightly worse than linear. +In any case, the absolute time required to handle 25K bridges is worrisome +(we didn't try 50K). +metrics-db runs in an hourly cronjob, and if that cronjob doesn't finish +within 1 hour, we cannot start the next run and will be missing some data. +We might have to sanitize bridge descriptors in a different thread or +process than the one that fetches all the other metrics data. +We can also look into other Java libraries to handle .gz-compressed files +that are faster than the one we're using. +So, we can probably handle 25K bridges somehow, and maybe even 50K. +Somehow. + +Finally, note that we left out the most important part of this analysis: +can Tonga, or more generally, a single bridge authority handle this +increase in bridges? +We're not sure how to test such a setting, or at least without running 50K +bridges in a private network. +We could imagine this requires some more sophisticated sample data +generation including getting the crypto right and then talking to Tonga's +DirPort. +If there's an easy way to test this, we'll do it. +If not, we can always hope for the best. +What can go wrong. + +\section{Work left to do} + +If we end up with way too many bridges, here are a few things we'll want +to look at updating: + +\begin{itemize} +\item Tonga still does a reachability test on each bridge every 21 minutes +or so. +Eventually the number of TLS handshakes it's doing will overwhelm its cpu. +\item The tarballs we make every half hour have substantial overlap. +If we have tens of thousands of descriptors, we would want to get smarter +at sending diffs over to bridgedb. +\item Somebody should check whether BridgeDB's interaction with users +freezes while it's reading a new set of data. +\end{itemize} + +%\bibliography{bridge-scaling} +%\bibliographystyle{plain} + +\end{document} +
participants (1)
-
karsten@torproject.org