commit 1e8dbd3857c58e26f57973da4d64170eae0e1be6 Author: Karsten Loesing karsten.loesing@gmx.net Date: Mon Mar 14 14:15:33 2011 +0100
Add bridge pool assignment file parser for #2680. --- task-2680/.gitignore | 2 + task-2680/ProcessSanitizedAssignments.java | 102 ++++++++++++++++++++++++++++ task-2680/README | 40 ++++++++++- task-2680/verify.R | 6 ++ 4 files changed, 147 insertions(+), 3 deletions(-)
diff --git a/task-2680/.gitignore b/task-2680/.gitignore index 6378a79..b394fe6 100644 --- a/task-2680/.gitignore +++ b/task-2680/.gitignore @@ -4,4 +4,6 @@ bridge-descriptors/ commons-codec-1.4.jar consensuses/ *.tar.bz2 +*.swp +bridge-pool-assignments/
// Patch header of the hunk that adds this file (kept verbatim):
//   diff --git a/task-2680/ProcessSanitizedAssignments.java b/task-2680/ProcessSanitizedAssignments.java
//   new file mode 100644
//   index 0000000..9289aa1
//   --- /dev/null
//   +++ b/task-2680/ProcessSanitizedAssignments.java
//   @@ -0,0 +1,102 @@
import java.io.*;
import java.util.*;

/**
 * Converts sanitized bridge pool assignment files into one CSV file.
 *
 * <p>Usage: {@code java ProcessSanitizedAssignments <dir>}. Every regular
 * file found below {@code <dir>} is read line by line. A line starting
 * with {@code "bridge-pool-assignment "} sets the assignment time for the
 * lines that follow it; every other non-blank line is expected to look
 * like {@code <fingerprint> <type> [key=value ...]} with the keys
 * {@code ring}, {@code port}, {@code flag} and {@code bucket}. The result
 * is written to {@code assignments.csv} in the current working directory.
 */
public class ProcessSanitizedAssignments {

  /** Prefix of the line that carries the assignment time. */
  private static final String HEADER_PREFIX = "bridge-pool-assignment ";

  /** Name of the CSV file written to the current working directory. */
  private static final String OUTPUT_FILE = "assignments.csv";

  /**
   * Program entry point.
   *
   * @param args exactly one element: the file or directory to parse
   * @throws IOException if an input file cannot be read or the output
   *     file cannot be written
   */
  public static void main(String[] args) throws IOException {

    /* Validate command-line arguments. */
    if (args.length != 1 || !new File(args[0]).exists()) {
      System.out.println("Usage: java ProcessSanitizedAssignments <dir>");
      System.exit(1);
    }

    /* Find all files that we should parse. */
    System.out.println("Creating list of files we should parse.");
    SortedMap<String, File> assignments =
        findAssignmentFiles(new File(args[0]));
    System.out.println("We found " + assignments.size() + " bridge pool "
        + "assignment files.");

    /* Parse assignments. */
    if (!assignments.isEmpty()) {
      parseAssignments(assignments.values());
    }

    /* This is it. */
    System.out.println("Terminating.");
  }

  /**
   * Collects all regular files at or below {@code root}, keyed and sorted
   * by their plain file name. Somewhat fragile: two files with the same
   * name in different directories replace each other.
   */
  private static SortedMap<String, File> findAssignmentFiles(File root) {
    SortedMap<String, File> assignments = new TreeMap<String, File>();
    Deque<File> pending = new ArrayDeque<File>();
    pending.push(root);
    while (!pending.isEmpty()) {
      File file = pending.pop();
      if (!file.isDirectory()) {
        assignments.put(file.getName(), file);
        continue;
      }
      /* listFiles() returns null if the directory cannot be read. */
      File[] children = file.listFiles();
      if (children == null) {
        System.out.println("Could not list directory '" + file
            + "'. Skipping.");
        continue;
      }
      for (File child : children) {
        pending.push(child);
      }
    }
    return assignments;
  }

  /**
   * Parses all given files in iteration order, writes the CSV file and
   * prints a progress line whenever another tenth of the files is done.
   */
  private static void parseAssignments(Collection<File> files)
      throws IOException {
    System.out.println("Parsing bridge pool assignment files.");
    int parsedAssignments = 0, totalAssignments = files.size(),
        writtenOutputLines = 1, reportedTenths = 0;
    long started = System.currentTimeMillis();
    BufferedWriter bw = new BufferedWriter(new FileWriter(OUTPUT_FILE));
    try {
      bw.write("assignment,fingerprint,type,ring,port,flag,bucket\n");
      for (File file : files) {
        writtenOutputLines += parseFile(file, bw);
        parsedAssignments++;

        /* Report by completed tenths rather than by a fixed step of
         * total / 10, which is zero for fewer than ten files and
         * overshoots 100% when the total is not a multiple of ten. */
        int tenthsDone = parsedAssignments * 10 / totalAssignments;
        if (tenthsDone > reportedTenths) {
          reportedTenths = tenthsDone;
          double fractionDone = (double) parsedAssignments
              / (double) totalAssignments;
          double fractionLeft = 1.0D - fractionDone;
          long now = System.currentTimeMillis();
          double millisLeft = ((double) (now - started)) * fractionLeft
              / fractionDone;
          long secondsLeft = (long) millisLeft / 1000L;
          System.out.println(" " + tenthsDone + "0% done, " + secondsLeft
              + " seconds left.");
        }
      }
    } finally {
      /* Close the output even if parsing fails half-way. */
      bw.close();
    }
    System.out.println("Parsed " + parsedAssignments + " bridge pool "
        + "assignment files and wrote " + writtenOutputLines + " lines "
        + "to assignments.csv.");
  }

  /**
   * Parses a single assignment file and appends one CSV line per bridge
   * line to {@code bw}. Blank lines are skipped.
   *
   * @return number of CSV lines written for this file
   */
  private static int parseFile(File file, BufferedWriter bw)
      throws IOException {
    int writtenLines = 0;
    BufferedReader br = new BufferedReader(new FileReader(file));
    try {
      String line, assignmentTime = null;
      while ((line = br.readLine()) != null) {
        if (line.startsWith(HEADER_PREFIX)) {
          assignmentTime = line.substring(HEADER_PREFIX.length());
        } else if (line.trim().length() > 0) {
          bw.write(toCsvLine(assignmentTime, line));
          writtenLines++;
        }
      }
    } finally {
      br.close();
    }
    return writtenLines;
  }

  /**
   * Converts one bridge line into a newline-terminated CSV line. Values
   * that are absent are written as {@code NA}. Malformed lines and
   * unknown keywords terminate the program with exit code 1.
   */
  private static String toCsvLine(String assignmentTime, String line) {
    String[] parts = line.split(" ");
    if (parts.length < 2) {
      fail("Malformed line '" + line + "'. Please check. Exiting.");
    }
    String fingerprint = parts[0];
    String type = parts[1];
    String ring = null, port = null, flag = null, bucket = null;
    for (int i = 2; i < parts.length; i++) {
      int separator = parts[i].indexOf('=');
      if (separator < 0) {
        fail("Malformed line '" + line + "'. Please check. Exiting.");
      }
      String key = parts[i].substring(0, separator);
      String value = parts[i].substring(separator + 1);
      if (key.equals("ring")) {
        ring = value;
      } else if (key.equals("port")) {
        port = value;
      } else if (key.equals("flag")) {
        flag = value;
      } else if (key.equals("bucket")) {
        bucket = value;
      } else {
        fail("Unknown keyword in line '" + line
            + "'. Please check. Exiting.");
      }
    }
    /* A bridge line that precedes any header line has no assignment
     * time; write NA like for the other absent columns. */
    return (assignmentTime != null ? assignmentTime : "NA") + ","
        + fingerprint + "," + type + ","
        + (ring != null ? ring : "NA") + ","
        + (port != null ? port : "NA") + ","
        + (flag != null ? flag : "NA") + ","
        + (bucket != null ? bucket : "NA") + "\n";
  }

  /** Prints {@code message} and terminates the JVM with exit code 1. */
  private static void fail(String message) {
    System.out.println(message);
    System.exit(1);
  }
}

// Patch header of the next hunk in the same commit (kept verbatim):
//   diff --git a/task-2680/README b/task-2680/README
//   index a00856f..65d8b85 100644
//   --- a/task-2680/README
//   +++ b/task-2680/README
//   @@ -6,9 +6,9 @@ This ticket contains Java and R code to
This README has a separate section for each Java or R code snippet.
-The Java applications produce three output formats containing bridge -descriptors, bridge status lines, and hashed relay identities. The data -formats are described below. +The Java applications produce four output formats containing bridge +descriptors, bridge status lines, bridge pool assignments, and hashed +relay identities. The data formats are described below.
--------------------------------------------------------------------------
@@ -33,6 +33,23 @@ ProcessSanitizedBridges.java
--------------------------------------------------------------------------
+ProcessSanitizedAssignments.java + + - Download sanitized bridge pool assignments from the metrics website, + e.g., https://metrics.torproject.org/data/bridge-pool-assignments-2011-01.tar.bz2 + and extract them in a local directory, e.g., bridge-pool-assignments/. + + - Compile the Java class, e.g., + $ javac ProcessSanitizedAssignments.java + + - Run the Java class, e.g., + $ java ProcessSanitizedAssignments bridge-pool-assignments/ + + - Once the Java application is done, you'll find a file assignments.csv + in this directory. + +-------------------------------------------------------------------------- + ProcessRelayConsensuses.java
- Download v3 relay consensuses from the metrics website, e.g., @@ -130,6 +147,23 @@ The columns in statuses.csv are:
--------------------------------------------------------------------------
+assignments.csv + +The assignments.csv file contains one line for every running bridge and +the rings, subrings, and buckets that BridgeDB assigned it to. + +The columns in assignments.csv are: + + - assignment: ISO-formatted bridge pool assignment time + - fingerprint: Hex-formatted SHA-1 hash of identity fingerprint + - type: Name of the distributor: "https", "email", or "unallocated" + - ring: Ring number, only for distributor "https" + - port: Port subring + - flag: Flag subring + - bucket: File bucket, only for distributor "unallocated" + +-------------------------------------------------------------------------- + relays.csv
The relays.csv file contains SHA-1 hashes of identity fingerprints of diff --git a/task-2680/verify.R b/task-2680/verify.R index 63ef233..241a196 100644 --- a/task-2680/verify.R +++ b/task-2680/verify.R @@ -25,3 +25,9 @@ if (file.exists("relays.csv")) { summary(as.POSIXct(r$consensus)) }
+if (file.exists("assignments.csv")) {  # run this check only when assignments.csv is present in the working directory
+  cat("Verifying assignments.csv. This may take a while.\n")
+  r <- read.csv("assignments.csv", stringsAsFactors = FALSE)  # overwrites r; string columns are not converted to factors
+  summary(as.POSIXct(r$assignment))  # summarize the assignment column after converting it to POSIXct
+}
+