commit ee7df6a759f6788579f61403fb9771996ea5c988 Author: Karsten Loesing karsten.loesing@gmx.net Date: Fri Mar 11 14:51:12 2011 +0100
Add code for 2680. --- task-2680/.gitignore | 6 + task-2680/ProcessRelayConsensuses.java | 85 ++++++++ task-2680/ProcessSanitizedBridges.java | 327 ++++++++++++++++++++++++++++++++ task-2680/README | 145 ++++++++++++++ task-2680/verify.R | 27 +++ 5 files changed, 590 insertions(+), 0 deletions(-)
diff --git a/task-2680/.gitignore b/task-2680/.gitignore
new file mode 100644
index 0000000..134e86d
--- /dev/null
+++ b/task-2680/.gitignore
@@ -0,0 +1,6 @@
+*.class
+*.csv
+bridge-descriptors/
+commons-codec-1.4.jar
+consensuses/
+
diff --git a/task-2680/ProcessRelayConsensuses.java b/task-2680/ProcessRelayConsensuses.java
new file mode 100644
index 0000000..44d9ce3
--- /dev/null
+++ b/task-2680/ProcessRelayConsensuses.java
@@ -0,0 +1,113 @@
+import java.io.*;
+import java.util.*;
+import org.apache.commons.codec.binary.*;
+import org.apache.commons.codec.digest.*;
+
+/**
+ * Reads all relay consensuses found below a given directory and writes
+ * one line per consensus and relay to relays.csv, consisting of the
+ * consensus valid-after time and the hex-encoded SHA-1 hash of the
+ * relay's decoded identity.
+ */
+public class ProcessRelayConsensuses {
+
+  /**
+   * Runs the conversion.
+   *
+   * @param args exactly one element: the directory to search for files
+   *     whose names end with "-consensus"
+   * @throws IOException if reading a consensus or writing relays.csv
+   *     fails
+   */
+  public static void main(String[] args) throws IOException {
+
+    /* Validate command-line arguments. */
+    if (args.length != 1 || !new File(args[0]).exists()) {
+      System.out.println("Usage: java ProcessRelayConsensuses <dir>");
+      System.exit(1);
+    }
+
+    /* Find all files that we should parse. Somewhat fragile, but should
+     * work. */
+    System.out.println("Creating list of files we should parse.");
+    SortedMap<String, File> consensuses = new TreeMap<String, File>();
+    Stack<File> files = new Stack<File>();
+    files.add(new File(args[0]));
+    while (!files.isEmpty()) {
+      File file = files.pop();
+      String filename = file.getName();
+      if (file.isDirectory()) {
+        /* listFiles() returns null if the directory cannot be read;
+         * skip such directories instead of failing with a
+         * NullPointerException. */
+        File[] children = file.listFiles();
+        if (children != null) {
+          files.addAll(Arrays.asList(children));
+        }
+      } else if (filename.endsWith("-consensus")) {
+        consensuses.put(filename, file);
+      }
+    }
+    System.out.println("We found " + consensuses.size()
+        + " consensuses.");
+
+    /* Parse consensuses. */
+    if (!consensuses.isEmpty()) {
+      System.out.println("Parsing consensuses.");
+      BufferedWriter bw = new BufferedWriter(new FileWriter(
+          "relays.csv"));
+      bw.write("consensus,fingerprint\n");
+      int parsedConsensuses = 0, totalConsensuses = consensuses.size(),
+          writtenOutputLines = 1;
+      /* Report progress about every 10 percent. The step must be at
+       * least 1, or fewer than 10 consensuses would lead to a division
+       * by zero. */
+      int progressStep = Math.max(1, totalConsensuses / 10);
+      long started = System.currentTimeMillis();
+      for (File file : consensuses.values()) {
+        BufferedReader br = new BufferedReader(new FileReader(file));
+        String line, validAfter = null;
+        while ((line = br.readLine()) != null) {
+          if (line.startsWith("valid-after ")) {
+            validAfter = line.substring("valid-after ".length());
+          } else if (line.startsWith("r ")) {
+            if (validAfter == null) {
+              System.out.println("Found an r line before the valid-after "
+                  + "line in " + file.getName() + ". Please check. "
+                  + "Exiting.");
+              System.exit(1);
+            }
+            /* Re-add the "=" padding to the third field of the r line,
+             * decode it, and hash the result with SHA-1. */
+            String fingerprint = DigestUtils.shaHex(Base64.decodeBase64(
+                line.split(" ")[2] + "="));
+            bw.write(validAfter + "," + fingerprint + "\n");
+            writtenOutputLines++;
+          }
+        }
+        br.close();
+        parsedConsensuses++;
+        if (parsedConsensuses % progressStep == 0) {
+          double fractionDone = (double) (parsedConsensuses) /
+              (double) totalConsensuses;
+          double fractionLeft = 1.0D - fractionDone;
+          long now = System.currentTimeMillis();
+          double millisLeft = ((double) (now - started)) * fractionLeft /
+              fractionDone;
+          long secondsLeft = (long) millisLeft / 1000L;
+          System.out.println(" " + (100L * parsedConsensuses
+              / totalConsensuses) + "% done, " + secondsLeft
+              + " seconds left.");
+        }
+      }
+      bw.close();
+      System.out.println("Parsed " + parsedConsensuses + " consensuses "
+          + "and wrote " + writtenOutputLines + " lines to relays.csv.");
+    }
+
+    /* This is it. 
*/
+    System.out.println("Terminating.");
+  }
+}
+
+
diff --git a/task-2680/ProcessSanitizedBridges.java b/task-2680/ProcessSanitizedBridges.java
new file mode 100644
index 0000000..1f0e00e
--- /dev/null
+++ b/task-2680/ProcessSanitizedBridges.java
@@ -0,0 +1,372 @@
+import java.io.*;
+import java.text.*;
+import java.util.*;
+import org.apache.commons.codec.binary.*;
+
+/**
+ * Converts sanitized bridge network statuses, server descriptors, and
+ * extra-info descriptors found below a given directory to the two CSV
+ * files statuses.csv and descriptors.csv.
+ */
+public class ProcessSanitizedBridges {
+
+  /**
+   * Runs the conversion.
+   *
+   * @param args exactly one element: the directory to search for
+   *     descriptor files
+   * @throws IOException if reading a descriptor or writing a CSV file
+   *     fails
+   * @throws ParseException if a published or geoip-start-time timestamp
+   *     cannot be parsed
+   */
+  public static void main(String[] args) throws IOException,
+      ParseException {
+
+    /* Validate command-line arguments. */
+    if (args.length != 1 || !new File(args[0]).exists()) {
+      System.out.println("Usage: java ProcessSanitizedBridges <dir>");
+      System.exit(1);
+    }
+
+    /* Find all files that we should parse. Somewhat fragile, but should
+     * work: files are assigned to a descriptor type if the type's
+     * directory name occurs anywhere in their absolute path. */
+    System.out.println("Creating list of files we should parse.");
+    SortedMap<String, File> statuses = new TreeMap<String, File>();
+    SortedMap<String, File> serverDescriptors =
+        new TreeMap<String, File>();
+    SortedMap<String, File> extraInfoDescriptors =
+        new TreeMap<String, File>();
+    Stack<File> files = new Stack<File>();
+    files.add(new File(args[0]));
+    while (!files.isEmpty()) {
+      File file = files.pop();
+      String path = file.getAbsolutePath();
+      String filename = file.getName();
+      if (file.isDirectory()) {
+        /* listFiles() returns null if the directory cannot be read;
+         * skip such directories instead of failing with a
+         * NullPointerException. */
+        File[] children = file.listFiles();
+        if (children != null) {
+          files.addAll(Arrays.asList(children));
+        }
+      } else if (path.contains("statuses")) {
+        statuses.put(filename, file);
+      } else if (path.contains("server-descriptors")) {
+        serverDescriptors.put(filename, file);
+      } else if (path.contains("extra-infos")) {
+        extraInfoDescriptors.put(filename, file);
+      }
+    }
+    System.out.println("We found\n " + statuses.size() + " statuses,\n "
+        + serverDescriptors.size() + " server descriptors, and\n "
+        + extraInfoDescriptors.size() + " extra-info descriptors.");
+
+    /* Parse statuses. */
+    if (!statuses.isEmpty()) {
+      System.out.println("Parsing statuses.");
+      List<String> knownFlags = new ArrayList<String>(Arrays.asList(
+          ("Authority,BadExit,BadDirectory,Exit,Fast,Guard,Named,Stable,"
+          + "Running,Valid,V2Dir").split(",")));
+      BufferedWriter bw = new BufferedWriter(new FileWriter(
+          "statuses.csv"));
+      bw.write("status,fingerprint,descriptor,published,address,orport,"
+          + "dirport");
+      for (String knownFlag : knownFlags) {
+        bw.write("," + knownFlag.toLowerCase());
+      }
+      bw.write("\n");
+      int parsedStatuses = 0, totalStatuses = statuses.size(),
+          writtenOutputLines = 1;
+      /* Report progress about every 10 percent. The step must be at
+       * least 1, or fewer than 10 statuses would lead to a division by
+       * zero. */
+      int progressStep = Math.max(1, totalStatuses / 10);
+      long started = System.currentTimeMillis();
+      for (File file : statuses.values()) {
+        String filename = file.getName();
+        if (filename.length() != ("20110101-000703-"
+            + "4A0CCD2DDC7995083D73F5D667100C8A5831F16D").length()) {
+          System.out.println("Status filename has wrong length: '"
+              + filename + "' Please check. Exiting.");
+          System.exit(1);
+        }
+        /* The file name starts with the status publication time in the
+         * format yyyyMMdd-HHmmss. */
+        String statusDateTime = filename.substring(0, 4) + "-"
+            + filename.substring(4, 6) + "-" + filename.substring(6, 8)
+            + " " + filename.substring(9, 11) + ":"
+            + filename.substring(11, 13) + ":"
+            + filename.substring(13, 15);
+        BufferedReader br = new BufferedReader(new FileReader(file));
+        String line;
+        while ((line = br.readLine()) != null) {
+          if (line.startsWith("r ")) {
+            String[] parts = line.split(" ");
+            if (parts.length != 9) {
+              System.out.println("r line doesn't have the correct number "
+                  + "of entries: '" + line + "'. Please check. Exiting.");
+              System.exit(1);
+            }
+            /* parts[2] is the base64-encoded fingerprint and parts[3]
+             * the base64-encoded descriptor identifier; "=" is appended
+             * as padding before decoding. */
+            String fingerprint = Hex.encodeHexString(Base64.decodeBase64(
+                parts[2] + "="));
+            String descriptor = Hex.encodeHexString(Base64.decodeBase64(
+                parts[3] + "="));
+            String published = parts[4] + " " + parts[5];
+            String address = parts[6];
+            String orPort = parts[7];
+            String dirPort = parts[8];
+            bw.write(statusDateTime + "," + fingerprint + "," + descriptor
+                + "," + published + "," + address + "," + orPort + ","
+                + dirPort);
+          } else if (line.equals("s") || line.startsWith("s ")) {
+            String flags = line.substring(1);
+            for (String flag : knownFlags) {
+              if (flags.contains(" " + flag)) {
+                bw.write(",TRUE");
+              } else {
+                bw.write(",FALSE");
+              }
+            }
+            bw.write("\n");
+            writtenOutputLines++;
+          }
+        }
+        br.close();
+        parsedStatuses++;
+        if (parsedStatuses % progressStep == 0) {
+          double fractionDone = (double) (parsedStatuses) /
+              (double) totalStatuses;
+          double fractionLeft = 1.0D - fractionDone;
+          long now = System.currentTimeMillis();
+          double millisLeft = ((double) (now - started)) * fractionLeft /
+              fractionDone;
+          long secondsLeft = (long) millisLeft / 1000L;
+          System.out.println(" " + (100L * parsedStatuses
+              / totalStatuses) + "% done, " + secondsLeft
+              + " seconds left.");
+        }
+      }
+      bw.close();
+      System.out.println("Parsed " + parsedStatuses + " statuses and "
+          + "wrote " + writtenOutputLines + " lines to statuses.csv.");
+    }
+
+    /* Parse server descriptors and extra-info descriptors. */
+    if (!serverDescriptors.isEmpty()) {
+      System.out.println("Parsing server descriptors and extra-info "
+          + "descriptors.");
+      List<String> knownCountries = new ArrayList<String>(Arrays.asList(
+          ("?? A1 A2 AD AE AF AG AI AL AM AN AO AP AQ AR AS AT AU AW AX "
+          + "AZ BA BB BD BE BF BG BH BI BJ BM BN BO BR BS BT BV BW BY BZ "
+          + "CA CD CF CG CH CI CK CL CM CN CO CR CS CU CV CY CZ DE DJ DK "
+          + "DM DO DZ EC EE EG ER ES ET EU FI FJ FK FM FO FR GA GB GD GE "
+          + "GF GG GH GI GL GM GN GP GQ GR GT GU GW GY HK HN HR HT HU ID "
+          + "IE IL IM IN IO IQ IR IS IT JE JM JO JP KE KG KH KI KM KN KP "
+          + "KR KW KY KZ LA LB LC LI LK LR LS LT LU LV LY MA MC MD ME MF "
+          + "MG MH MK ML MM MN MO MP MQ MR MS MT MU MV MW MX MY MZ NA NC "
+          + "NE NF NG NI NL NO NP NR NU NZ OM PA PE PF PG PH PK PL PM PR "
+          + "PS PT PW PY QA RE RO RS RU RW SA SB SC SD SE SG SH SI SJ SK "
+          + "SL SM SN SO SR ST SV SY SZ TC TD TG TH TJ TK TL TM TN TO TR "
+          + "TT TV TW TZ UA UG UM US UY UZ VA VC VE VG VI VN VU WF WS YE "
+          + "YT ZA ZM ZW").toLowerCase().split(" ")));
+      BufferedWriter bw = new BufferedWriter(new FileWriter(
+          "descriptors.csv"));
+      bw.write("descriptor,fingerprint,published,address,orport,dirport,"
+          + "version,platform,uptime,bridgestatsend,bridgestatsseconds");
+      for (String country : knownCountries) {
+        bw.write("," + country);
+      }
+      bw.write(",bridgestatscountries,bridgestatstotal\n");
+      int parsedServerDescriptors = 0, parsedExtraInfoDescriptors = 0,
+          parsedGeoipStats = 0, skippedGeoipStats = 0,
+          parsedBridgeStats = 0,
+          totalServerDescriptors = serverDescriptors.size(),
+          writtenOutputLines = 1;
+      /* Report progress about every 1 percent. The step must be at
+       * least 1, or fewer than 100 server descriptors would lead to a
+       * division by zero. */
+      int progressStep = Math.max(1, totalServerDescriptors / 100);
+      SimpleDateFormat timeFormat = new SimpleDateFormat(
+          "yyyy-MM-dd HH:mm:ss");
+      timeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+      long started = System.currentTimeMillis();
+      for (File file : serverDescriptors.values()) {
+        String filename = file.getName();
+        BufferedReader br = new BufferedReader(new FileReader(file));
+        String line, fingerprint = null, published = null, address = null,
+            orPort = null, dirPort = null, version = null,
+            platform = null, uptime = null, extraInfoDigest = null,
+            bridgeStatsEnd = null, bridgeStatsSeconds = null;
+        SortedMap<String, String> bridgeStatsIps =
+            new TreeMap<String, String>();
+        long bridgeStatsTotal = 0L;
+        while ((line = br.readLine()) != null) {
+          /* Ignore an "opt " prefix in front of a keyword. */
+          if (line.startsWith("opt ")) {
+            line = line.substring(4);
+          }
+          if (line.startsWith("router ")) {
+            String[] parts = line.split(" ");
+            address = parts[2];
+            orPort = parts[3];
+            dirPort = parts[4];
+          } else if (line.startsWith("platform ")) {
+            version = line.split(" ")[2];
+            platform = line.substring(line.indexOf("on ")
+                + "on ".length());
+            if (platform.contains("Windows")) {
+              platform = "Windows";
+            } else if (platform.contains("Linux")) {
+              platform = "Linux";
+            } else if (platform.contains("Darwin")) {
+              platform = "Mac OS X";
+            } else if (platform.contains("BSD")) {
+              platform = "*BSD";
+            } else {
+              platform = "Other";
+            }
+          } else if (line.startsWith("published ")) {
+            String[] parts = line.split(" ");
+            published = parts[1] + " " + parts[2];
+          } else if (line.startsWith("fingerprint ")) {
+            fingerprint = line.substring("fingerprint".length()).
+                replaceAll(" ", "").toLowerCase();
+          } else if (line.startsWith("uptime ")) {
+            uptime = line.split(" ")[1];
+          } else if (line.startsWith("extra-info-digest ")) {
+            extraInfoDigest = line.substring("extra-info-digest ".
+                length()).toLowerCase();
+            if (extraInfoDescriptors.containsKey(extraInfoDigest)) {
+              parsedExtraInfoDescriptors++;
+              BufferedReader br2 = new BufferedReader(new FileReader(
+                  extraInfoDescriptors.get(extraInfoDigest)));
+              String geoipStartTime = null, bridgeStatsEndLine = null;
+              while ((line = br2.readLine()) != null) {
+                if (line.startsWith("geoip-start-time ")) {
+                  geoipStartTime = line.substring("geoip-start-time ".
+                      length());
+                } else if (line.startsWith("geoip-client-origins ") &&
+                    line.split(" ").length > 1 && published != null &&
+                    geoipStartTime != null) {
+                  /* Skip geoip-stats of 0.2.2.x bridges, which the
+                   * summary printed at the end reports as broken. */
+                  if (version.startsWith("0.2.2.")) {
+                    skippedGeoipStats++;
+                  } else {
+                    parsedGeoipStats++;
+                    bridgeStatsEnd = published;
+                    bridgeStatsSeconds = ""
+                        + (timeFormat.parse(published).getTime()
+                        - timeFormat.parse(geoipStartTime).getTime())
+                        / 1000L;
+                    /* Each pair is a two-letter country code, one
+                     * separator character, and the number of IPs. */
+                    for (String pair : line.split(" ")[1].split(",")) {
+                      String country = pair.substring(0, 2);
+                      String ips = pair.substring(3);
+                      bridgeStatsIps.put(country, ips);
+                      bridgeStatsTotal += Long.parseLong(ips);
+                    }
+                  }
+                } else if (line.startsWith("bridge-stats-end ")) {
+                  bridgeStatsEndLine = line;
+                } else if (line.startsWith("bridge-ips ") &&
+                    line.length() > "bridge-ips ".length() &&
+                    bridgeStatsEndLine != null) {
+                  parsedBridgeStats++;
+                  String[] parts = bridgeStatsEndLine.split(" ");
+                  bridgeStatsEnd = parts[1] + " " + parts[2];
+                  bridgeStatsSeconds = parts[3].substring(1);
+                  /* Each pair is a two-letter country code, one
+                   * separator character, and the number of IPs. */
+                  for (String pair : line.split(" ")[1].split(",")) {
+                    String country = pair.substring(0, 2);
+                    String ips = pair.substring(3);
+                    bridgeStatsIps.put(country, ips);
+                    bridgeStatsTotal += Long.parseLong(ips);
+                  }
+                }
+              }
+              br2.close();
+            }
+          }
+        }
+        br.close();
+        if (fingerprint == null || published == null || address == null ||
+            orPort == null || dirPort == null || version == null ||
+            platform == null || uptime == null) {
+          System.out.println("Server descriptor " + filename + " is "
+              + "missing critical information. Please check. Exiting.");
+          System.exit(1);
+        }
+        bw.write(filename + "," + fingerprint + "," + published + ","
+            + address + "," + orPort + "," + dirPort + "," + version + ","
+            + platform + "," + uptime);
+        if (bridgeStatsEnd != null) {
+          bw.write("," + bridgeStatsEnd + "," + bridgeStatsSeconds);
+          int bridgeStatsCountries = bridgeStatsIps.size();
+          /* Remove every written country from the map, so that only
+           * unknown country codes remain afterwards. */
+          for (String country : knownCountries) {
+            if (bridgeStatsIps.containsKey(country)) {
+              bw.write("," + bridgeStatsIps.remove(country));
+            } else {
+              bw.write(",0");
+            }
+          }
+          if (!bridgeStatsIps.isEmpty()) {
+            StringBuilder message = new StringBuilder();
+            for (String country : bridgeStatsIps.keySet()) {
+              message.append(", " + country);
+            }
+            System.out.println("Unknown " + (bridgeStatsIps.size() == 1 ?
+                "country" : "countries") + " " + message.toString().
+                substring(2) + " in extra-info descriptor "
+                + extraInfoDigest + ". Please check. Exiting.");
+            System.exit(1);
+          }
+          bw.write("," + bridgeStatsCountries + "," + bridgeStatsTotal
+              + "\n");
+        } else {
+          bw.write(",NA,NA");
+          for (String country : knownCountries) {
+            bw.write(",NA");
+          }
+          bw.write(",NA,NA\n");
+        }
+        writtenOutputLines++;
+        parsedServerDescriptors++;
+        if (parsedServerDescriptors % progressStep == 0) {
+          double fractionDone = (double) (parsedServerDescriptors) /
+              (double) totalServerDescriptors;
+          double fractionLeft = 1.0D - fractionDone;
+          long now = System.currentTimeMillis();
+          double millisLeft = ((double) (now - started)) * fractionLeft /
+              fractionDone;
+          long secondsLeft = (long) millisLeft / 1000L;
+          System.out.println(" " + (100L * parsedServerDescriptors
+              / totalServerDescriptors) + "% done, " + secondsLeft
+              + " seconds left.");
+        }
+      }
+      bw.close();
+      System.out.println("Parsed " + parsedServerDescriptors + " server "
+          + "descriptors and " + parsedExtraInfoDescriptors
+          + " extra-info descriptors.\nParsed " + parsedGeoipStats
+          + " geoip-stats and " + parsedBridgeStats + " bridge-stats.\n"
+          + "Skipped " + skippedGeoipStats + " broken geoip-stats of "
+          + "0.2.2.x bridges.\nWrote " + writtenOutputLines + " lines to "
+          + "descriptors.csv.");
+    }
+
+    /* This is it. */
+    System.out.println("Terminating.");
+  }
+}
+
diff --git a/task-2680/README b/task-2680/README
new file mode 100644
index 0000000..a00856f
--- /dev/null
+++ b/task-2680/README
@@ -0,0 +1,145 @@
+This ticket contains Java and R code to
+
+ a) process bridge and relay data to convert them to a format that is more
+    useful for researchers and
+ b) verify that the output data files are valid.
+
+This README has a separate section for each Java or R code snippet.
+
+The Java applications produce three output formats containing bridge
+descriptors, bridge status lines, and hashed relay identities. The data
+formats are described below.
+
+--------------------------------------------------------------------------
+
+ProcessSanitizedBridges.java
+
+ - Download sanitized bridge descriptors from the metrics website, e.g.,
+   https://metrics.torproject.org/data/bridge-descriptors-2011-01.tar.bz2,
+   and extract them in a local directory, e.g., bridge-descriptors/.
+
+ - Download Apache Commons Codec 1.4 or higher and put it in this
+   directory.
+
+ - Compile the Java class, e.g.,
+   $ javac -cp commons-codec-1.4.jar ProcessSanitizedBridges.java
+
+ - Run the Java class, e.g.,
+   $ java -cp .:commons-codec-1.4.jar ProcessSanitizedBridges
+     bridge-descriptors/
+
+ - Once the Java application is done, you'll find the two files
+   statuses.csv and descriptors.csv in this directory.
+
+--------------------------------------------------------------------------
+
+ProcessRelayConsensuses.java
+
+ - Download v3 relay consensuses from the metrics website, e.g.,
+   https://metrics.torproject.org/data/consensuses-2011-01.tar.bz2, and
+   extract them in a local directory, e.g., consensuses/.
+
+ - Download Apache Commons Codec 1.4 or higher and put it in this
+   directory, unless you have already done this above for
+   ProcessSanitizedBridges.java. 
+
+ - Compile the Java class, e.g.,
+   $ javac -cp commons-codec-1.4.jar ProcessRelayConsensuses.java
+
+ - Run the Java class, e.g.,
+   $ java -cp .:commons-codec-1.4.jar ProcessRelayConsensuses consensuses/
+
+ - Once the Java application is done, you'll find a file relays.csv in
+   this directory.
+
+--------------------------------------------------------------------------
+
+verify.R
+
+ - Run the R verification script like this:
+   $ R --slave -f verify.R
+
+--------------------------------------------------------------------------
+
+descriptors.csv
+
+The descriptors.csv file contains one line for each bridge descriptor that
+a bridge has published. Each line consists of fields coming from
+the bridge's server descriptor and the bridge's extra-info descriptor that
+was published at the same time.
+
+The columns in descriptors.csv are:
+
+ - descriptor: Hex-formatted descriptor identifier
+ - fingerprint: Hex-formatted SHA-1 hash of identity fingerprint
+ - published: ISO-formatted descriptor publication time
+ - address: Sanitized IPv4 address in dotted notation
+ - orport: OR port
+ - dirport: Dir port
+ - version: Tor version
+ - platform: Operating system family (Windows, Linux, etc.)
+ - uptime: Uptime in seconds
+ - bridgestatsend: ISO-formatted time when stats interval ended
+ - bridgestatsseconds: Stats interval length in seconds
+ - ??: Unique client IP addresses that could not be resolved
+ - a1: Unique client IP addresses from anonymous proxies
+ - a2: Unique client IP addresses from satellite providers
+ - ad: Unique client IP addresses from Andorra
+ - ae: Unique client IP addresses from the United Arab Emirates
+ - [...] See ISO 3166-1 alpha-2 country codes
+ - zw: Unique client IP addresses from Zimbabwe
+ - bridgestatscountries: Number of countries with non-zero unique IPs
+ - bridgestatstotal: Total number of unique IPs
+
+There are two sources for the bridgestats* and country-code columns,
+depending on Tor's version. 
Bridges running Tor version 0.2.1.x or
+earlier use dynamic stats intervals from a few hours to a few days.
+Bridges running early 0.2.2.x versions published faulty stats, which are
+therefore left out of descriptors.csv. Bridges running 0.2.2.x or higher
+(except the faulty 0.2.2.x versions) collect stats in 24-hour intervals.
+
+--------------------------------------------------------------------------
+
+statuses.csv
+
+The statuses.csv file contains one line for every bridge that is
+referenced in a bridge network status. Note that if a bridge is running
+for, say, 12 hours, it will be contained in 24 half-hourly published
+statuses in that time and will be listed 24 times in statuses.csv.
+
+The columns in statuses.csv are:
+
+ - status: ISO-formatted status publication time
+ - fingerprint: Hex-formatted SHA-1 hash of identity fingerprint
+ - descriptor: Hex-formatted descriptor identifier
+ - published: ISO-formatted descriptor publication time
+ - address: Sanitized IPv4 address in dotted notation
+ - orport: OR port
+ - dirport: Dir port
+ - authority: TRUE if bridge has the Authority flag, FALSE otherwise
+ - badexit: TRUE if bridge has the BadExit flag, FALSE otherwise
+ - baddirectory: TRUE if bridge has the BadDirectory flag, FALSE otherwise
+ - exit: TRUE if bridge has the Exit flag, FALSE otherwise
+ - fast: TRUE if bridge has the Fast flag, FALSE otherwise
+ - guard: TRUE if bridge has the Guard flag, FALSE otherwise
+ - named: TRUE if bridge has the Named flag, FALSE otherwise
+ - stable: TRUE if bridge has the Stable flag, FALSE otherwise
+ - running: TRUE if bridge has the Running flag, FALSE otherwise
+ - valid: TRUE if bridge has the Valid flag, FALSE otherwise
+ - v2dir: TRUE if bridge has the V2Dir flag, FALSE otherwise
+
+--------------------------------------------------------------------------
+
+relays.csv
+
+The relays.csv file contains SHA-1 hashes of identity fingerprints of
+normal relays. 
If a bridge uses the same identity key that it also used
+as a relay, it might observe more users than it would observe as a pure
+bridge. Therefore, bridges that have been running as relays before should
+be excluded from bridge statistics.
+
+The columns in relays.csv are:
+
+ - consensus: ISO-formatted consensus publication time
+ - fingerprint: Hex-formatted SHA-1 hash of identity fingerprint
+
diff --git a/task-2680/verify.R b/task-2680/verify.R
new file mode 100644
index 0000000..63ef233
--- /dev/null
+++ b/task-2680/verify.R
@@ -0,0 +1,34 @@
+# Usage: R --slave -f verify.R
+#
+# Prints a short summary of each CSV file produced by the Java programs
+# in this directory, or a note if the file does not exist.
+
+if (file.exists("descriptors.csv")) {
+  cat("Verifying descriptors.csv. This may take a while.\n")
+  d <- read.csv("descriptors.csv", stringsAsFactors = FALSE)
+  cat(" ", length(na.omit(d$bridgestatsend)), "of", length(d$descriptor),
+      "descriptors contain bridge stats.\n")
+} else {
+  cat("descriptors.csv does not exist\n")
+}
+
+if (file.exists("statuses.csv")) {
+  cat("Verifying statuses.csv. This may take a while.\n")
+  s <- read.csv("statuses.csv", stringsAsFactors = FALSE)
+  cat(" ", length(s[s$running == TRUE, "running"]), "of",
+      length(s$running), "bridges contained in the statuses have the",
+      "Running flag.\n")
+} else {
+  cat("statuses.csv does not exist\n")
+}
+
+if (file.exists("relays.csv")) {
+  cat("Verifying relays.csv. This may take a while.\n")
+  r <- read.csv("relays.csv", stringsAsFactors = FALSE)
+  # Parse the consensus times as UTC (NOTE: assumes valid-after times are
+  # UTC), so that the result does not depend on the local time zone.
+  print(summary(as.POSIXct(r$consensus, tz = "UTC")))
+} else {
+  cat("relays.csv does not exist\n")
+}
+