commit d301479e3fdd8c1d17f02f6d72a8cf6eaa87e17e Author: Karsten Loesing karsten.loesing@gmx.net Date: Wed Oct 24 11:34:10 2012 -0400
Add code behind bridge user counting report (#5807). --- task-5807/bridge-dirreq-stats.R | 222 +++++++++++ task-5807/run.sh | 3 + task-5807/src/EvalBridgeDirreqStats.java | 603 ++++++++++++++++++++++++++++++ 3 files changed, 828 insertions(+), 0 deletions(-)
diff --git a/task-5807/bridge-dirreq-stats.R b/task-5807/bridge-dirreq-stats.R new file mode 100644 index 0000000..2a07a3a --- /dev/null +++ b/task-5807/bridge-dirreq-stats.R @@ -0,0 +1,222 @@ +library(ggplot2) +library(reshape) +library(scales) + +# Commented out, because this graph takes a while to draw... +#d <- read.csv("out/dirreq-responses", stringsAsFactors = FALSE, +# header = FALSE) +#d <- data.frame(date = as.Date(d$V1), requests = d$V4, +# asrelay = ifelse(d$V3, "also seen as\nnon-bridge relays", +# "only seen as\nbridges")) +#ggplot(d, aes(x = date, y = requests)) + +#geom_point() + +#facet_grid(asrelay ~ .) + +#scale_x_date(name = "", +# labels = date_format("%b %Y"), +# minor_breaks = date_breaks("1 month")) + +#scale_y_continuous(name = "", labels = comma_format(digits = 1)) +#ggsave("graphs/responses-single-bridges.png", width = 6, height = 3.5, +# dpi = 100) + +# ALTERNATIVE: out/bridge-dirreq-stats-no-relays +b <- read.csv("out/bridge-dirreq-stats-all-bridges", + stringsAsFactors = FALSE) +b <- b[b$date >= "2011-07-01" & b$date <= "2012-09-30", ] + +x <- data.frame(date = b$date, + value = (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / + ((b$ha + b$hc) * b$nabcd)) +x <- melt(x, id = "date") +ggplot(x, aes(x = as.Date(date), y = value)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", limit = c(0, 1), labels = percent) +ggsave("graphs/fraction.pdf", width = 6, height = 3, dpi = 100) + +ggplot(b, aes(x = as.Date(date), y = (ra + rb) / 86400)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/responses.pdf", width = 6, height = 3, dpi = 72) + +x <- data.frame( + date = as.Date(b$date), + responses = (b$ra + b$rb) / 86400, + fraction = (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / + ((b$ha + b$hc) * 
b$nabcd), + totalresponses = ((b$ra + b$rb) * (b$ha + b$hc) * + b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 86400) +x <- melt(x, id = "date") +x <- data.frame(date = x$date, value = x$value, variable = + ifelse(x$variable == "responses", + "1. Reported directory\nrequests", + ifelse(x$variable == "fraction", paste("2. Estimated fraction\n", + "of bridges reporting\ndirectory requests", sep = ""), + "3. Estimated directory\nrequests in the\nnetwork"))) +ggplot(x, aes(x = as.Date(date), y = value)) + +geom_line() + +facet_grid(variable ~ ., scales = "free_y") + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/extrapolated-responses.pdf", width = 6, height = 5, + dpi = 72) + +ggplot(b, aes(x = as.Date(date), y = (na + nb) / nabcd)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", limit = c(0, 1), labels = percent) +ggsave("graphs/fraction-unweighted.pdf", width = 6, height = 3, dpi = 72) + +x <- data.frame(date = b$date, + #x1 = (b$ra + b$rb) * b$nabcd / (b$na + b$nb), + x2 = ((b$ra + b$rb) * (b$ha + b$hc) * + b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb)) +#x <- melt(x, id = "date") +ggplot(x, aes(x = as.Date(date), y = x2 / 86400)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/totalresponses.pdf", width = 6, height = 3, dpi = 72) + +n <- data.frame(date = b$date, na = b$na / 86400, nb = b$nb / 86400, + nc = b$nc / 86400, nd = (b$nabcd - b$na - b$nb - b$nc) / 86400) +n <- melt(n, id = "date") +ggplot(n, aes(x = as.Date(date), y = value)) + +geom_line() + +facet_grid(variable ~ .) 
+ +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/n.pdf", width = 6, height = 7, dpi = 100) + +h <- data.frame(date = b$date, value = (b$ha + b$hc) / 86400) +ggplot(h, aes(x = as.Date(date), y = value)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/history-bytes.pdf", width = 6, height = 3, dpi = 100) + +h <- data.frame(date = b$date, ha = b$ha / 86400, hc = b$hc / 86400) +h <- melt(h, id = "date") +ggplot(h, aes(x = as.Date(date), y = value)) + +geom_line() + +facet_grid(variable ~ .) + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/h.pdf", width = 6, height = 5, dpi = 100) + +r <- data.frame(date = b$date, ra = b$ra / 86400, rb = b$rb / 86400) +r <- melt(r, id = "date") +ggplot(r, aes(x = as.Date(date), y = value)) + +geom_line() + +facet_grid(variable ~ .) 
+ +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/r.pdf", width = 6, height = 5, dpi = 100) + +x <- data.frame(date = b$date, + value = ((b$ra + b$rb) * (b$ha + b$hc) * + b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 864000, + stringsAsFactors = FALSE) +x <- melt(x, id = "date") +ggplot(x, aes(x = as.Date(date), y = value)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/totalusers.pdf", width = 6, height = 3, dpi = 100) +x <- x[x$date >= '2012-07-01', ] +#max_y <- max(x$value / 864000, na.omit = FALSE) +ggplot(x, aes(x = as.Date(date), y = value)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + breaks = date_breaks("1 month"), + minor_breaks = date_breaks("1 week")) + +scale_y_continuous(name = "", #limit = c(0, max_y), + labels = comma_format(digits = 1)) +ggsave("graphs/totalusers-q3-2012.pdf", width = 6, height = 3, dpi = 100) + +ggplot(b, aes(x = as.Date(date), y = consensuses)) + +geom_point() + +geom_hline(yintercept = 19.5, linetype = 2) + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/consensuses.pdf", width = 6, height = 3, dpi = 100) + +x <- data.frame(date = b$date, + value = (b$sy * (b$ra + b$rb) * (b$ha + b$hc) * + b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb)) +x <- melt(x, id = "date") +ggplot(x, aes(x = as.Date(date), y = value / 864000)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/syusers.pdf", width = 6, height = 3, dpi = 100) 
+ +u <- read.csv("bridge-users.csv", stringsAsFactors = FALSE) +u <- u[u$date >= "2011-07-01" & u$date <= "2012-09-30", ] +u <- data.frame(date = u$date, all = u$all) +ggplot(u, aes(x = as.Date(date), y = all)) + +geom_line() + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/totalusers-oldapproach.pdf", width = 6, height = 3, + dpi = 100) + +u <- read.csv("bridge-users.csv", stringsAsFactors = FALSE) +u <- u[u$date >= "2011-07-01" & u$date <= "2012-09-30", ] +u <- data.frame(date = u$date, value = u$all, + variable = "old approach based on\nunique IP addresses", + stringsAsFactors = FALSE) +x <- data.frame(date = b$date, + value = ((b$ra + b$rb) * (b$ha + b$hc) * + b$nabcd) / (b$ha * (b$na + b$nc) + (b$ha + b$hc) * b$nb) / 864000, + variable = "new approach based on\ndirectory requests", + stringsAsFactors = FALSE) +u <- rbind(u, x) +ggplot(u, aes(x = as.Date(date), y = value)) + +geom_line() + +facet_grid(variable ~ ., scales = "free_y") + +scale_x_date(name = "", + labels = date_format("%b %Y"), + minor_breaks = date_breaks("1 month")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/compare-totalusers.pdf", width = 6, height = 4, + dpi = 100) +u <- u[u$date >= '2012-07-01', ] +ggplot(u, aes(x = as.Date(date), y = value)) + +geom_line() + +facet_grid(variable ~ ., scales = "free_y") + +scale_x_date(name = "", + labels = date_format("%b %Y"), + breaks = date_breaks("1 month"), + minor_breaks = date_breaks("1 week")) + +scale_y_continuous(name = "", labels = comma_format(digits = 1)) +ggsave("graphs/compare-totalusers-q3-2012.pdf", width = 6, height = 4, + dpi = 100) + diff --git a/task-5807/run.sh b/task-5807/run.sh new file mode 100755 index 0000000..52d1ee7 --- /dev/null +++ b/task-5807/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash +javac -d bin/ -cp 
lib/commons-codec-1.4.jar:lib/commons-compress-1.3.jar:lib/descriptor.jar src/EvalBridgeDirreqStats.java && time java -Xmx6g -cp bin/:lib/commons-codec-1.4.jar:lib/commons-compress-1.3.jar:lib/descriptor.jar EvalBridgeDirreqStats + diff --git a/task-5807/src/EvalBridgeDirreqStats.java b/task-5807/src/EvalBridgeDirreqStats.java new file mode 100644 index 0000000..c996a26 --- /dev/null +++ b/task-5807/src/EvalBridgeDirreqStats.java @@ -0,0 +1,603 @@ +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.TimeZone; +import java.util.TreeMap; +import java.util.TreeSet; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.codec.digest.DigestUtils; +import org.torproject.descriptor.BridgeNetworkStatus; +import org.torproject.descriptor.Descriptor; +import org.torproject.descriptor.DescriptorFile; +import org.torproject.descriptor.DescriptorReader; +import org.torproject.descriptor.DescriptorSourceFactory; +import org.torproject.descriptor.ExtraInfoDescriptor; +import org.torproject.descriptor.NetworkStatusEntry; +import org.torproject.descriptor.RelayNetworkStatusConsensus; + +/* Extract relevant pieces of information from relay consensuses and + * bridge descriptors to estimate daily bridge users. See README for + * usage instructions. */ +public class EvalBridgeDirreqStats { + public static void main(String[] args) throws Exception { + + /* Parse relay consensuses from in/relay-descriptors/. Skip this step + * if in/relay-descriptors/ does not exist. 
*/ + File consensusesDirectory = new File("in/relay-descriptors"); + File hashedFingerprintsFile = new File("out/hashed-fingerprints"); + File consensusesPerDayFile = new File("out/consensuses-per-day"); + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + if (consensusesDirectory.exists()) { + SortedSet<String> hashedFingerprints = new TreeSet<String>(); + SortedMap<String, Integer> consensusesPerDay = + new TreeMap<String, Integer>(); + DescriptorReader descriptorReader = + DescriptorSourceFactory.createDescriptorReader(); + descriptorReader.addDirectory(consensusesDirectory); + Iterator<DescriptorFile> descriptorFiles = + descriptorReader.readDescriptors(); + while (descriptorFiles.hasNext()) { + DescriptorFile descriptorFile = descriptorFiles.next(); + for (Descriptor descriptor : descriptorFile.getDescriptors()) { + if (!(descriptor instanceof RelayNetworkStatusConsensus)) { + continue; + } + RelayNetworkStatusConsensus consensus = + (RelayNetworkStatusConsensus) descriptor; + + /* Extract hashed fingerprints of all known relays to remove + * those fingerprints from bridge usage statistics later on. */ + for (NetworkStatusEntry statusEntry : + consensus.getStatusEntries().values()) { + hashedFingerprints.add(Hex.encodeHexString(DigestUtils.sha( + Hex.decodeHex(statusEntry.getFingerprint(). + toCharArray()))).toUpperCase()); + } + + /* Count the number of consensuses per day. 
*/ + String date = dateFormat.format( + consensus.getValidAfterMillis()); + int consensuses = 1; + if (consensusesPerDay.containsKey(date)) { + consensuses += consensusesPerDay.get(date); + } + consensusesPerDay.put(date, consensuses); + } + } + hashedFingerprintsFile.getParentFile().mkdirs(); + BufferedWriter bw = new BufferedWriter(new FileWriter( + hashedFingerprintsFile)); + for (String hashedFingerprint : hashedFingerprints) { + bw.write(hashedFingerprint + "\n"); + } + bw.close(); + consensusesPerDayFile.getParentFile().mkdirs(); + bw = new BufferedWriter(new FileWriter(consensusesPerDayFile)); + for (Map.Entry<String, Integer> e : consensusesPerDay.entrySet()) { + bw.write(e.getKey() + "," + e.getValue() + "\n"); + } + bw.close(); + } + + /* Parse bridge network statuses from in/bridge-descriptors/. Skip + * this step if in/bridge-descriptors/ does not exist. */ + File bridgeDescriptorsDirectory = new File("in/bridge-descriptors"); + File bridgesPerDayFile = new File("out/bridges-per-day"); + File dirreqResponsesFile = new File("out/dirreq-responses"); + File dirreqWriteHistoryFile = new File("out/dirreq-write-history"); + File bridgeStatsUsersFile = new File("out/bridge-stats-users"); + SimpleDateFormat dateTimeFormat = new SimpleDateFormat( + "yyyy-MM-dd HH:mm:ss"); + dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + if (bridgeDescriptorsDirectory.exists()) { + + /* Read hashed fingerprints from disk, so that we can include in the + * intermediate files whether a bridge was running as non-bridge + * relay before. */ + SortedSet<String> hashedFingerprints = new TreeSet<String>(); + String line; + BufferedReader br = new BufferedReader(new FileReader( + hashedFingerprintsFile)); + while ((line = br.readLine()) != null) { + hashedFingerprints.add(line.toUpperCase()); + } + br.close(); + + /* Prepare data structures for first collecting everything we parse. + * There may be duplicates which we can best remove in memory. 
*/ + SortedMap<String, List<Integer>> bridgesPerDay = + new TreeMap<String, List<Integer>>(); + SortedSet<String> dirreqResponses = new TreeSet<String>(); + SortedMap<String, SortedMap<Long, Long>> dirreqWriteHistory = + new TreeMap<String, SortedMap<Long, Long>>(); + SortedSet<String> bridgeIps = new TreeSet<String>(); + + /* Parse everything in in/bridge-descriptors/. */ + DescriptorReader descriptorReader = + DescriptorSourceFactory.createDescriptorReader(); + descriptorReader.addDirectory(bridgeDescriptorsDirectory); + Iterator<DescriptorFile> descriptorFiles = + descriptorReader.readDescriptors(); + while (descriptorFiles.hasNext()) { + DescriptorFile descriptorFile = descriptorFiles.next(); + for (Descriptor descriptor : descriptorFile.getDescriptors()) { + if (descriptor instanceof BridgeNetworkStatus) { + BridgeNetworkStatus status = (BridgeNetworkStatus) descriptor; + + /* Extract number of running bridges to calculate daily means. + * Skip network statuses where less than 1% of bridges have + * the Running flag. */ + String date = dateFormat.format(status.getPublishedMillis()); + int totalBridges = 0, runningBridges = 0; + for (NetworkStatusEntry statusEntry : + status.getStatusEntries().values()) { + totalBridges++; + if (statusEntry.getFlags().contains("Running")) { + runningBridges++; + } + } + if (runningBridges * 100 > totalBridges) { + if (!bridgesPerDay.containsKey(date)) { + bridgesPerDay.put(date, new ArrayList<Integer>()); + } + bridgesPerDay.get(date).add(runningBridges); + } + } else if (descriptor instanceof ExtraInfoDescriptor) { + ExtraInfoDescriptor extraInfoDescriptor = + (ExtraInfoDescriptor) descriptor; + String fingerprint = extraInfoDescriptor.getFingerprint(). + toUpperCase(); + String wasSeenAsRelay = hashedFingerprints.contains( + fingerprint) ? "TRUE" : "FALSE"; + + /* Extract v3 directory request response numbers from dirreq + * stats, if available. 
*/ + if (extraInfoDescriptor.getDirreqStatsEndMillis() >= 0 && + extraInfoDescriptor.getDirreqStatsIntervalLength() + == 86400 && + extraInfoDescriptor.getDirreqV3Resp() != null && + extraInfoDescriptor.getDirreqV3Resp().containsKey("ok")) { + String dirreqStatsEnd = dateTimeFormat.format( + extraInfoDescriptor.getDirreqStatsEndMillis()); + SortedMap<String, Integer> resp = + extraInfoDescriptor.getDirreqV3Resp(); + String ok = String.valueOf(resp.get("ok")); + String notEnoughSigs = resp.containsKey("not-enough-sigs") + ? String.valueOf(resp.get("not-enough-sigs")) : "NA"; + String unavailable = resp.containsKey("unavailable") + ? String.valueOf(resp.get("unavailable")) : "NA"; + String notFound = resp.containsKey("not-found") + ? String.valueOf(resp.get("not-found")) : "NA"; + String notModified = resp.containsKey("not-modified") + ? String.valueOf(resp.get("not-modified")) : "NA"; + String busy = resp.containsKey("busy") + ? String.valueOf(resp.get("busy")) : "NA"; + dirreqResponses.add(String.format( + "%s,%s,%s,%s,%s,%s,%s,%s%n", dirreqStatsEnd, + fingerprint, wasSeenAsRelay, ok, notEnoughSigs, + unavailable, notFound, notModified, busy)); + } + + /* Extract written directory bytes, if available. */ + if (extraInfoDescriptor.getDirreqWriteHistory() != null && + extraInfoDescriptor.getDirreqWriteHistory(). + getIntervalLength() == 900) { + if (!dirreqWriteHistory.containsKey(fingerprint)) { + dirreqWriteHistory.put(fingerprint, + new TreeMap<Long, Long>()); + } + dirreqWriteHistory.get(fingerprint).putAll( + extraInfoDescriptor.getDirreqWriteHistory(). + getBandwidthValues()); + } + + /* Sum up unique IP address counts from .sy and from all + * countries from bridge stats, if available. 
*/ + if (extraInfoDescriptor.getBridgeStatsEndMillis() >= 0 && + extraInfoDescriptor.getBridgeStatsIntervalLength() + == 86400 && + extraInfoDescriptor.getBridgeIps() != null) { + String bridgeStatsEnd = dateTimeFormat.format( + extraInfoDescriptor.getBridgeStatsEndMillis()); + int sy = 0, all = 0; + for (Map.Entry<String, Integer> e : + extraInfoDescriptor.getBridgeIps().entrySet()) { + String country = e.getKey(); + int adjustedIps = e.getValue() - 4; + if (country.equals("sy")) { + sy = adjustedIps; + } + all += adjustedIps; + } + bridgeIps.add(String.format("%s,%s,%s,%d,%d%n", + bridgeStatsEnd, fingerprint, wasSeenAsRelay, sy, all)); + } + } + } + } + + /* Write to disk what we learned while parsing bridge extra-info + * descriptors. */ + bridgesPerDayFile.getParentFile().mkdirs(); + BufferedWriter bw = new BufferedWriter(new FileWriter( + bridgesPerDayFile)); + for (Map.Entry<String, List<Integer>> e : + bridgesPerDay.entrySet()) { + String date = e.getKey(); + List<Integer> bridges = e.getValue(); + int sum = 0; + for (int b : bridges) { + sum += b; + } + bw.write(String.format("%s,%d%n", date, sum / bridges.size())); + } + bw.close(); + dirreqResponsesFile.getParentFile().mkdirs(); + bw = new BufferedWriter(new FileWriter(dirreqResponsesFile)); + for (String resp : dirreqResponses) { + bw.write(resp); + } + bw.close(); + bridgeStatsUsersFile.getParentFile().mkdirs(); + bw = new BufferedWriter(new FileWriter(bridgeStatsUsersFile)); + for (String ips : bridgeIps) { + bw.write(ips); + } + bw.close(); + bw = new BufferedWriter(new FileWriter(dirreqWriteHistoryFile)); + for (Map.Entry<String, SortedMap<Long, Long>> e : + dirreqWriteHistory.entrySet()) { + String fingerprint = e.getKey(); + String wasSeenAsRelay = hashedFingerprints.contains( + fingerprint) ? 
"TRUE" : "FALSE"; + for (Map.Entry<Long, Long> f : e.getValue().entrySet()) { + String historyIntervalEnd = dateTimeFormat.format(f.getKey()); + bw.write(String.format("%s,%s,%d,%s%n", fingerprint, + historyIntervalEnd, f.getValue(), wasSeenAsRelay)); + } + } + bw.close(); + } + + /* Aggregate the parse results from above and write relevant data for + * estimating daily bridge users to disk. Write results to + * out/bridge-dirreq-stats. This step is distinct from the parsing + * steps, so that the parsing only has to be done once, whereas the + * aggregation can be tweaked and re-run easily. */ + File bridgeDirreqStatsNoRelaysFile = + new File("out/bridge-dirreq-stats-no-relays"); + File bridgeDirreqStatsAllBridgesFile = + new File("out/bridge-dirreq-stats-all-bridges"); + if (bridgesPerDayFile.exists() && + dirreqResponsesFile.exists() && + bridgeStatsUsersFile.exists() && + dirreqWriteHistoryFile.exists() && + consensusesPerDayFile.exists()) { + + /* Run the aggregation twice, once for all bridges and once for only + * bridges which haven't been seen as non-bridge relays before. */ + boolean[] exclude = new boolean[] { true, false }; + File[] outFiles = new File[] { bridgeDirreqStatsNoRelaysFile, + bridgeDirreqStatsAllBridgesFile }; + for (int r = 0; r < 2; r++) { + boolean excludeHashedFingerprints = exclude[r]; + File outFile = outFiles[r]; + + /* Read parse results back to memory. 
*/ + SortedMap<String, Integer> bridgesPerDay = + new TreeMap<String, Integer>(); + BufferedReader br = new BufferedReader(new FileReader( + bridgesPerDayFile)); + String line; + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + bridgesPerDay.put(parts[0], Integer.parseInt(parts[1])); + } + br.close(); + SortedMap<String, SortedMap<Long, Long>> dirreqOkResponses = + new TreeMap<String, SortedMap<Long, Long>>(); + br = new BufferedReader(new FileReader(dirreqResponsesFile)); + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + if (excludeHashedFingerprints && parts[2].equals("TRUE")) { + /* Skip, because this bridge has been seen as relay before. */ + continue; + } + String fingerprint = parts[1].toUpperCase(); + long dirreqStatsEndMillis = dateTimeFormat.parse(parts[0]). + getTime(); + long ok = Long.parseLong(parts[3]); + if (!dirreqOkResponses.containsKey(fingerprint)) { + dirreqOkResponses.put(fingerprint, new TreeMap<Long, Long>()); + } + dirreqOkResponses.get(fingerprint).put(dirreqStatsEndMillis, + ok); + } + br.close(); + SortedMap<String, long[]> ipsPerDay = + new TreeMap<String, long[]>(); + br = new BufferedReader(new FileReader(bridgeStatsUsersFile)); + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + if (excludeHashedFingerprints && parts[2].equals("TRUE")) { + /* Skip, because this bridge has been seen as relay before. */ + continue; + } + long bridgeStatsEndMillis = dateTimeFormat.parse(parts[0]). + getTime(); + long bridgeStatsStartMillis = bridgeStatsEndMillis - 86400000L; + long currentStartMillis = bridgeStatsStartMillis; + + /* Find UTC date break in the interval and make sure that we + * distribute IPs to the two days correctly. 
*/ + String[] dates = new String[] { + dateFormat.format(bridgeStatsStartMillis), + dateFormat.format(bridgeStatsEndMillis) }; + long[] seconds = new long[2]; + if (!dates[0].equals(dates[1])) { + long dateBreakMillis = (bridgeStatsEndMillis / 86400000L) + * 86400000L; + seconds[0] = (dateBreakMillis - bridgeStatsStartMillis) + / 1000L; + bridgeStatsStartMillis = dateBreakMillis; + } + seconds[1] = (bridgeStatsEndMillis - bridgeStatsStartMillis) + / 1000L; + + /* Update per-day counters. */ + for (int i = 0; i < dates.length; i++) { + String date = dates[i]; + long sy = seconds[i] * Long.parseLong(parts[3]); + long all = seconds[i] * Long.parseLong(parts[4]); + if (!ipsPerDay.containsKey(date)) { + ipsPerDay.put(date, new long[] { 0L, 0L }); + } + ipsPerDay.get(date)[0] += sy; + ipsPerDay.get(date)[1] += all; + } + } + br.close(); + SortedMap<String, Integer> consensusesPerDay = + new TreeMap<String, Integer>(); + br = new BufferedReader(new FileReader(consensusesPerDayFile)); + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + consensusesPerDay.put(parts[0], Integer.parseInt(parts[1])); + } + br.close(); + br = new BufferedReader(new FileReader(dirreqWriteHistoryFile)); + SortedMap<String, SortedMap<Long, Long>> dirreqWriteHistory = + new TreeMap<String, SortedMap<Long, Long>>(); + while ((line = br.readLine()) != null) { + String[] parts = line.split(","); + if (excludeHashedFingerprints && parts[3].equals("TRUE")) { + /* Skip, because this bridge has been seen as relay before. */ + continue; + } + String fingerprint = parts[0].toUpperCase(); + long historyIntervalEndMillis = dateTimeFormat.parse(parts[1]). 
+ getTime(); + long writtenBytes = Long.parseLong(parts[2]); + if (!dirreqWriteHistory.containsKey(fingerprint)) { + dirreqWriteHistory.put(fingerprint, new TreeMap<Long, Long>()); + } + dirreqWriteHistory.get(fingerprint).put(historyIntervalEndMillis, + writtenBytes); + } + br.close(); + + /* For every day, count reported v3 directory request responses, + * reported written directory bytes, and reporting bridges. + * Distinguish between bridges reporting both responses and bytes, + * bridges reporting only responses, and bridges reporting only + * bytes. Map + * keys are dates, map values are the number of responses, bytes, + * or bridges. */ + SortedMap<String, Long> + responsesReportingBoth = new TreeMap<String, Long>(), + responsesNotReportingBytes = new TreeMap<String, Long>(), + bytesReportingBoth = new TreeMap<String, Long>(), + bytesNotReportingResponses = new TreeMap<String, Long>(), + bridgesReportingBoth = new TreeMap<String, Long>(), + bridgesNotReportingBytes = new TreeMap<String, Long>(), + bridgesNotReportingResponses = new TreeMap<String, Long>(); + + /* Consider each bridge separately. */ + SortedSet<String> allFingerprints = new TreeSet<String>(); + allFingerprints.addAll(dirreqOkResponses.keySet()); + allFingerprints.addAll(dirreqWriteHistory.keySet()); + for (String fingerprint : allFingerprints) { + + /* Obtain iterators over dirreq stats intervals and dirreq write + * history intervals, from oldest to newest. Either iterator + * may contain zero elements if the bridge did not report any + * values, but not both. */ + SortedMap<Long, Long> bridgeDirreqOkResponses = + dirreqOkResponses.containsKey(fingerprint) ? + dirreqOkResponses.get(fingerprint) : + new TreeMap<Long, Long>(); + SortedMap<Long, Long> bridgeDirreqWriteHistory = + dirreqWriteHistory.containsKey(fingerprint) ?
+ dirreqWriteHistory.get(fingerprint) : + new TreeMap<Long, Long>(); + Iterator<Long> responsesIterator = + bridgeDirreqOkResponses.keySet().iterator(); + Iterator<Long> historyIterator = + bridgeDirreqWriteHistory.keySet().iterator(); + + /* Keep references to the currently considered intervals. */ + long responseEndMillis = responsesIterator.hasNext() ? + responsesIterator.next() : Long.MAX_VALUE; + long historyEndMillis = historyIterator.hasNext() ? + historyIterator.next() : Long.MAX_VALUE; + + /* Keep the time until when we have processed statistics. */ + long currentStartMillis = 0L; + + /* Iterate over both responses and byte histories until we set + * both to Long.MAX_VALUE, indicating that there are no further + * values. */ + while (responseEndMillis < Long.MAX_VALUE || + historyEndMillis < Long.MAX_VALUE) { + + /* Dirreq-stats intervals are guaranteed to be 24 hours long, + * and dirreq-write-history intervals are 15 minutes long. + * This is guaranteed in the parsing code above. It allows us + * to calculate interval starts. Also, if we have already + * processed part of an interval, move the considered interval + * start accordingly. */ + long historyStartMillis = Math.max(currentStartMillis, + historyEndMillis - 900000L); + long responseStartMillis = Math.max(currentStartMillis, + responseEndMillis - 86400000L); + + /* Determine start and end time of the next interval, and + * whether the bridge reported dirreq-stats in that interval, + * or dirreq histories, or both. 
*/ + long currentEndMillis; + boolean addHistory = false, addResponses = false; + if (historyStartMillis < responseStartMillis) { + currentStartMillis = historyStartMillis; + currentEndMillis = Math.min(historyEndMillis, + responseStartMillis); + addHistory = true; + } else if (responseStartMillis < historyStartMillis) { + currentStartMillis = responseStartMillis; + currentEndMillis = Math.min(historyStartMillis, + responseEndMillis); + addResponses = true; + } else { + currentStartMillis = historyStartMillis; + currentEndMillis = Math.min(historyEndMillis, + responseEndMillis); + addHistory = true; + addResponses = true; + } + + /* Depending on which statistics the bridge reported in the + * determined interval, obtain the number of bytes or + * responses to add. */ + long bytesInInterval = 0L, responsesInInterval = 0L; + if (addHistory) { + bytesInInterval = bridgeDirreqWriteHistory. + get(historyEndMillis); + } + if (addResponses) { + responsesInInterval = bridgeDirreqOkResponses. + get(responseEndMillis); + } + + /* Find out if there is a UTC date break in the interval to be + * added. If there is, make sure that we distribute responses + * and bytes to the two days correctly. */ + String[] dates = new String[] { + dateFormat.format(currentStartMillis), + dateFormat.format(currentEndMillis) }; + long[] seconds = new long[2]; + if (!dates[0].equals(dates[1])) { + long dateBreakMillis = (currentEndMillis / 86400000L) + * 86400000L; + seconds[0] = (dateBreakMillis - currentStartMillis) / 1000L; + currentStartMillis = dateBreakMillis; + } + seconds[1] = (currentEndMillis - currentStartMillis) / 1000L; + + /* Update per-day counters. 
*/ + for (int i = 0; i < dates.length; i++) { + String date = dates[i]; + long bytes = seconds[i] * bytesInInterval; + long responses = seconds[i] * responsesInInterval; + if (!bytesReportingBoth.containsKey(date)) { + bytesReportingBoth.put(date, 0L); + bytesNotReportingResponses.put(date, 0L); + responsesReportingBoth.put(date, 0L); + responsesNotReportingBytes.put(date, 0L); + bridgesReportingBoth.put(date, 0L); + bridgesNotReportingBytes.put(date, 0L); + bridgesNotReportingResponses.put(date, 0L); + } + if (addHistory) { + if (addResponses) { + bytesReportingBoth.put(date, + bytesReportingBoth.get(date) + bytes); + responsesReportingBoth.put(date, + responsesReportingBoth.get(date) + responses); + bridgesReportingBoth.put(date, + bridgesReportingBoth.get(date) + seconds[i]); + } else { + bytesNotReportingResponses.put(date, + bytesNotReportingResponses.get(date) + bytes); + bridgesNotReportingResponses.put(date, + bridgesNotReportingResponses.get(date) + + seconds[i]); + } + } else if (addResponses) { + responsesNotReportingBytes.put(date, + responsesNotReportingBytes.get(date) + responses); + bridgesNotReportingBytes.put(date, + bridgesNotReportingBytes.get(date) + seconds[i]); + } + } + + /* Move next interval start to the current interval end, and + * possibly move to the next stats intervals. If we have run + * out of intervals in either or both of the sets, change the + * reference to Long.MAX_VALUE to add the other intervals and + * finally exit the loop. */ + currentStartMillis = currentEndMillis; + if (historyEndMillis <= currentStartMillis) { + historyEndMillis = historyIterator.hasNext() ? + historyIterator.next() : Long.MAX_VALUE; + } + if (responseEndMillis <= currentStartMillis) { + responseEndMillis = responsesIterator.hasNext() ? + responsesIterator.next() : Long.MAX_VALUE; + } + } + } + + /* Put together what we learned about bridge usage per day. 
*/ + outFile.getParentFile().mkdirs(); + BufferedWriter bw = new BufferedWriter(new FileWriter(outFile)); + bw.write("date,nabcd,sy,consensuses,ha,hc,ra,rb,na,nb,nc\n"); + for (String date : bytesReportingBoth.keySet()) { + String bridges = "NA"; + if (bridgesPerDay.containsKey(date)) { + bridges = String.valueOf(bridgesPerDay.get(date) * 86400L); + } + String sy = "NA"; + if (ipsPerDay.containsKey(date)) { + long[] ips = ipsPerDay.get(date); + sy = String.format("%.5f", ((double) ips[0]) + / ((double) ips[1])); + } + String consensuses = "NA"; + if (consensusesPerDay.containsKey(date)) { + consensuses = String.valueOf(consensusesPerDay.get(date)); + } + bw.write(String.format("%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d%n", + date, bridges, sy, consensuses, + bytesReportingBoth.get(date), + bytesNotReportingResponses.get(date), + responsesReportingBoth.get(date), + responsesNotReportingBytes.get(date), + bridgesReportingBoth.get(date), + bridgesNotReportingBytes.get(date), + bridgesNotReportingResponses.get(date))); + } + bw.close(); + } + } + } +} +
tor-commits@lists.torproject.org