[metrics-tasks/master] Add parsing and graphing code for #3260.

commit e435e803f808994aa42de70e4b8af048c0005509
Author: Karsten Loesing <karsten.loesing@gmx.net>
Date:   Tue Mar 20 09:15:23 2012 +0100

    Add parsing and graphing code for #3260.
---
 task-3260/.gitignore                    |    8 ++
 task-3260/ExtractClientSpeedTrends.java |  150 +++++++++++++++++++++++++++++++
 task-3260/README                        |   24 +++++
 task-3260/client-speed-trends.R         |   30 ++++++
 task-3260/run.sh                        |    3 +
 5 files changed, 215 insertions(+), 0 deletions(-)

diff --git a/task-3260/.gitignore b/task-3260/.gitignore
new file mode 100644
index 0000000..8231133
--- /dev/null
+++ b/task-3260/.gitignore
@@ -0,0 +1,8 @@
+in/
+status/
+out/
+*.pdf
+*.jar
+*.class
+*.png
+
diff --git a/task-3260/ExtractClientSpeedTrends.java b/task-3260/ExtractClientSpeedTrends.java
new file mode 100644
index 0000000..5a2e36e
--- /dev/null
+++ b/task-3260/ExtractClientSpeedTrends.java
@@ -0,0 +1,150 @@
+import java.io.*;
+import java.text.*;
+import java.util.*;
+import org.torproject.descriptor.*;
+
+/* Extract client speed trends from download times of network status
+ * consensuses. Directory mirrors report minimum, maximum, median,
+ * quartiles, and deciles of measured client bandwidths in their
+ * extra-info descriptors. Combine statistics from the top 50 percent of
+ * directory mirrors by finished downloads, under the assumption that
+ * these directory mirrors had enough available bandwidth to serve even
+ * fast clients. Calculate statistics on a given day by reconstructing
+ * original client bandwidths from reported percentiles by assuming a
+ * linear increase between two reported percentiles and calculating
+ * percentiles of all client bandwidth values on that day. */
+public class ExtractClientSpeedTrends {
+  public static void main(String[] args) throws Exception {
+
+    System.out.println(new Date() + " Starting.");
+
+    /* Read dirreq-v3-tunneled-dl lines from in/extra-infos/ and append
+     * them to files status/YYYY-MM-DD, prefixing lines with the relay
+     * fingerprint to detect duplicates. */
+    System.out.println(new Date() + " Reading in/extra-infos/* ...");
+    SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+    SimpleDateFormat dateTimeFormatter = new SimpleDateFormat(
+        "yyyy-MM-dd HH:mm:ss");
+    dateFormatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+    dateTimeFormatter.setTimeZone(TimeZone.getTimeZone("UTC"));
+    DescriptorReader extraInfoReader = DescriptorSourceFactory
+        .createDescriptorReader();
+    extraInfoReader.addDirectory(new File("in/extra-infos"));
+    extraInfoReader.setExcludeFiles(new File(
+        "status/extra-info-history"));
+    Iterator<DescriptorFile> extraInfoFiles =
+        extraInfoReader.readDescriptors();
+    List<String> columns = Arrays.asList(
+        "complete,min,d1,d2,q1,d3,d4,md,d6,d7,q3,d8,d9,max".split(","));
+    while (extraInfoFiles.hasNext()) {
+      DescriptorFile extraInfoFile = extraInfoFiles.next();
+      if (extraInfoFile.getDescriptors() == null) {
+        continue;
+      }
+      for (Descriptor descriptor : extraInfoFile.getDescriptors()) {
+        ExtraInfoDescriptor extraInfoDescriptor =
+            (ExtraInfoDescriptor) descriptor;
+        SortedMap<String, Integer> dirreqV3TunneledDl =
+            extraInfoDescriptor.getDirreqV3TunneledDl();
+        if (dirreqV3TunneledDl == null ||
+            !dirreqV3TunneledDl.keySet().containsAll(columns)) {
+          continue;
+        }
+        long dirreqStatsEndMillis = extraInfoDescriptor.
+            getDirreqStatsEndMillis();
+        String dirreqStatsEndDate = dateFormatter.format(
+            dirreqStatsEndMillis);
+        String dirreqStatsEndDateTime = dateTimeFormatter.format(
+            dirreqStatsEndMillis);
+        String fingerprint = extraInfoDescriptor.getFingerprint();
+        File statusFile = new File("status/" + dirreqStatsEndDate);
+        statusFile.getParentFile().mkdirs();
+        BufferedWriter bw = new BufferedWriter(new FileWriter(statusFile,
+            true));
+        bw.write(fingerprint + "," + dirreqStatsEndDateTime);
+        for (String column : columns) {
+          bw.write("," + String.valueOf(dirreqV3TunneledDl.get(column)));
+        }
+        bw.write("\n");
+        bw.close();
+      }
+    }
+
+    /* Iterate over files status/YYYY-MM-DD, aggregate client speed
+     * statistics, and write them to out/client-speed-trends.csv. */
+    System.out.println(new Date() + " Writing "
+        + "out/client-speed-trends.csv ...");
+    File clientSpeedTrendsFile = new File("out/client-speed-trends.csv");
+    if (!clientSpeedTrendsFile.exists()) {
+      clientSpeedTrendsFile.getParentFile().mkdirs();
+      BufferedWriter bw = new BufferedWriter(new FileWriter(
+          clientSpeedTrendsFile));
+      bw.write("date,lines,complete,p0,p10,p20,p25,p30,p40,p50,p60,p70,"
+          + "p75,p80,p90,p100\n");
+      for (File statusFile : new File("status").listFiles()) {
+        if (!statusFile.getName().startsWith("20")) {
+          continue;
+        }
+        BufferedReader br = new BufferedReader(new FileReader(
+            statusFile));
+        String line;
+        Set<String> statsLines = new HashSet<String>();
+        List<Integer> completes = new ArrayList<Integer>();
+        while ((line = br.readLine()) != null) {
+          int complete = Integer.parseInt(line.split(",")[2]);
+          completes.add(complete);
+          statsLines.add(line);
+        }
+        br.close();
+        Collections.sort(completes);
+        int completeP50 = completes.get((completes.size() - 1) / 2);
+        List<Integer> downloadTimes = new ArrayList<Integer>();
+        List<Integer> percentiles = Arrays.asList(new Integer[] { 0, 10,
+            20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 100 });
+        int containedStatsLines = 0;
+        for (String statsLine : statsLines) {
+          String[] parts = statsLine.split(",");
+          int complete = Integer.parseInt(parts[2]);
+          if (complete < completeP50) {
+            continue;
+          }
+          containedStatsLines++;
+          int previousDownloadTime = Integer.parseInt(parts[3]);
+          downloadTimes.add(previousDownloadTime);
+          int previousPercentile = 0;
+          for (int i = 4; i < parts.length; i++) {
+            int currentDownloadTime = Integer.parseInt(parts[i]);
+            int currentPercentile = percentiles.get(i - 3);
+            int values = complete * (currentPercentile
+                - previousPercentile) / 100;
+            if (values > 0) {
+              int increment = (currentDownloadTime - previousDownloadTime)
+                  / values;
+              int downloadTime = previousDownloadTime;
+              for (int j = 0; j < values; j++) {
+                downloadTime += increment;
+                downloadTimes.add(downloadTime);
+              }
+            }
+            previousDownloadTime = currentDownloadTime;
+            previousPercentile = currentPercentile;
+          }
+        }
+        bw.write(statusFile.getName() + "," + containedStatsLines + ","
+            + downloadTimes.size());
+        Collections.sort(downloadTimes);
+        for (int percentile : percentiles) {
+          int downloadTime = downloadTimes.get(((percentile
+              * (downloadTimes.size() - 1)) / 100));
+          bw.write("," + downloadTime);
+        }
+        bw.write("\n");
+      }
+      bw.close();
+    }
+
+    System.out.println(new Date() + " Terminating.");
+  }
+}
+
diff --git a/task-3260/README b/task-3260/README
new file mode 100644
index 0000000..0843c71
--- /dev/null
+++ b/task-3260/README
@@ -0,0 +1,24 @@
+Extract client speed trends from download times of network status
+consensuses.
+==========================================================================
+
+Clone the metrics-lib repository, create the descriptor.jar file, and put
+it in this directory.
+
+Obtain the Apache Commons Codec 1.4 .jar file commons-codec-1.4.jar and
+put it in this directory.
+
+Download metrics tarballs containing extra-info descriptors. Extract the
+tarballs and put them in in/extra-infos/ in this directory.
+
+Compile and run the Java class:
+
+  $ ./run.sh
+
+In order to re-run parts of the analysis, delete files in status/ or the
+results file in out/.
+
+Draw graphs using R and ggplot2:
+
+  $ R --slave -f client-speed-trends.R
+
diff --git a/task-3260/client-speed-trends.R b/task-3260/client-speed-trends.R
new file mode 100644
index 0000000..8f389e0
--- /dev/null
+++ b/task-3260/client-speed-trends.R
@@ -0,0 +1,30 @@
+library(ggplot2)
+c <- read.csv("out/client-speed-trends.csv", stringsAsFactors = FALSE)
+c <- c[as.Date(c$date) < max(as.Date(c$date)) - 1, ]
+
+ggplot(c, aes(x = as.Date(date), y = lines * 2)) +
+geom_line() +
+scale_y_continuous(name = "Directory mirrors reporting statistics\n",
+  limits = c(0, max(c$lines) * 2))
+
+ggplot(c, aes(x = as.Date(date), y = complete)) +
+geom_line()
+
+d <- c
+d[d$lines < 5, c("lines", "p10", "p20", "p30", "p40", "p50")] <- NA
+d <- d[, c("date", "lines", "p10", "p20", "p30", "p40", "p50")]
+d <- melt(d, id.vars = c("date", "lines"))
+ggplot(d, aes(x = as.Date(date), y = value / 1024, colour = variable,
+  alpha = log(lines))) +
+geom_line() +
+scale_x_date(name = "", limits = as.Date(c(min(d[!is.na(d$value), "date"]),
+  max(d[!is.na(d$value), "date"]))), format = "%Y") +
+scale_y_continuous(name = "", limits = c(0, 200)) +
+  #limits = c(0, max(d$value, na.rm = TRUE) / 1024)) +
+scale_colour_hue(name = "Percentile",
+  breaks = c("p50", "p40", "p30", "p20", "p10"),
+  labels = seq(50, 10, -10), h.start = 60) +
+scale_alpha(legend = FALSE) +
+opts(title = "Estimated client bandwidth in KiB/s\n")
+ggsave("client-speed-trends.png", width = 8, height = 5, dpi = 72)
+
diff --git a/task-3260/run.sh b/task-3260/run.sh
new file mode 100755
index 0000000..63a2c98
--- /dev/null
+++ b/task-3260/run.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+javac -cp descriptor.jar:commons-codec-1.4.jar ExtractClientSpeedTrends.java && java -Xmx4000m -cp descriptor.jar:commons-codec-1.4.jar:. ExtractClientSpeedTrends
+
participants (1)
-
karsten@torproject.org