commit a257846fda8e01261817f9c286de43fa49c9f1c2
Author: tomb <tomb@torproject.org>
Date:   Wed Mar 2 10:39:21 2011 -0500

    Add scripts for filtering and visualizing Torperf data.

    Implements #2563.
---
 README               |   14 ++++-
 metrics/HOWTO        |  109 ++++++++++++++++++++++++++++++++++++
 metrics/filter.R     |  149 ++++++++++++++++++++++++++++++++++++++++++++++++++
 metrics/timematrix.R |   36 ++++++++++++
 4 files changed, 305 insertions(+), 3 deletions(-)
diff --git a/README b/README
index c4769d4..48b9617 100644
--- a/README
+++ b/README
@@ -8,9 +8,9 @@ Contents
                         via SOCKS 4a/5 and outputs timing information
 util.c: Utility functions for trivsocks-client.c
 util.h: Utility function declarations for trivsocks-client.c
- 
+
 Makefile: Builds and tests trivsocks-client
- 
+
 [run_test.py: Script to automate running of trivsocks-client -- defect]
 [plot_results.R: Plot the results from run_test.py -- defect]
@@ -27,5 +27,13 @@ Contents
                         performance data and path data
 LICENSE: The Tor license (3-clause BSD)
-README: This file
+README: This file
+
+Subdirectory /metrics
+------------ --------
+
+A set of utilities for filtering and graphing Tor performance data.
+ filter.R: filters torperf data and prepares it for graphing
+ timematrix.R: graphs torperf data for interpretation and visualization
+ HOWTO: documentation and examples
diff --git a/metrics/HOWTO b/metrics/HOWTO
new file mode 100644
index 0000000..ce6f7eb
--- /dev/null
+++ b/metrics/HOWTO
@@ -0,0 +1,109 @@
+HOWTO -- How to generate nifty graphs of tor performance
+
+Welcome traveler! You have reached the howto for some tor performance
+and metrics stuff. You will find here some techniques and scripts
+developed during several tasks including:
+#1919; in which we examine torperfs with fixed entry guards
+#2543; in which we create graphs of #1919 data
+#2563; in which we generalize techniques from #2543 for the future
+
+The remainder of this HOWTO will walk you through what you need to do
+to use the generalized techniques to generate graphs from performance
+data. We will use #2543 as an example, because it is from this
+example that the generalized technique was derived. This is intended
+to be a living document. If something is unclear, or if you wish to
+request a feature, please open a ticket:
+https://trac.torproject.org/projects/tor/newticket
+
+As far as I know, this document was written by Karsten, Mike Perry,
+and Tom Benjamin. If you are also an author of this document, please
+add yourself to this list.
+
+Step 1: Download Torperf request files
+--------------------------------------
+
+The 15 Torperf request files are available here:
+
+  https://metrics.torproject.org/data.html#performance
+
+The wget commands to download all of them are:
+
+  wget https://metrics.torproject.org/data/torperf-50kb.data
+  wget https://metrics.torproject.org/data/torperf-1mb.data
+  wget https://metrics.torproject.org/data/torperf-5mb.data
+  wget https://metrics.torproject.org/data/torperffastratio-50kb.data
+  wget https://metrics.torproject.org/data/torperffastratio-1mb.data
+  wget https://metrics.torproject.org/data/torperffastratio-5mb.data
+  wget https://metrics.torproject.org/data/torperffast-50kb.data
+  wget https://metrics.torproject.org/data/torperffast-1mb.data
+  wget https://metrics.torproject.org/data/torperffast-5mb.data
+  wget https://metrics.torproject.org/data/torperfslow-50kb.data
+  wget https://metrics.torproject.org/data/torperfslow-1mb.data
+  wget https://metrics.torproject.org/data/torperfslow-5mb.data
+  wget https://metrics.torproject.org/data/torperfslowratio-50kb.data
+  wget https://metrics.torproject.org/data/torperfslowratio-1mb.data
+  wget https://metrics.torproject.org/data/torperfslowratio-5mb.data
+
+Note that the torperf-*.data files are quite big already (25M+).
+
+
+Step 2: Install R and ggplot2
+-----------------------------
+
+Install R 2.8 or higher.
+
+Run R as a normal user and install ggplot2, then quit R, start R again,
+and try to load ggplot2:
+
+  $ R
+  > install.packages("ggplot2")
+  > q()   # No need to save the workspace image, ever.
+  $ R
+  > library(ggplot2)
+  > q()
+
+
+Step 3: Filter the data
+-----------------------
+
+Before actually graphing the Torperf data, we should filter it to avoid
+reading 29M of data for each graph. filter.R is a script that
+accomplishes this task, writing its output to filtered.csv.
+It is used as follows:
+
+1) Decide which files you are interested in. If you only want graphs
+based on the fast guard nodes, you only need to crunch those files.
+
+2) Decide what date range you are interested in. The default is to
+include all data from 2011-02-01 until 2099-12-31, by which time I
+expect this script may be obsolete.
+
+usage: R --slave -f filter.R --args [-start=DATE] [-end=DATE] FILENAME(S)
+
+filename must be of the form guardname-basesizeSUFFIX.data
+where SUFFIX is one of kb, mb, gb, tb
+  eg: R --slave -f filter.R --args -start=2011-02-01 -end=2099-12-31 *.data
+  eg: R --slave -f filter.R --args torperf-50kb.data
+
+So, to filter all data from #1919 you would execute:
+  $ R --slave -f filter.R --args *.data
+
+The script may take some time to run if the data files are large.
+
+
+Step 4: Visualize the data
+--------------------------
+
+Let's start with plotting a matrix of completion time graphs for every
+file size and guard selection.
+
+  $ R --slave -f timematrix.R
+
+This execution may take around 15 seconds.
+
+
+Step 5: Find a more useful visualization of the data
+----------------------------------------------------
+
+... TODO ...
+
diff --git a/metrics/filter.R b/metrics/filter.R
new file mode 100644
index 0000000..f069856
--- /dev/null
+++ b/metrics/filter.R
@@ -0,0 +1,149 @@
+## A new and "improved" genericised version of the old filter script
+## This version was created for task 2563
+## See HOWTO to put this in context
+##
+## usage: R -f filter.R --args [-start=DATE] [-end=DATE] FILENAME(S)
+## filename must be of the form guardname-basesizeSUFFIX.data
+## where SUFFIX is one of kb, mb, gb, tb
+##
+## eg: R -f filter.R --args -start=2011-02-01 -end=2099-12-31 *.data
+## eg: R -f filter.R --args torperf-50kb.data
+##
+## This R script reads in Torperf files as specified on the command line
+## and writes a filtered version to filtered.csv for later processing.
+
+FilterMain <- function(ARGV) {
+  kDebug <- FALSE   # set TRUE for debugging output
+  kVersion <- 0.3
+  if (kDebug) { cat("filter.R version ", kVersion, "\n\n") }
+  files <- NULL   # files is a list of torperfFiles as defined below
+  setClass("torperfFile",
+    representation(
+      filename = "character",
+      guardLabel = "character",
+      filesizeLabel = "character",
+      filesize = "numeric"
+    )
+  )
+
+  ## default values
+  ## cutoff dates for observations
+  start <- as.POSIXct("2011-02-01", origin = "1970-01-01")
+  end <- as.POSIXct("2099-12-31", origin = "1970-01-01")
+
+  ## process command line arguments
+  args <- unlist(strsplit(ARGV, " "))
+
+  ## there are better ways to process command line args, but this works for me :-)
+  for (arg in args) {
+    if (kDebug) { cat('arg: ', arg, "\n") }
+    ## if start date specified
+    if (length(splitArgL <- unlist(strsplit(arg, "-start="))) == 2) {
+      if (kDebug) { cat('Starting from ', splitArgL[2], '\n') }
+      start <- as.POSIXct(splitArgL[2], origin = "1970-01-01")
+      next
+    }
+    ## if end date specified
+    if (length(splitArgL <- unlist(strsplit(arg, "-end="))) == 2) {
+      if (kDebug) { cat('Ending at ', splitArgL[2], '\n') }
+      end <- as.POSIXct(splitArgL[2], origin = "1970-01-01")
+      next
+    }
+    ## if the argument is -start= or -end= we will not reach this line
+    ## now, if it isn't a parameter add it to the file list
+    ## parse filename for metadata...
+    ## examples:
+    ##   "torperf-50kb.data" should result in
+    ##     filename = "torperf-50kb.data"
+    ##     guardLabel = "torperf"
+    ##     filesizeLabel = "50kb"
+    ##     filesize = 50 * 1024
+    my.file <- new("torperfFile", filename = arg)
+
+    ## get base filename (strip out leading parts of filename such as dirname)
+    baseFilename <- basename(my.file@filename)
+    parseFileStr <- unlist(strsplit(baseFilename, "-"))   ## split the two parts of the filename string
+    if (length(parseFileStr) != 2) {
+      cat("error: filenames must be of the form guard-filesize.data, you said \"", baseFilename, "\"\n")
+      quit("no", 1)
+    }
+    my.file@guardLabel <- parseFileStr[1]
+    cdr <- parseFileStr[2]
+    parseFilesize <- unlist(strsplit(cdr, "\\."))
+    if (length(parseFilesize) != 2) {
+      cat("error: tail of filename must be filesize.data, you said \"", cdr, "\"\n")
+      quit("no", 1)
+    }
+    my.file@filesizeLabel <- tolower(parseFilesize[1])   ## smash case to make our life easier
+
+    fileBaseSize <- as.integer(unlist(strsplit(my.file@filesizeLabel, "[a-z]"))[1])
+    fileSizeMultiplierStr <- unlist(strsplit(my.file@filesizeLabel, '[0-9]'))
+    fileSizeMultiplierStr <- fileSizeMultiplierStr[length(fileSizeMultiplierStr)]
+    fileSizeMultiplier <- 1   ## assume no suffix
+    if (fileSizeMultiplierStr == "kb") { fileSizeMultiplier <- 1024 }
+    if (fileSizeMultiplierStr == "mb") { fileSizeMultiplier <- 1024 * 1024 }
+    if (fileSizeMultiplierStr == "gb") { fileSizeMultiplier <- 1024 * 1024 * 1024 }
+    ## yeah right, like we are really pushing TB of data
+    if (fileSizeMultiplierStr == "tb") { fileSizeMultiplier <- 1024 * 1024 * 1024 * 1024 }
+    my.file@filesize <- fileBaseSize * fileSizeMultiplier
+
+    if (kDebug) {
+      cat("i will read file: ", my.file@filename, ' ',
+          my.file@guardLabel, ' ',
+          my.file@filesizeLabel, ' ',
+          my.file@filesize, "\n")
+    }
+
+    files <- c(files, my.file)
+  }
+
+  ## sanity check arguments
+  if (start >= end) {
+    cat("error: start date must be before end date\n");
+    quit("no", 1)
+  }
+  if (length(files) == 0) {
+    cat("error: input files must be specified as arguments\n")
+    quit("no", 1)   ## terminate with non-zero errlev
+  }
+
+  if (kDebug) {
+    cat("filtering from ", as.character.POSIXt(start), " to ",
+        as.character.POSIXt(end), "\n")
+  }
+
+  ## Turn a given Torperf file into a data frame with the information we care
+  ## about.
+  read <- function(filename, guards, filesize, bytes) {
+    x <- read.table(filename)
+    x <- x[as.POSIXct(x$V1, origin = "1970-01-01") >= start &
+           as.POSIXct(x$V1, origin = "1970-01-01") <= end, ]
+    if (length(x$V1) == 0)
+      NULL
+    else
+      data.frame(
+        started = as.POSIXct(x$V1, origin = "1970-01-01"),
+        timeout = x$V17 == 0,
+        failure = x$V17 > 0 & x$V20 < bytes,
+        completemillis = ifelse(x$V17 > 0 & x$V20 >= bytes,
+                                round((x$V17 * 1000 + x$V18 / 1000) -
+                                      (x$V1 * 1000 + x$V19 / 1000), 0), NA),
+        guards = guards,
+        filesize = filesize)
+  }
+
+  ## Read in files and bind them to a single data frame.
+  filtered <- NULL
+  for (file in files) {
+    if (kDebug) { cat('Processing ', file@filename, "...\n") }
+    filtered <- rbind(filtered,
+      read(file@filename, file@guardLabel, file@filesizeLabel, file@filesize)
+    )
+  }
+
+  # Write data frame to a csv file for later processing.
+  write.csv(filtered, "filtered.csv", quote = FALSE, row.names = FALSE)
+
+}
+
+FilterMain(commandArgs(TRUE))
diff --git a/metrics/timematrix.R b/metrics/timematrix.R
new file mode 100644
index 0000000..ec01a25
--- /dev/null
+++ b/metrics/timematrix.R
@@ -0,0 +1,36 @@
+# Load ggplot library without printing out stupid warnings.
+options(warn = -1)
+suppressPackageStartupMessages(library("ggplot2"))
+
+# Read in filtered data.
+data <- read.csv("filtered.csv", stringsAsFactors = FALSE)
+
+# Remove NA's
+data <- na.omit(data)
+
+# Remove "outliers"
+data <- data[(data$filesize == "50kb" & data$completemillis < 60000) |
+             (data$filesize == "1mb" & data$completemillis < 120000) |
+             (data$filesize == "5mb" & data$completemillis < 300000), ]
+
+# Plot a matrix of scatter plots; the first step is to define which data
+# we want to plot (here: data) and what to put on x and y axis.
+ggplot(data, aes(x = as.POSIXct(started), y = completemillis / 1000)) +
+
+# Draw a point for every observation, but with an alpha value of 1/10 to
+# reduce overplotting
+geom_point(alpha = 1/10) +
+
+# Draw a matrix of these graphs with different filesizes and different
+# guards.
+facet_grid(filesize ~ guards, scales = "free_y") +
+
+# Rename y axis.
+scale_y_continuous(name = "Completion time in seconds") +
+
+# Rename x axis.
+scale_x_datetime(name = "Starting time")
+
+# Save the result to a large PNG file.
+ggsave("timematrix.png", width = 10, height = 10, dpi = 150)
+
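
A quick way to sanity-check Step 3 of the HOWTO is to summarize filtered.csv
directly. The sketch below is not part of the patch (the name check-filtered.R
is made up); it only uses the columns that filter.R writes to filtered.csv
(started, timeout, failure, completemillis, guards, filesize) and prints
timeout and failure rates per guard setting and file size. Run it the same way
as the other scripts, e.g. R --slave -f check-filtered.R

  ## check-filtered.R -- illustrative only, not part of this commit.
  ## Assumes filter.R has already written filtered.csv to the working directory.
  filtered <- read.csv("filtered.csv", stringsAsFactors = FALSE)
  ## timeout and failure are read back as logicals, so their means are rates.
  rates <- aggregate(cbind(timeout, failure) ~ guards + filesize,
                     data = filtered, FUN = mean)
  print(rates)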
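
The "outlier" cut-offs hard-coded in timematrix.R (60 seconds for 50kb, 120
seconds for 1mb, 300 seconds for 5mb downloads) are easy to check before
trusting a graph. Another made-up sketch, under the same assumption that
filtered.csv is present and only contains those three file size labels:

  ## cutoffs.R -- illustrative only, not part of this commit.
  ## Report how many completed requests the timematrix.R cut-offs would drop.
  data <- na.omit(read.csv("filtered.csv", stringsAsFactors = FALSE))
  limits <- c("50kb" = 60000, "1mb" = 120000, "5mb" = 300000)
  kept <- data$completemillis < limits[data$filesize]
  cat("keeping", sum(kept), "of", nrow(data), "completed requests;",
      "dropping", sum(!kept), "as outliers\n")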
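
Step 5 of the HOWTO is still TODO. One possible direction, sketched here
purely as an illustration and not as the visualization the ticket will settle
on, is a matrix of box plots comparing completion times across guard
selections rather than over time. It makes the same assumptions as
timematrix.R (filtered.csv exists, ggplot2 is installed) and is run the same
way; the file name boxmatrix.R is made up:

  ## boxmatrix.R -- illustrative sketch for Step 5, not part of this commit.
  options(warn = -1)
  suppressPackageStartupMessages(library("ggplot2"))

  ## Same input and cleanup as timematrix.R.
  data <- na.omit(read.csv("filtered.csv", stringsAsFactors = FALSE))

  ## One box of completion times per guard selection, one panel per file size.
  ggplot(data, aes(x = guards, y = completemillis / 1000)) +
  geom_boxplot() +
  facet_wrap(~ filesize, scales = "free_y") +
  scale_x_discrete(name = "Guard selection") +
  scale_y_continuous(name = "Completion time in seconds")

  ## Save the result, as timematrix.R does.
  ggsave("boxmatrix.png", width = 10, height = 5, dpi = 150)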