[tor-commits] [tech-reports/master] Add Virgil's Tor growth report.

karsten at torproject.org
Sun Dec 28 08:03:15 UTC 2014


commit 9495a0361e2dbd603038fe0ac2bfc29db7e9b2e5
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Dec 22 11:04:12 2014 +0100

    Add Virgil's Tor growth report.
---
 2014/tor-growth/.gitignore                         |    3 +
 .../figs/5a--normalized-torperf-small.png          |  Bin 0 -> 130729 bytes
 .../figs/NUR-predicts-norm-torperf-large.png       |  Bin 0 -> 182833 bytes
 .../figs/NUR-predicts-norm-torperf-small.png       |  Bin 0 -> 183043 bytes
 2014/tor-growth/figs/appendix--NUR-over-time.png   |  Bin 0 -> 109859 bytes
 2014/tor-growth/figs/fig1-relays.png               |  Bin 0 -> 91418 bytes
 2014/tor-growth/figs/fig2-bw.png                   |  Bin 0 -> 91863 bytes
 2014/tor-growth/figs/fig3-mean-bw.png              |  Bin 0 -> 109277 bytes
 2014/tor-growth/figs/fig4--torperf.png             |  Bin 0 -> 213045 bytes
 ...fig6--NUR-predicts-normalized-torperf_large.png |  Bin 0 -> 147906 bytes
 .../figs/fig6--NUR-predicts-normtorperf_small.png  |  Bin 0 -> 149461 bytes
 2014/tor-growth/figs/non-tor-bw.png                |  Bin 0 -> 108303 bytes
 2014/tor-growth/scripts/NUR-predicts-normtorperf.R |  211 +++++++++++++++++
 2014/tor-growth/scripts/NUR.R                      |  102 ++++++++
 2014/tor-growth/scripts/bandwidth-per-relay.R      |   95 ++++++++
 2014/tor-growth/scripts/bandwidth.R                |   90 +++++++
 2014/tor-growth/scripts/doubling-table.R           |  148 ++++++++++++
 2014/tor-growth/scripts/non-tor-bandwidth.R        |   70 ++++++
 2014/tor-growth/scripts/relays.R                   |   97 ++++++++
 2014/tor-growth/scripts/torperf.R                  |  136 +++++++++++
 2014/tor-growth/tor-growth.tex                     |  245 ++++++++++++++++++++
 2014/tor-growth/tortechrep.cls                     |    1 +
 22 files changed, 1198 insertions(+)

diff --git a/2014/tor-growth/.gitignore b/2014/tor-growth/.gitignore
new file mode 100644
index 0000000..e4ba2a9
--- /dev/null
+++ b/2014/tor-growth/.gitignore
@@ -0,0 +1,3 @@
+tor-growth.pdf
+tor-growth-2014-10-04.pdf
+
diff --git a/2014/tor-growth/figs/5a--normalized-torperf-small.png b/2014/tor-growth/figs/5a--normalized-torperf-small.png
new file mode 100644
index 0000000..b13592b
Binary files /dev/null and b/2014/tor-growth/figs/5a--normalized-torperf-small.png differ
diff --git a/2014/tor-growth/figs/NUR-predicts-norm-torperf-large.png b/2014/tor-growth/figs/NUR-predicts-norm-torperf-large.png
new file mode 100644
index 0000000..4798a36
Binary files /dev/null and b/2014/tor-growth/figs/NUR-predicts-norm-torperf-large.png differ
diff --git a/2014/tor-growth/figs/NUR-predicts-norm-torperf-small.png b/2014/tor-growth/figs/NUR-predicts-norm-torperf-small.png
new file mode 100644
index 0000000..2b08b3b
Binary files /dev/null and b/2014/tor-growth/figs/NUR-predicts-norm-torperf-small.png differ
diff --git a/2014/tor-growth/figs/appendix--NUR-over-time.png b/2014/tor-growth/figs/appendix--NUR-over-time.png
new file mode 100644
index 0000000..6badfc6
Binary files /dev/null and b/2014/tor-growth/figs/appendix--NUR-over-time.png differ
diff --git a/2014/tor-growth/figs/fig1-relays.png b/2014/tor-growth/figs/fig1-relays.png
new file mode 100644
index 0000000..16d4a29
Binary files /dev/null and b/2014/tor-growth/figs/fig1-relays.png differ
diff --git a/2014/tor-growth/figs/fig2-bw.png b/2014/tor-growth/figs/fig2-bw.png
new file mode 100644
index 0000000..65f2456
Binary files /dev/null and b/2014/tor-growth/figs/fig2-bw.png differ
diff --git a/2014/tor-growth/figs/fig3-mean-bw.png b/2014/tor-growth/figs/fig3-mean-bw.png
new file mode 100644
index 0000000..5da852b
Binary files /dev/null and b/2014/tor-growth/figs/fig3-mean-bw.png differ
diff --git a/2014/tor-growth/figs/fig4--torperf.png b/2014/tor-growth/figs/fig4--torperf.png
new file mode 100644
index 0000000..ba32cd4
Binary files /dev/null and b/2014/tor-growth/figs/fig4--torperf.png differ
diff --git a/2014/tor-growth/figs/fig6--NUR-predicts-normalized-torperf_large.png b/2014/tor-growth/figs/fig6--NUR-predicts-normalized-torperf_large.png
new file mode 100644
index 0000000..5983fe2
Binary files /dev/null and b/2014/tor-growth/figs/fig6--NUR-predicts-normalized-torperf_large.png differ
diff --git a/2014/tor-growth/figs/fig6--NUR-predicts-normtorperf_small.png b/2014/tor-growth/figs/fig6--NUR-predicts-normtorperf_small.png
new file mode 100644
index 0000000..4036d05
Binary files /dev/null and b/2014/tor-growth/figs/fig6--NUR-predicts-normtorperf_small.png differ
diff --git a/2014/tor-growth/figs/non-tor-bw.png b/2014/tor-growth/figs/non-tor-bw.png
new file mode 100644
index 0000000..acdb646
Binary files /dev/null and b/2014/tor-growth/figs/non-tor-bw.png differ
diff --git a/2014/tor-growth/scripts/NUR-predicts-normtorperf.R b/2014/tor-growth/scripts/NUR-predicts-normtorperf.R
new file mode 100644
index 0000000..ca01f45
--- /dev/null
+++ b/2014/tor-growth/scripts/NUR-predicts-normtorperf.R
@@ -0,0 +1,211 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+#library(matlab) # for matlab function names
+#library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+#library(lmtest) # for testing linear models
+
+
+library(plyr) # for renaming columns
+source("colortitles.R")
+#######################################################
+
+process_torperf_rawdata <- function( filename, filesize_to_consider=5242880 )
+{
+  
+  # Import data from TorPerf
+  Dtp <- read.csv( filename )[c('date','size','source','q1','md','q3')]
+  Dtp <- rename(Dtp, c("size"="filesize") )
+  
+  print(unique(Dtp$filesize))
+  # only use the aggregated Tor data for downloading a 5 MiB file
+  Dtp <- subset( Dtp, source=='' & filesize==filesize_to_consider )
+  
+  #print( tail(Dtp) )
+  
+  # drop the source and filesize column
+  #Dtp <- Dtp[ , -which(names(Dtp) %in% c('source','filesize'))]
+  Dtp <- Dtp[ , -which(names(Dtp) %in% c('source'))]
+  
+  # rename the q1, md, and q3 for TIME
+  Dtp <- rename(Dtp, c("q1"="time_q1","md"="time_md","q3"="time_q3") )
+  
+  # convert time from MILLISECONDS -> SECONDS
+  Dtp$time_q1 <- Dtp$time_q1/1000
+  Dtp$time_md <- Dtp$time_md/1000
+  Dtp$time_q3 <- Dtp$time_q3/1000
+  
+  
+  # now create the bw_q1, bw_md, bw_q3 in: KiB/s
+  Dtp[c("bw_q1","bw_md","bw_q3")] <- c(NA,NA,NA)
+  
+  # Rewrite q1, md, and q3 to be in bandwidth (KiB/s)
+  Dtp$bw_q1 <- (filesize_to_consider / 1024) / Dtp$time_q1;
+  Dtp$bw_md <- (filesize_to_consider / 1024) / Dtp$time_md;
+  Dtp$bw_q3 <- (filesize_to_consider / 1024) / Dtp$time_q3;
+  
+  return(Dtp)
+}
+
+
+composite_nontor_bandwidth <- function( filename )
+{
+  D_netindex <- read.csv( filename )[c('date','country_code','download_kbps')]
+  
+  D_netindex$download_kbps <- D_netindex$download_kbps * (1000 / 1024)
+  
+  
+  # make a single download_rate averaging across the countries: US, DE, RU.
+  D_US = subset( D_netindex, country_code=='US' )
+  D_DE = subset( D_netindex, country_code=='DE' )
+  D_RU = subset( D_netindex, country_code=='RU' )
+  
+  # merge the US, DE, and RU bandwidths
+  D_temp <- merge( D_US, D_DE, by='date' )
+  D_ni <- merge( D_temp, D_RU, by='date' )
+  
+  # drop the country codes
+  D_ni <- D_ni[ , -which(names(D_ni) %in% c('country_code.x','country_code.y','country_code'))]
+  
+  # average the download KiB/s entries into one
+  D_ni$avr_download_KBps <- NA
+  D_ni$avr_download_KBps <- (D_ni$download_kbps.x + D_ni$download_kbps.y + D_ni$download_kbps) / 3.0
+  
+  # drop the country-specific download rates
+  D_ni <- D_ni[ , -which(names(D_ni) %in% c('download_kbps.x','download_kbps.y','download_kbps'))]
+  
+  D_ni <- rename(D_ni, c("avr_download_KBps"="download_kbps") )
+  
+  return( D_ni )
+  
+}
+
+# read in the Tor bandwidth data
+#################################################################
+Dbw <- read.csv('bandwidth-clean.csv')[c('day','advbw','date','bwread')]
+
+# convert units from B/s to KiB/s
+Dbw$bwread <- Dbw$bwread / 1024
+Dbw$advbw <- Dbw$advbw / 1024
+
+
+## plot the clients and advertised bandwidth with time
+#plot(day, log2(clients), pch=20, ylab='Clients', xlab='Year', xaxt='n')
+
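+# "congestion" is the Network Utilization Ratio (NUR): bandwidth read / advertised bandwidth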
+Dbw[c("congestion")] <- c(NA)
+Dbw$congestion <- Dbw$bwread / Dbw$advbw
+
+# remove all instances of Dbw with congestion==NA
+Dbw <- subset(Dbw, ! is.na(congestion) )
+Dbw <- subset(Dbw, day > 917 ) 
+
+# remove some outliers
+Dbw <- Dbw[-c(318,367,2),]
+
+
+# read in the non-Tor bandwidth data
+#################################################################
+D_netindex <- composite_nontor_bandwidth('country_daily_speeds.csv')
+
+# Pull in the torperf data for downloading a 5 MiB file
+#################################################################
+#Dtp <- process_torperf_rawdata('torperf-clean.csv',51200)
+Dtp <- process_torperf_rawdata('torperf-clean.csv',5242880)
+
+# now merge Dtp and D by date
+Dnorm <- merge(D_netindex,Dtp,by='date')
+
+# drop some columns we don't need
+Dnorm <- Dnorm[ , -which(names(Dnorm) %in% c('time_q1','time_md','time_q3','country_code','filesize','bw_q1','bw_q3','day'))]
+
+Dnorm$normalized_torperf <- NA
+Dnorm$normalized_torperf <- (Dnorm$bw_md / Dnorm$download_kbps) * 100
+
+# now merge Dnorm and Dbw into D
+D <- merge(Dnorm, Dbw, by='date' )
+
+D <- D[ , -which(names(D) %in% c('download_kbps','bw_md','advbw','bwread'))]
+
+#plot( D$day, D$normalized_torperf )
+#plot( D$congestion, D$normalized_torperf )
+
+m0 <- lm( D$normalized_torperf ~ D$day )
+m1 <- lm( D$normalized_torperf ~ D$congestion )
+m2 <- lm( D$normalized_torperf ~ D$day+D$congestion )
+
+
+#m3 <- lm( log2(D$normalized_torperf) ~ D$congestion+D$day )
+
+# break D into years and more/less affected by the botnet
+D2010 <- D[grepl("2010", D$date), ]
+D2011 <- D[grepl("2011", D$date), ]
+D2012 <- D[grepl("2012", D$date), ]
+D2013 <- D[grepl("2013", D$date), ]
+D2014 <- D[grepl("2014", D$date), ]
+
+
+#dim(D2012)
+par(las=1)
+
+#plot( D$day, D$normalized_torperf, ylab='Norm Torperf', xlab='day', cex=0.1 )
+#plot( D$congestion, D$normalized_torperf, ylab='Norm Torperf', xlab="Network's Read / Advertised Bandwidth", cex=0.2, xaxt='n', yaxt='n' )
+plot( D$congestion, D$normalized_torperf, ylab='Normalized Torperf', xlab="NUR", cex=0.2, xaxt='n', yaxt='n' )
+#abline( m1 )
+
+# plot as a function of advbw_per_client
+par(new = T)
+points(D2010$congestion, D2010$normalized_torperf, pch=20, cex=0.7, col='red' )
+points(D2011$congestion, D2011$normalized_torperf, pch=20, cex=0.7, col='orange' )
+points(D2012$congestion, D2012$normalized_torperf, pch=20, cex=0.7, col='green' )
+points(D2013$congestion, D2013$normalized_torperf, pch=20, cex=0.7, col='blue' )
+points(D2014$congestion, D2014$normalized_torperf, pch=20, cex=0.7, col='purple' )
+
+
+#title("Lower network utilization has little impact on Torperf (50 KiB)")
+#title("Lower network utilization implies faster Torperf (5 MiB)")
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c("2010","2011","2012", "2013", "2014")
+legend( "topright", legend=legend_texts, inset=0.01, pch=c(20,20), col=c('red','orange','green','blue','purple') ) 
+
+
+CongestLabels=c('','50%','','60%','','70%','')
+CongestLocations=c(.45,.50,.55,.60,.65,.70,.75)
+axis(1,at=CongestLocations,labels=CongestLabels )
+
+
+# normalized Torperf labels
+# For 50 KiB
+#TorperfLabels=c('.05%','.1%','.15%','.2%','.25%')
+#TorperfLocations=c(.05,.1,.15,.2,.25)
+#axis(2,at=TorperfLocations,labels=TorperfLabels )
+
+# For 5 MiB
+TorperfLabels=c('.5%','1%','1.5%','2%','2.5%')
+TorperfLocations=c(.5,1,1.5,2,2.5)
+axis(2,at=TorperfLocations,labels=TorperfLabels )
+
+
+
+
+#################################################################
+
+nfit <- lm( normalized_torperf ~ congestion, data=D )
+summary( nfit )
+first_x <- .45
+last_x <- .75
+
+# add the line segment for the predicted line
+segments( first_x, predict(nfit, data.frame(congestion=first_x)),
+          last_x, predict(nfit, data.frame(congestion=last_x)),
+          col="black", lty=2, lwd=3 )
+
+# Add a point highlighting the beginning of the line
+points( first_x, predict(nfit, data.frame(congestion=first_x)), col="black", pch=15, cex=1.3)
+
+
diff --git a/2014/tor-growth/scripts/NUR.R b/2014/tor-growth/scripts/NUR.R
new file mode 100644
index 0000000..68cd1a4
--- /dev/null
+++ b/2014/tor-growth/scripts/NUR.R
@@ -0,0 +1,102 @@
+  rm( list = ls() )
+  setwd('/Users/know/Desktop/tor analytics/')
+  library(car)    # for pretty plots
+  
+  library(plyr) # for renaming columns
+  source("colortitles.R")
+  #######################################################
+  
+  D <- read.csv('bandwidth-clean.csv')[c('day','advbw','bwread','date')]
+  #D <- rename(D, c("date"="day") )
+  
+  # convert from B/s to MiB/s
+  D$advbw <- D$advbw / 1048576.0
+  D$bwread <- D$bwread / 1048576.0
+  
+  ## Plot the "Congestion" --- read / advertised
+  ##################################################################
+  D[c("congestion")] <- NA
+  D$congestion <- D$bwread / D$advbw
+  
+  ####### Remove some outliers ####################################################
+  Dother <- subset(D, congestion <= 0.01 )
+  D <- subset(D, congestion > 0.01 )
+  # drop all points between days [1200,1310] with congestion BELOW 0.55
+  outliers <- subset(D, ( 1200 <= day & day <= 1310 & congestion <=0.55) | (2258 <= day & day <= 2320 & congestion <= 0.47) )
+  
+  D <- subset(D, !( 1200 <= day & day <= 1310 & congestion <=0.55) )
+  D <- subset(D, !(2258 <= day & day <= 2320 & congestion <= 0.47) )
+  #################################################################################
+  
+  ####### Put into groups ####################################################
+  cut_off1 <- 2173                    # delta=2173 is date 2013-10-08
+  cut_off2 <- 2413                    # delta=2413 is date 2014-06-05
+  g1 <- subset(D, day <= cut_off1 & congestion >= 0.5 )
+  g2 <- subset(D, cut_off1 < day & day <= cut_off2 )
+  g3 <- subset(D, cut_off2 < day )
+  #################################################################################
+  
+  par(las=1)
+  plot(D$day, D$congestion,
+       col='black', pch='.', cex=0.6, ylim=c(0.35,0.8), 
+       #xlab="Year", ylab="used bandwidth / capacity bandwidth", xaxt='n', yaxs="i")
+       xlab="Year", ylab="NUR", xaxt='n', yaxs="i")
+  
+  # plot the three groups
+  points( g1$day, g1$congestion, col='red', pch=20, cex=0.6 )
+  points( g2$day, g2$congestion, col='blue', pch=20, cex=0.6 )
+  points( g3$day, g3$congestion, col='green', pch=20, cex=0.6 )
+  
+  # plot the outliers
+  #points( outliers$day, outliers$congestion, col='black', pch=1, cex=0.6 )
+  
+  
+  ####### Set the pretty X-axis ############################
+  par(las=1)
+  YearLabels=seq(from=2008,to=2014,by=1)
+  YearLocations=c(66,432,797,1162,1527,1893,2258)
+  axis(1,at=YearLocations,labels=YearLabels )
+  ##########################################################
+  
+  
+  ## Plot the three best-fit lines
+  #################################################################
+  g1 <- subset(g1, day >= 1200)
+  fit_D <-  lm( congestion ~ day, data=D )
+  fit_g1 <- lm( congestion ~ day, data=g1 )
+  fit_g2 <- lm( congestion ~ day, data=g2 )
+  fit_g3 <- lm( congestion ~ day, data=g3 )
+  
+  
+  # add the line segment for the predicted line
+  segments( min(g1$day), predict(fit_g1, data.frame(day=min(g1$day))),
+            max(g1$day), predict(fit_g1, data.frame(day=max(g1$day))),
+            col="black", lty=1, lwd=3 )
+  
+  segments( min(g2$day), predict(fit_g2, data.frame(day=min(g2$day))),
+            max(g2$day), predict(fit_g2, data.frame(day=max(g2$day))),
+            col="black", lty=1, lwd=3 )
+  
+  segments( min(g3$day), predict(fit_g3, data.frame(day=min(g3$day))),
+            max(g3$day), predict(fit_g3, data.frame(day=max(g3$day))),
+            col="black", lty=1, lwd=3 )
+  
+  
+  # Add a point highlighting the beginning of the line
+  points( min(g1$day), predict(fit_g1, data.frame(day=min(g1$day))), col="black", pch=15, cex=1.3)
+  points( min(g2$day), predict(fit_g2, data.frame(day=min(g2$day))), col="black", pch=15, cex=1.3)
+  points( min(g3$day), predict(fit_g3, data.frame(day=min(g3$day))), col="black", pch=15, cex=1.3)
+  
+  
+  
+  
+  ## Set the Title and Legend
+  #################################################################
+  #title("Better: channel utilization in three distinct, flat stages")
+  legend_text = c('2010-04-30 to 2013-10-08', '2013-10-09 to 2014-06-05', '2014-06-06 to present')
+  legend( "bottomleft", legend=legend_text, inset=0.05, pch=c(20,20,20), col=c('red','blue','green') ) 
+  
+  
+  
+
+  
diff --git a/2014/tor-growth/scripts/bandwidth-per-relay.R b/2014/tor-growth/scripts/bandwidth-per-relay.R
new file mode 100644
index 0000000..426e7b3
--- /dev/null
+++ b/2014/tor-growth/scripts/bandwidth-per-relay.R
@@ -0,0 +1,95 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+library(matlab) # for matlab function names
+library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+library(lmtest) # for testing linear models
+library(calibrate)
+
+library(plyr) # for renaming columns
+source("colortitles.R")
+#######################################################
+
+## Read in the data into data.frame D with columns: day, advbw, bwread, relays.all
+#######################################################
+#Dstable <- read.csv('relays-stable.csv')[c('date','relays')]
+#Dfast <- read.csv('relays-fast.csv')[c('date','relays')]
+#Dtemp <- merge(Dstable, Dfast, by="date", suffixes=c('.stable','.fast') )
+
+Dbandwidth <- read.csv('bandwidth-clean.csv')[c('date','advbw','bwread')]
+Drelays <- read.csv('relays-total.csv')[c('date','relays')]
+D <- merge( Dbandwidth, Drelays, by='date' )
+D <- rename(D, c("date"="day","relays"="relays.all"))
+names(D)
+
+# convert units from B/s to MiB/s
+D$advbw <- D$advbw / 1048576.0
+D$bwread <- D$bwread / 1048576.0
+
+
+D[c("advbw_per_relay","bwread_per_relay")] <- c(NA,NA)
+D$advbw_per_relay <- D$advbw / D$relays.all
+D$bwread_per_relay <- D$bwread / D$relays.all
+
+plot(D$day, log2(D$advbw_per_relay), ylab='Bandwidth (MiB/s) per relay', xlab='Year', yaxt='n', pch=20, cex=0.6, xaxt='n', col='blue' )
+points( D$day, log2(D$bwread_per_relay), pch=20, cex=0.6, col='red' )
+
+
+####### Set the pretty X-axis ###################################
+YearLabels=seq(from=2008,to=2014,by=1)
+YearLocations=c(66,432,797,1162,1527,1893,2258)
+axis(1,at=YearLocations,labels=YearLabels )
+#################################################################
+
+####### Set the pretty Y-axis ###################################
+par(las=1)
+lab <- seq(from=-3,to=1,by=1)
+axis(2,at=lab, labels=c("⅛","¼","½","1","2") )
+#################################################################
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c(
+  expression(paste("Capacity (advertised) ", r^2, "=0.91")),
+  expression(paste("Used (read)                ", r^2, "=0.68"))  
+)
+
+
+legend( "topleft", legend=legend_texts, inset=0.05, pch=c(20,20), col=c('blue','red') ) 
+
+
+multiTitle(color="black","Average bandwidth per relay doubles every ", 
+           color="blue","1.8",
+           color="black",'-',
+           color="red","2.1",
+           color="black"," years")
+
+####### Plot the best-fit lines ############################
+
+# remove data before the 'read' metric started
+temp <- subset( D, !is.na(bwread) )
+first_day <- min(temp$day)
+FD <- D[ which(D$day >= first_day), ]
+
+# fit the Filtered Data to a linear model...
+fit_advbw <- lm( log2(advbw_per_relay) ~ day, data=FD )
+fit_bwread <- lm( log2(bwread_per_relay) ~ day, data=FD )
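+
+# doubling time in years for each series (the figures quoted in the title above)
+(1.0 / coef(fit_advbw)["day"]) / 365
+(1.0 / coef(fit_bwread)["day"]) / 365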
+
+
+# Add the best-fit lines
+segments( first_day, predict(fit_advbw, data.frame(day=first_day)),
+          max(FD$day), predict(fit_advbw, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+segments( first_day, predict(fit_bwread, data.frame(day=first_day)),
+          max(FD$day), predict(fit_bwread, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+
+# Add the black squares
+points( first_day, predict(fit_advbw, data.frame(day=min(FD$day))), col="black", pch=15, cex=1.3)
+points( first_day, predict(fit_bwread, data.frame(day=min(FD$day))), col="black", pch=15, cex=1.3)
+
+#summary( fit_all )
+#summary( fit_stable )
+
diff --git a/2014/tor-growth/scripts/bandwidth.R b/2014/tor-growth/scripts/bandwidth.R
new file mode 100644
index 0000000..cc2ac0a
--- /dev/null
+++ b/2014/tor-growth/scripts/bandwidth.R
@@ -0,0 +1,90 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+#library(matlab) # for matlab function names
+#library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+#library(lmtest) # for testing linear models
+#library(calibrate)
+
+library(plyr) # for renaming columns
+source("colortitles.R") # needed for multiTitle() below
+#######################################################
+
+D <- read.csv('bandwidth-clean.csv')[c('date','advbw','bwread','bwwrite')]
+D <- rename(D, c("date"="day") )
+
+# convert from B/s to MiB/s
+D$advbw <- D$advbw / 1048576.0
+D$bwread <- D$bwread / 1048576.0
+D$bwwrite <- D$bwwrite / 1048576.0
+
+Xs = D$day;
+Ys_advbw = D$advbw;
+Ys_bwread = D$bwread;
+Ys_bwwrite = D$bwwrite;
+
+# data
+plot(Xs, log2(Ys_advbw), xlab="Year", ylab="Bandwidth (MiB/s)", yaxt='n', xaxt='n', col='blue', pch=20, cex=0.6 )
+points(Xs, log2(Ys_bwread), pch=20, col='red', cex=0.6 )
+
+
+####### Set the pretty X-axis ############################
+YearLabels=seq(from=2008,to=2014,by=1)
+YearLocations=c(66,432,797,1162,1527,1893,2258)
+axis(1,at=YearLocations,labels=YearLabels )
+##########################################################
+
+####### Set the pretty Y-axis ############################
+par(las=1)
+lab <- seq(from=1,to=45,by=1)
+axis(2,at=lab,labels=parse(text=paste("2^", lab, sep="")) )
+##########################################################
+
+
+####### Plot the best-fit lines ############################
+temp <- subset( D, !is.na(bwread), c('day','bwread') )
+first_bwread_day <- min(temp$day)
+
+# remove data before year 2010
+#FD <- D[ which(D$day >= 797), ]
+D <- D[ which(D$day >= first_bwread_day), ]
+
+fit_advbw <- lm( log2(advbw) ~ day, data=D )
+fit_bwread <- lm( log2(bwread) ~ day, data=D )
+summary( fit_advbw )
+summary( fit_bwread )
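+
+# doubling time in months for each series (the figures quoted in the title below)
+(1.0 / coef(fit_advbw)["day"]) / 30
+(1.0 / coef(fit_bwread)["day"]) / 30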
+
+### we need this for the lines
+
+
+# Add the best-fit line (with black square) for the advertised bandwidth
+segments( min(D$day), predict(fit_advbw, data.frame(day=min(D$day))),
+          max(D$day), predict(fit_advbw, data.frame(day=max(D$day))),
+          col="black", lty=2, lwd=3 )
+points( min(D$day), predict(fit_advbw, data.frame(day=min(D$day))), col="black", pch=15, cex=1.3)
+
+# Add the best-fit line (with black square) for the read bandwidth
+segments( min(D$day), predict(fit_bwread, data.frame(day=min(D$day))),
+          max(D$day), predict(fit_bwread, data.frame(day=max(D$day))),
+          col="black", lty=2, lwd=3 )
+points( min(D$day), predict(fit_bwread, data.frame(day=min(D$day))), col="black", pch=15, cex=1.3)
+
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c(
+  expression(paste("Capacity (advertised) ", r^2, "=0.97")),
+  expression(paste("Used (read)                ", r^2, "=0.88"))  
+)
+
+legend( "topleft", legend=legend_texts, inset=0.05, pch=c(20,20), col=c('blue','red') ) 
+
+multiTitle(color="black","Tor relay bandwidth doubles every ", 
+           color="blue","13",
+           color="black",'-',
+           color="red","14",
+           color="black"," months")
+
+
diff --git a/2014/tor-growth/scripts/doubling-table.R b/2014/tor-growth/scripts/doubling-table.R
new file mode 100644
index 0000000..db64b53
--- /dev/null
+++ b/2014/tor-growth/scripts/doubling-table.R
@@ -0,0 +1,148 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(plyr)   # for rename(), used in process_torperf_rawdata()
+
+
+process_torperf_rawdata <- function( filename, filesize_to_consider=5242880 )
+{
+  
+  # Import data from TorPerf
+  Dtp <- read.csv( filename )[c('day','date','size','source','q1','md','q3')]
+  Dtp <- rename(Dtp, c("size"="filesize") )
+  
+  print(unique(Dtp$filesize))
+  # only use the aggregated Tor data for downloading a 5 MiB file
+  Dtp <- subset( Dtp, source=='' & filesize==filesize_to_consider )
+  
+  #print( tail(Dtp) )
+  
+  # drop the source and filesize column
+  #Dtp <- Dtp[ , -which(names(Dtp) %in% c('source','filesize'))]
+  Dtp <- Dtp[ , -which(names(Dtp) %in% c('source'))]
+  
+  # rename the q1, md, and q3 for TIME
+  Dtp <- rename(Dtp, c("q1"="time_q1","md"="time_md","q3"="time_q3") )
+  
+  # convert time from MILLISECONDS -> SECONDS
+  Dtp$time_q1 <- Dtp$time_q1/1000
+  Dtp$time_md <- Dtp$time_md/1000
+  Dtp$time_q3 <- Dtp$time_q3/1000
+  
+  
+  # now create the bw_q1, bw_md, bw_q3 in: KiB/s
+  Dtp[c("bw_q1","bw_md","bw_q3")] <- c(NA,NA,NA)
+  
+  
+  # Rewrite q1, md, and q3 to be in bandwidth (KiB/s)
+  Dtp$bw_q1 <- (filesize_to_consider / 1024) / Dtp$time_q1;
+  Dtp$bw_md <- (filesize_to_consider / 1024) / Dtp$time_md;
+  Dtp$bw_q3 <- (filesize_to_consider / 1024) / Dtp$time_q3;
+  
+  return(Dtp)
+}
+
+#####################################################################################
+# Number of relays
+#####################################################################################
+
+# read the daily relay counts; days with fewer than 938 relays are dropped below
+Dall <- read.csv('relays-total.csv')[c('date','relays')]
+Dall$day <- NA
+Dall$day <- 1:nrow(Dall)
+
+plot( Dall$day, log2(Dall$relays), pch=20, cex=0.6, col='blue' )
+
+Dall <- subset(Dall, relays >= 938 )
+points(Dall$day, log2(Dall$relays), pch=20, cex=0.6, col='red' )
+
+mm <- lm( log2(Dall$relays) ~ Dall$day  )
+
+rows_to_remove <- abs(resid(mm)) > 0.38
+Dall <- Dall[ !rows_to_remove, ]
+
+points(Dall$day, log2(Dall$relays), pch=20, cex=0.6, col='purple' )
+
+mm <- lm( log2(Dall$relays) ~ Dall$day  )
+
+# the doubling time (in years) is obtained from the slope term:
+(1.0 / coef(mm)[2]) / 365
+
+# which comes out to 2.99 years
+
+#####################################################################################
+# total network bandwidth
+#####################################################################################
+#rm( list = ls() )   # (would delete process_torperf_rawdata(), which is still needed below)
+D <- read.csv('bandwidth-clean.csv')[c('date','advbw','bwread')]
+# convert from B/s to MiB/s
+D$advbw <- D$advbw / 1048576.0
+#D <- rename(D, c("date"="day") )
+D$day <- NA
+D$day <- 1:nrow(D)
+
+m <- lm( log2(D$advbw) ~ D$day )
+
+# now for the monthly doubling rate
+(1.0 / coef(m)[2]) / 30
+
+
+#####################################################################################
+# Absolute Torperf
+#####################################################################################
+#rm( list = ls() )
+D_SMALL <- process_torperf_rawdata('torperf-clean.csv', 51200)[c('date','day','bw_md')]
+D_BIG <- process_torperf_rawdata('torperf-clean.csv', 5242880)[c('date','day','bw_md')]
+
+
+D_SMALL <- subset( D_SMALL, day>=547 )
+D_BIG <- subset( D_BIG, day>=547 )
+
+
+mSMALL <- lm( log2(bw_md) ~ day, data=D_SMALL )
+mBIG <- lm( log2(bw_md) ~ day, data=D_BIG )
+
+(1.0 / coef(mSMALL)["day"]) / 30   # months per doubling, 50 KiB
+(1.0 / coef(mBIG)["day"]) / 30     # months per doubling, 5 MiB
+
+
+
+#####################################################################################
+# Normalized Torperf
+#####################################################################################
+
+# Read in the netindex data and average it
+#######################################################
+D_netindex <- read.csv('country_daily_speeds.csv')[c('date','country_code','download_kbps')]
+
+D_netindex$download_kbps <- D_netindex$download_kbps * (1000 / 1024)
+
+
+# make a single download_rate averaging across the countries: US, DE, RU.
+D_US = subset( D_netindex, country_code=='US' )
+D_DE = subset( D_netindex, country_code=='DE' )
+D_RU = subset( D_netindex, country_code=='RU' )
+
+# merge the US, DE, and RU bandwidths
+D_temp <- merge( D_US, D_DE, by='date' )
+D_ni <- merge( D_temp, D_RU, by='date' )
+
+# drop the country codes
+D_ni <- D_ni[ , -which(names(D_ni) %in% c('country_code.x','country_code.y','country_code'))]
+
+# average the download KiB/s entries into one
+D_ni$avr_download_KBps <- NA
+D_ni$avr_download_KBps <- (D_ni$download_kbps.x + D_ni$download_kbps.y + D_ni$download_kbps) / 3.0
+
+# drop the country-specific download rates
+D_ni <- D_ni[ , -which(names(D_ni) %in% c('download_kbps.x','download_kbps.y','download_kbps'))]
+
+
+# now merge D_ni and {D_SMALL,D_BIG} based on date
+Dnorm_SMALL <- merge( D_ni, D_SMALL, by='date' )
+Dnorm_BIG <- merge( D_ni, D_BIG, by='date' )
+
+Dnorm_SMALL$normalized_bw <- NA; Dnorm_SMALL$normalized_bw <- Dnorm_SMALL$bw_md / Dnorm_SMALL$avr_download_KBps
+Dnorm_BIG$normalized_bw <- NA; Dnorm_BIG$normalized_bw <- Dnorm_BIG$bw_md / Dnorm_BIG$avr_download_KBps
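+
+# fit the normalized series to estimate the normalized doubling rates directly
+mnSMALL <- lm( log2(normalized_bw) ~ day, data=Dnorm_SMALL )
+mnBIG <- lm( log2(normalized_bw) ~ day, data=Dnorm_BIG )
+(1.0 / coef(mnSMALL)["day"]) / 365   # years per doubling, 50 KiB
+(1.0 / coef(mnBIG)["day"]) / 365     # years per doubling, 5 MiB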
+
+
diff --git a/2014/tor-growth/scripts/non-tor-bandwidth.R b/2014/tor-growth/scripts/non-tor-bandwidth.R
new file mode 100644
index 0000000..c062f55
--- /dev/null
+++ b/2014/tor-growth/scripts/non-tor-bandwidth.R
@@ -0,0 +1,70 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/non-tor-bw/')
+
+
+library(plyr) # for renaming columns
+
+#######################################################
+D_netindex <- read.csv('country_daily_speeds.csv')[c('date','country_code','download_kbps')]
+
+D_netindex$download_kbps <- D_netindex$download_kbps * (1000 / 1024)
+
+
+
+D_US = subset( D_netindex, country_code=='US' )
+D_DE = subset( D_netindex, country_code=='DE' )
+D_RU = subset( D_netindex, country_code=='RU' )
+
+# merge the US, DE, and RU bandwidths
+D_temp <- merge( D_US, D_DE, by='date' )
+D_COMPOSITE <- merge( D_temp, D_RU, by='date' )
+
+# drop the country codes
+D_COMPOSITE <- D_COMPOSITE[ , -which(names(D_COMPOSITE) %in% c('country_code.x','country_code.y','country_code'))]
+
+# average the download KiB/s entries into one
+D_COMPOSITE$avr_download_KBps <- NA
+D_COMPOSITE$avr_download_KBps <- (D_COMPOSITE$download_kbps.x + D_COMPOSITE$download_kbps.y + D_COMPOSITE$download_kbps) / 3.0
+
+# drop the country-specific download rates
+D_COMPOSITE <- D_COMPOSITE[ , -which(names(D_COMPOSITE) %in% c('download_kbps.x','download_kbps.y','download_kbps'))]
+
+
+
+plot( 1:nrow(D_US), log2(D_US$download_kbps), yaxt='n', xaxt='n', col='red', cex=0.7, xlab="Year", ylab="Mean download bandwidth (KiB/s)", pch=20 )
+points( 1:nrow(D_RU), log2(D_RU$download_kbps), yaxt='n', xaxt='n', col='blue', cex=0.7, pch=20 )
+points( 1:nrow(D_DE), log2(D_DE$download_kbps), yaxt='n', xaxt='n', col='orange', cex=0.7, pch=20 )
+
+points( 1:nrow(D_COMPOSITE), log2(D_COMPOSITE$avr_download_KBps), col='black', cex=0.5, pch=20 )
+
+####### Set the pretty Y-axis ############################
+par(las=1)
+#lab <- seq(from=1,to=45,by=1)
+lab <- c(12,12.5,13,13.5,14,14.5)
+axis(2,at=lab,labels=parse(text=paste("2^", lab, sep="")) )
+##########################################################
+
+
+####### Set the pretty X-axis ###################################
+YearLabels=c('2009','2010','2011','2012','2013','2014')
+YearLocations=c(367, 732, 1097, 1462, 1828, 2193)
+axis(1,at=YearLocations,labels=YearLabels )
+#################################################################
+
+
+legend_texts = c("United Staes", "Germany", "Russia", "Composite")
+legend( "topleft", legend=legend_texts, inset=0.01, pch=c(20,20), col=c('red','orange','blue','black') ) 
+
+
+
+day <- 1:nrow(D_US)
+
+mm <- lm( log2(download_kbps) ~ day, data=D_US )
+1.0 / (mm$coefficients["day"] * 30)
+
+mm <- lm( log2(download_kbps) ~ day, data=D_DE )
+1.0 / (mm$coefficients["day"] * 30)
+
+mm <- lm( log2(download_kbps) ~ day, data=D_RU )
+1.0 / (mm$coefficients["day"] * 30)
diff --git a/2014/tor-growth/scripts/relays.R b/2014/tor-growth/scripts/relays.R
new file mode 100644
index 0000000..f95ba77
--- /dev/null
+++ b/2014/tor-growth/scripts/relays.R
@@ -0,0 +1,97 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+#library(matlab) # for matlab function names
+#library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+#library(lmtest) # for testing linear models
+#library(calibrate)
+
+library(plyr) # for renaming columns
+source("colortitles.R") # for multiTitle(), used below
+
+#######################################################
+
+## Read in the data into data.frame D with columns: day, relays.stable, relays.all
+#######################################################
+Dstable <- read.csv('relays-stable.csv')[c('date','relays')]
+#Dfast <- read.csv('relays-fast.csv')[c('date','relays')]
+#Dtemp <- merge(Dstable, Dfast, by="date", suffixes=c('.stable','.fast') )
+Dall <- read.csv('relays-total.csv')[c('date','relays')]
+D <- merge( Dstable, Dall, by='date', suffixes=c('.stable','.all') )
+D <- rename(D, c("date"="day"))
+names(D)
+
+plot(D$day, log2(D$relays.all), ylab='Number of Relays', xlab='Year', yaxt='n', pch=20, cex=0.6, xaxt='n', col='blue', ylim=c(8,13) ) 
+
+points( D$day, log2(D$relays.stable), pch=20, cex=0.6, col='purple' )
+
+
+#plot(1:nrow(Dall), log2(Dall$relays), ylab='Number of Relays', xlab='Year', yaxt='n', pch=20, cex=0.6, xaxt='n', col='blue', ylim=c(8,13) )  # exploratory duplicate; would overwrite the plot above
+
+
+####### Set the pretty X-axis ###################################
+YearLabels=seq(from=2008,to=2014,by=1)
+YearLocations=c(66,432,797,1162,1527,1893,2258)
+axis(1,at=YearLocations,labels=YearLabels )
+#################################################################
+
+####### Set the pretty Y-axis ###################################
+par(las=1)
+lab <- seq(from=1,to=45,by=1)
+axis(2,at=lab,labels=parse(text=paste("2^", lab, sep="")) )
+#################################################################
+
+
+
+
+
+
+## Set the Legend and Title
+##################################################################
+legend_texts = c(
+  expression(paste("All relays         ", r^2, "=0.96")),
+  expression(paste("Stable relays   ", r^2, "=0.93"))  
+)
+
+legend( "topleft", legend=legend_texts, inset=0.05, pch=c(20,20), col=c('blue','purple') ) 
+
+multiTitle(color="black","Number of Tor relays doubles every ", 
+           color="purple","2.1",
+           color="black",'-',
+           color="blue","2.6",
+           color="black"," years" )
+
+
+
+####### Plot the best-fit lines ############################
+
+# remove data before year 2010
+FD <- D[ which(D$day >= 750), ]
+
+# remove points with super-high-residuals from the fitted line
+fit_all <- lm( log2(relays.all) ~ day, data=FD )
+rows_to_remove <- abs(resid(fit_all)) > 0.38
+FD <- FD[ !rows_to_remove, ]
+
+
+# fit to a linear model
+fit_all <- lm( log2(relays.all) ~ day, data=FD )
+fit_stable <- lm( log2(relays.stable) ~ day, data=FD )
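+
+# doubling time in years for each series (the figures quoted in the title above)
+(1.0 / coef(fit_all)["day"]) / 365
+(1.0 / coef(fit_stable)["day"]) / 365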
+
+
+# Add the best-fit lines
+first_day <- min(FD$day)
+segments( first_day, predict(fit_all, data.frame(day=first_day)),
+          max(FD$day), predict(fit_all, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+segments( first_day, predict(fit_stable, data.frame(day=first_day)),
+          max(FD$day), predict(fit_stable, data.frame(day=max(FD$day))),
+          col="black", lty=2, lwd=3 )
+
+# Add the black squares
+points( first_day, predict(fit_all, data.frame(day=min(FD$day))), col="black", pch=15, cex=1.3)
+points( first_day, predict(fit_stable, data.frame(day=min(FD$day))), col="black", pch=15, cex=1.3)
+
+#summary( fit_all )
+#summary( fit_stable )
\ No newline at end of file
diff --git a/2014/tor-growth/scripts/torperf.R b/2014/tor-growth/scripts/torperf.R
new file mode 100644
index 0000000..eda9162
--- /dev/null
+++ b/2014/tor-growth/scripts/torperf.R
@@ -0,0 +1,136 @@
+rm( list = ls() )
+setwd('/Users/know/Desktop/tor analytics/')
+library(car)    # for pretty plots
+library(matlab) # for matlab function names
+library(data.table) # for data.table like data.frame
+#library(xtable) # for exporting to LaTeX
+#library(gdata)
+library(lmtest) # for testing linear models
+library(calibrate)
+
+library(plyr) # for renaming columns
+source("colortitles.R")
+#######################################################
+
+process_torperf_rawdata <- function( filename, filesize_to_consider=5242880 )
+{
+  
+  # Import data from TorPerf
+  Dtp <- read.csv( filename )[c('day','date','size','source','q1','md','q3')]
+  Dtp <- rename(Dtp, c("size"="filesize") )
+  
+  print(unique(Dtp$filesize))
+  # only use the aggregated Tor data for downloading a 5 MiB file
+  Dtp <- subset( Dtp, source=='' & filesize==filesize_to_consider )
+  
+  #print( tail(Dtp) )
+  
+  # drop the source and filesize column
+  #Dtp <- Dtp[ , -which(names(Dtp) %in% c('source','filesize'))]
+  Dtp <- Dtp[ , -which(names(Dtp) %in% c('source'))]
+  
+  # rename the q1, md, and q3 for TIME
+  Dtp <- rename(Dtp, c("q1"="time_q1","md"="time_md","q3"="time_q3") )
+  
+  # convert time from MILLISECONDS -> SECONDS
+  Dtp$time_q1 <- Dtp$time_q1/1000
+  Dtp$time_md <- Dtp$time_md/1000
+  Dtp$time_q3 <- Dtp$time_q3/1000
+  
+  
+  # now create the bw_q1, bw_md, bw_q3 in: KiB/s
+  Dtp[c("bw_q1","bw_md","bw_q3")] <- c(NA,NA,NA)
+  
+  
+  # Rewrite q1, md, and q3 to be in bandwidth (KiB/s)
+  Dtp$bw_q1 <- (filesize_to_consider / 1024) / Dtp$time_q1;
+  Dtp$bw_md <- (filesize_to_consider / 1024) / Dtp$time_md;
+  Dtp$bw_q3 <- (filesize_to_consider / 1024) / Dtp$time_q3;
+  
+  return(Dtp)
+}
+
+# get the two groups
+D_BIG <- process_torperf_rawdata('torperf-clean.csv', 5242880)
+D_MED <- process_torperf_rawdata('torperf-clean.csv', 1048576)
+D_SMALL <- process_torperf_rawdata('torperf-clean.csv', 51200)
+
+
+# DO ANALYSIS FROM HERE
+#######################################################
+
+plot(D_BIG$day, log2(D_BIG$bw_md), xlab='Year', ylab="Torperf bandwidth (KiB/s)", pch='', xaxt='n', yaxt='n', ylim=c(2.5,9) )
+#points(D_BIG$day, log2(D_BIG$bw_md), pch=20, col='blue', cex=0.4 )
+#points(D_MED$day, log2(D_MED$bw_md), pch=20, col='purple', cex=0.4 )
+#points(D_SMALL$day, log2(D_SMALL$bw_md), pch=20, col='red', cex=0.4 )
+
+
+points( smooth.spline(D_BIG$day, log2(D_BIG$bw_md)), pch=20, col='blue', cex=0.5 )
+points( smooth.spline(D_MED$day, log2(D_MED$bw_md)), pch=20, col='purple', cex=0.5 )
+points( smooth.spline(D_SMALL$day, log2(D_SMALL$bw_md)), pch=20, col='red', cex=0.5 )
+
+
+####### Set the pretty X-axis and Y-axis ############################
+YearLabels=seq(from=2010,to=2014,by=1)
+YearLocations=c(182,547,912,1278,1643)
+axis(1,at=YearLocations,labels=YearLabels )
+
+par(las=1)
+lab <- seq(from=-5,to=10,by=1)
+labels <- parse(text=paste("2^", lab, sep="") )
+axis(2,at=lab,labels=labels )
+
+
+
+
+####### Plot the best-fit lines ############################
+Dfit <- subset( D_BIG, day>=547 )
+first_fit_day <- min(Dfit$day)
+fit_BIG <- lm( log2(bw_md) ~ day, data=Dfit )
+
+
+# Add the best-fit line (with black square) for the advertised bandwidth
+segments( min(Dfit$day), predict(fit_BIG, data.frame(day=min(Dfit$day))),
+          max(Dfit$day), predict(fit_BIG, data.frame(day=max(Dfit$day))),
+          col="black", lty=2, lwd=3 )
+points( min(Dfit$day), predict(fit_BIG, data.frame(day=min(Dfit$day))), col="black", pch=15, cex=1.3)
+
+
+####################################################################
+
+Dfit <- subset( D_MED, day>=547 )
+first_fit_day <- min(Dfit$day)
+fit_MED <- lm( log2(bw_md) ~ day, data=Dfit )
+
+
+# Add the best-fit line (with black square) for the advertised bandwidth
+segments( min(Dfit$day), predict(fit_MED, data.frame(day=min(Dfit$day))),
+          max(Dfit$day), predict(fit_MED, data.frame(day=max(Dfit$day))),
+          col="black", lty=2, lwd=3 )
+points( min(Dfit$day), predict(fit_MED, data.frame(day=min(Dfit$day))), col="black", pch=15, cex=1.3)
+
+
+####################################################################
+
+Dfit <- subset( D_SMALL, day>=547 )
+fit_SMALL <- lm( log2(bw_md) ~ day, data=Dfit )
+
+# Add the best-fit line (with black square) for the advertised bandwidth
+segments( min(Dfit$day), predict(fit_SMALL, data.frame(day=min(Dfit$day))),
+          max(Dfit$day), predict(fit_SMALL, data.frame(day=max(Dfit$day))),
+          col="black", lty=2, lwd=3 )
+points( min(Dfit$day), predict(fit_SMALL, data.frame(day=min(Dfit$day))), col="black", pch=15, cex=1.3)
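+
+# doubling time in months for each file size (the rates quoted in the report)
+(1.0 / coef(fit_BIG)["day"]) / 30
+(1.0 / coef(fit_MED)["day"]) / 30
+(1.0 / coef(fit_SMALL)["day"]) / 30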
+
+
+
+legend_texts = c(
+  expression(paste("5   MiB ", r^2, "=0.46")),
+  expression(paste("1   MiB ", r^2, "=0.48")),
+  expression(paste("50 KiB ", r^2, "=0.55"))  
+)
+
+
+legend( "topleft", legend=legend_texts, inset=0.03, pch=c(20,20), col=c('blue','purple','red') ) 
+
+
diff --git a/2014/tor-growth/tor-growth.tex b/2014/tor-growth/tor-growth.tex
new file mode 100644
index 0000000..593d103
--- /dev/null
+++ b/2014/tor-growth/tor-growth.tex
@@ -0,0 +1,245 @@
+\documentclass{tortechrep}
+\usepackage{afterpage}
+%\usepackage{subfloat}
+\usepackage{subfig}
+\usepackage{url}
+\usepackage{fullpage}
+\usepackage{amsmath}
+\usepackage{booktabs}
+%\usepackage{afterpage}
+%\usepackage{subcaption}
+\usepackage{graphicx}
+
+
+
+%%%%%%%%% BLUE UNDERLINES
+\usepackage{color}  % << color package is required for blue underline
+\usepackage{ulem} % << ulem package is required for blue underline
+
+%Define a blue underline
+\newcommand{\blueuline}{\bgroup\markoverwith{\hbox{\kern-.03em\vtop%
+{\begingroup\kern.1ex\color{blue}\hrule width .2em\kern1.1pt \endgroup\kern-.03em}}}\ULon}
+%\newcommand\reduline{\bgroup\markoverwith
+%      {\textcolor{red}{\rule[-0.5ex]{2pt}{0.4pt}}}\ULon}
+
+\newcommand{\uhref}[2]{\href{#1}{\blueuline{#2}}}
+%%%%%%%%%%%%% END BLUE UNDERLINES
+
+
+
+\title{Tor growth rates and improving Torperf throughput}
+\author{Virgil Griffith}
+\reportid{2014-10-001}
+\date{October 04, 2014}
+
+\newcommand{\Figref}[1]{Figure~\ref{#1}}
+\newcommand{\figref}[1]{Figure~\ref{#1}}
+
+\begin{document}
+\maketitle
+
+\section{Preliminaries}
+Despite data being available from \uhref{http://metrics.torproject.org}{metrics.torproject.org} for some time, there’s been little statistical analysis of that data.  Let’s fix that.  From the Metrics data, the most obvious thing to plot is the number of relays over time; see \figref{fig:fig1}.  Plotting in logscale (so a straight line means exponential growth) reveals that the number of relays increases exponentially.  Good to know.  The ``stable relays'' are plotted in purple because they are fabulous.
+
+
+Next, in \figref{fig:fig2}, we chart the total network bandwidth over time.  Tor's total network bandwidth doubles every 13--14 months---darn impressive!  Moore's Law, doubling every 18 months, is downright torpid by comparison.
+
+ 
+Since 2010 the doubling rates for both relays and bandwidth have been remarkably consistent.  Although there are unaccounted-for sinusoidal trends, the fact remains that a simple exponential fit, $\log_2(y) = m x + b$, accounts for \textasciitilde 90\% of the variance!  Additionally, the 99\% confidence intervals on the predicted data are barely visible without a magnifying glass.  Extrapolation from statistics is a dangerous game, but realistically we can't expect these growth rates to be any more predictable.  With this statistical bedrock under our feet, let's go deeper.  In \figref{fig:fig3} we see how the mean relay bandwidth grows over time.  We see that the mean relay bandwidth doubles about every two years.  This is akin to \uhref{http://www.nngroup.com/articles/law-of-bandwidth/}{Nielsen's Law}, which states that for high-end home users, bandwidth doubles every two years.  Good job operators---those Tor-shirts are well earned!
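+The fits throughout this report take this form; a minimal sketch in R, mirroring the accompanying scripts (assuming a data frame \texttt{D} holding a day index \texttt{day} and a daily count \texttt{relays}):
+\begin{verbatim}
+fit <- lm(log2(relays) ~ day, data = D)  # straight line in log2-space
+summary(fit)$r.squared                   # fraction of variance explained
+(1 / coef(fit)["day"]) / 365             # doubling time, in years
+\end{verbatim}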
+
+
+
+
+ 
+ 
+We see that the mean relay bandwidth increases by Nielsen's Law, but how does this impact client experience?  Fortunately, we have \uhref{https://metrics.torproject.org/performance.html}{Torperf data} to answer this.  Simple things first: in \figref{fig:fig4} we plot Torperf bandwidth over time.  Torperf's fitted line isn't nearly as good a fit as the number of relays or total bandwidth (Figures \ref{fig:fig1} and \ref{fig:fig2}), but it conveys enough of the trend to be useful.  We see that, depending on file size, Torperf throughput doubles every 25--35 months.\footnote{It's not obvious that Torperf bandwidth increases exponentially, but given that bandwidth and CPU are the primary factors in Torperf and that each of these follows its own exponential curve, it's reasonable to err on the side of an exponential fit over a linear one.  Statistical modeling often leverages domain knowledge.}  Given such a wide spread in \figref{fig:fig4}, we will separately consider the Torperf bandwidth for downloading a 50 KiB and a 5 MiB file.  Let's go deeper.
+ 
+ 
+
+
+Absolute Torperf improvements are great to see, but the key measure is how Torperf throughput compares with clients’ non-Tor throughput.  From \uhref{http://www.netindex.com/}{Ookla bandwidth data} we calculate the composite mean download rate for the three countries with the greatest number of Tor clients: the United States, Germany, and Russia (\figref{fig:nonTor}).  With the composite non-Tor bandwidth in hand, we plot Torperf bandwidth normalized (divided) by the composite non-Tor bandwidth, arriving at \figref{fig:fig5}.
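+The composite is an unweighted mean of the three per-country series, merged by date; a condensed sketch of \texttt{scripts/non-tor-bandwidth.R}:
+\begin{verbatim}
+D_temp      <- merge(D_US, D_DE, by = 'date')    # merge() suffixes: .x = US, .y = DE
+D_COMPOSITE <- merge(D_temp, D_RU, by = 'date')
+D_COMPOSITE$avr_download_KBps <- (D_COMPOSITE$download_kbps.x +
+                                  D_COMPOSITE$download_kbps.y +
+                                  D_COMPOSITE$download_kbps) / 3
+\end{verbatim}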
+ 
+
+
+For smaller files (50 KiB), we see that although absolute Torperf has been doubling every 35 months, normalized Torperf has been essentially flat.  For larger files (5 MiB), we see a gradual uptick in normalized Torperf.
+
+From the doubling rates of Torperf and composite non-Tor bandwidth we can derive the normalized Torperf growth rates analytically.  Taking the ratio of two exponentials of the form $y = 2^{(1/n) x}$, where $n$ is the doubling time, we get $y = 2^{(1/n - 1/m) x}$, where $n$ and $m$ are the doubling times of Torperf bandwidth and composite non-Tor bandwidth respectively; the normalized doubling time is therefore $1/(1/n - 1/m)$.  This results in normalized Torperf doubling every $20$ years for small files and every $5$ years for large files.  To put a five-year doubling rate in perspective, this means Torperf will reach $5\%$ of non-Tor bandwidth around the year 2022.  Internal optimizations like the \uhref{http://www.robgjansen.com/publications/kist-sec2014.pdf}{KIST scheduler} are great steps to improve this.
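+As a worked check, take the rounded doubling times from Table~\ref{tbl:summary}: $n \approx 2$ years for 5 MiB Torperf and $m \approx 3.5$ years for composite non-Tor bandwidth, giving
+\begin{equation*}
+\frac{1}{1/n - 1/m} = \frac{nm}{m - n} = \frac{2 \cdot 3.5}{3.5 - 2} \approx 4.7 \text{ years},
+\end{equation*}
+in line with the five-year figure above; with $n \approx 3$ years for 50 KiB files, the same formula gives $3 \cdot 3.5 / 0.5 = 21$ years.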
+ 
+\section{Will adding advertised bandwidth improve Torperf?}
+ 
+There have been \uhref{https://blog.torproject.org/blog/tor-incentives-research-roundup-goldstar-par-braids-lira-tears-and-torcoin}{various proposals} for improving client speeds by adding operator incentives beyond the established \uhref{https://www.torproject.org/getinvolved/tshirt.html}{T-shirts} and \uhref{https://blog.torservers.net/20131213/torservers-awarded-250000-by-digital-defenders.html}{financial grants}.  Our final analysis is an attempt to predict whether adding more advertised relay bandwidth would reliably improve Torperf throughput.
+
+
+We've established that absolute Torperf improves on its own due to the increasing bandwidth of relays.  Our first step to blunt the influence of increasing relay bandwidth is to always look at the \emph{normalized} Torperf performance.  We explored several different predictors of normalized Torperf, and the most promising was the ratio of total read bandwidth to total advertised bandwidth, or the Network Utilization Ratio (NUR).  We plot normalized Torperf as a function of NUR in \figref{fig:fig6}.
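+(In terms of the Metrics bandwidth columns, $\mathrm{NUR} = \texttt{bwread} / \texttt{advbw}$, computed per day; this is the quantity called \texttt{congestion} in the accompanying scripts.)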
+
+
+
+
+We see that NUR doesn't predict much of the normalized bandwidth for small (50 KiB) files.  However, for large files (5 MiB), there's a fuzzy yet definite trend of ``lower NUR means higher normalized Torperf''.  But there's a risk: we see that the lowest-NUR data points (purple) are all from 2014.  Therefore NUR could be acting as a mere proxy for the gradual (yet slow, per \figref{fig:fig5}) improvement of normalized Torperf over time.
+
+We control for this with a two-factor ANOVA, using \texttt{DATE} and \texttt{NUR} as the two factors and normalized Torperf as the dependent variable.  For the stats-literate, the full ANOVA tables are given in Table \ref{tbl:anovatables}, but the take-home message is that \texttt{NUR} provides substantial predictive power for normalized Torperf even after accounting for \texttt{DATE}.  Concretely, while the single-factor model using \texttt{DATE} has an $r^2$ of $0.02$ (50 KiB) and $0.14$ (5 MiB), the two-factor model using \texttt{DATE} and \texttt{NUR} yields an $r^2$ of $0.17$ and $0.44$---a $750\%$ and $208\%$ improvement respectively.  This allows us to tentatively conclude that a sudden uptick in advertised bandwidth would improve normalized Torperf beyond the glacial ascent seen in \figref{fig:fig5}.\footnote{Unsurprisingly, there are some caveats to this conclusion.  Our argument presumes that the distribution of advertised bandwidth across relays is constant---for example, Torperf would not improve if $10^{12}$ new relays joined the consensus but each provided only $1$ B/s.  We're aware of no evidence indicating this assumption is unrealistic.}
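+A minimal sketch of this test in R, assuming a data frame \texttt{D} with columns \texttt{day}, \texttt{congestion} (NUR), and \texttt{normalized\_torperf} as built in \texttt{scripts/NUR-predicts-normtorperf.R}:
+\begin{verbatim}
+m_date <- lm(normalized_torperf ~ day, data = D)               # DATE only
+m_both <- lm(normalized_torperf ~ day + congestion, data = D)  # DATE + NUR
+summary(m_date)$r.squared   # single-factor r^2
+summary(m_both)$r.squared   # two-factor r^2
+anova(m_both)               # sequential ANOVA table, as above
+\end{verbatim}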
+
+
+\begin{table}[hbt]
+\centering
+\subfloat[50 KiB.  For aggregate model $r^2=0.17$.] {
+\begin{tabular}{ l l l l l l } \toprule
+ & df    & Sum Sq & Mean Sq & F-value & p-value \\
+\midrule
+\texttt{DATE}    & 1     & 0.04039      & 0.040390    & \ \ 38.418  & 7.309\textsc{e}$^{-10}$ \\
+\texttt{NUR}    & 1     & 0.29005      & 0.290045    & 275.887  & 2\textsc{e}$^{-16}$ \\
+Residuals & 1546     & 1.62534      & 0.001051    &   &  \\
+\bottomrule
+\end{tabular} }
+
+%\vskip
+\bigskip
+
+\subfloat[5 MiB.  For aggregate model $r^2=0.44$.]{
+\begin{tabular}{ l l l l l l } \toprule
+ & df    & Sum Sq & Mean Sq & F-value & p-value \\
+\midrule
+\texttt{DATE}    & 1     & 27.395      & 27.395    & 401.09  & 2\textsc{e}$^{-16}$ \\
+\texttt{NUR}    & 1     & 56.750      & 56.750    & 830.87  & 2\textsc{e}$^{-16}$ \\
+Residuals & 1546     & 105.595      & 0.068   &   &  \\
+\bottomrule
+\end{tabular} }
+
+\caption{ANOVA tables predicting normalized Torperf for downloading a 50 KiB and 5 MiB file.}
+\label{tbl:anovatables}
+\end{table}
+
+
+ 
+\section{Summary}
+We've learned a few things.
+
+\begin{enumerate} 
+    \item Many aspects of Tor follow exponential growth.  Table \ref{tbl:summary} summarizes these results.  Additionally, Tor bandwidth currently sits at $<2\%$ of mean non-Tor bandwidth.
+
+    \item Tor clients' absolute throughput is steadily improving.  However, after normalizing by mean non-Tor bandwidth, this improvement is greatly diminished.  For small files, normalized Torperf has been essentially flat since records have been kept.
+
+    \item An intervention to increase advertised bandwidth would noticeably improve normalized Torperf for large \emph{as well as small} files.
+\end{enumerate}
+
+
+\begin{table}
+\centering
+\begin{tabular}{ l l l l } \toprule
+ & Doubling rate    & $\ \ r^2$ \\
+% & (years)    &  \\ 
+\midrule
+Total advertised bandwidth & \ \;1.2 \ years &  0.96 \\
+Mean relay bandwidth & \ \;2\ \ \; \ years &  0.91 \\
+Number of relays (all) & \ \;3\ \ \; \ years & 0.94 \medskip \\
+
+Absolute Torperf (5 MiB) & \ \;2\ \ \; \ years & 0.46  \\
+Absolute Torperf (50 KiB) & \ \;3\ \ \; \ years & 0.55 \medskip \\
+
+Mean RU download bandwidth & \ \;3.1 \ years & 0.95 \\
+Mean US download bandwidth & \ \;3.4 \ years & 0.97 \\
+Mean DE download bandwidth & \ \;3.9 \ years & 0.88 \\
+Composite download bandwidth & \ \;3.5 \ years & 0.97 \medskip \\
+
+Normalized Torperf (5 MiB) & \ \;5 \ \ \;\ years & \ \ - \\
+Normalized Torperf (50 KiB) & 19.9 \ years & \ \ - \\
+\bottomrule
+\end{tabular} 
+
+\caption{Summary of growth rates}
+\label{tbl:summary}
+\end{table}
+
+
+
+\section{Future Work}
+\label{sect:fw}
+Some natural extensions to this work are:
+\begin{itemize}
+    \item Instead of looking at the \emph{mean} relay bandwidth, separately calculate the \emph{expected} bandwidth for the guard, middle, and exit node positions.
+    \item It'd be nice to characterize the \emph{distribution} of advertised bandwidth.  Does it follow a Gaussian?  A Pareto?
+    \item When computing the composite non-Tor bandwidth, instead of taking an unweighted average of the United States, Germany, and Russia, it'd be better to take a \emph{weighted average} over all countries, with each country weighted by its number of originating Tor clients.  We doubt this would change the conclusions.
+    \item Tor's Network Utilization Ratio (NUR), shown in \figref{fig:fig8}, has clear drops of unclear cause.  Given how predictive NUR is of normalized Torperf, we'd like to know the causes of the two drops in NUR on 2013-10-09 and 2014-06-06.
+\end{itemize}
+
+\flushleft \textbf{Acknowledgements.}  We thank Roger Dingledine and Karsten Loesing for their help and review.  All analyses were done in R.
+
+\clearpage
+
+%%%%%%%%%%%%%%%%%
+%% FIGURES
+%%%%%%%%%%%%%%%%%
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=3.55in]{figs/fig1-relays.png}
+    \caption{The number of Tor relays increases exponentially, doubling every 2 years (stable) to 2.5 years (all).}
+    \label{fig:fig1}    
+\end{figure}
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=3.55in]{figs/fig2-bw.png}
+    \caption{Total network bandwidth also increases exponentially.}
+    \label{fig:fig2}    
+\end{figure}
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/fig3-mean-bw.png}
+    \caption{Mean relay bandwidth increases exponentially and doubles approximately every 24 months.}
+    \label{fig:fig3}    
+\end{figure}
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/fig4--torperf.png}
+    \caption{Absolute Torperf throughput increases exponentially, doubling every 25 months for 5 MiB files and every 35 months for 50 KiB files.  Unfortunately, the throughput when downloading a 50 KiB file is \textasciitilde 8x slower than downloading a 5 MiB file.  These trends imply that these two rates will continue to diverge.}
+    \label{fig:fig4}    
+\end{figure}
+
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/5a--normalized-torperf-small.png}
+    \caption{The normalized Torperf for 50 KiB and 5 MiB files.}
+    \label{fig:fig5}     
+\end{figure}
+
+
+\begin{figure}
+    \centering
+    \subfloat[50 KiB; $r^2=0.15$.]{ \includegraphics[height=4in]{figs/fig6--NUR-predicts-normtorperf_small.png} \label{fig:6a} }
+    
+    \subfloat[5 MiB; $r^2=0.44$.]{ \includegraphics[height=4in]{figs/fig6--NUR-predicts-normalized-torperf_large.png} \label{fig:6b} }    
+    \caption{Lower NUR implies higher normalized Torperf---especially so for larger files.}
+    \label{fig:fig6}
+\end{figure}
+
+
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/non-tor-bw.png}
+    \caption{Mean download bandwidth for United States, Germany, and Russia according to netindex.com.  Composite is the mean of all three.}
+    \label{fig:nonTor}    
+\end{figure}
+
+
+\begin{figure}[h!bt]
+    \centering
+    \includegraphics[height=5in]{figs/appendix--NUR-over-time.png}
+    \caption{Network Utilization Ratio (NUR) falls into three distinct stages.  Within each stage the fitted line is essentially flat.  What happened on 2013-10-08 and 2014-06-06!?  The only thing we see is that on 2014-06-05 (one day prior) the EFF began their Tor Challenge.}
+    \label{fig:fig8}    
+\end{figure}
+
+
+
+
+
+
+
+
+
+
+
+\end{document}
diff --git a/2014/tor-growth/tortechrep.cls b/2014/tor-growth/tortechrep.cls
new file mode 120000
index 0000000..4c24db2
--- /dev/null
+++ b/2014/tor-growth/tortechrep.cls
@@ -0,0 +1 @@
+../../tortechrep.cls
\ No newline at end of file




