[tor-commits] [metrics-tasks/master] Calculate and graph frac_relays and frac_cw using stem (#7241).

karsten at torproject.org karsten at torproject.org
Fri Jan 18 10:19:14 UTC 2013


commit 0e4f962092b1e4c5545136485bb338cd839401f2
Author: peer <peer at lavabit.com>
Date:   Thu Jan 17 14:41:54 2013 +0000

    Calculate and graph frac_relays and frac_cw using stem (#7241).
---
 task-7241/README        |   22 +++++++++
 task-7241/first_pass.py |  109 +++++++++++++++++++++++++++++++++++++++++++++++
 task-7241/plot.R        |   13 ++++++
 3 files changed, 144 insertions(+), 0 deletions(-)

diff --git a/task-7241/README b/task-7241/README
new file mode 100644
index 0000000..80a710f
--- /dev/null
+++ b/task-7241/README
@@ -0,0 +1,22 @@
+Uses stem to parse network consensus documents to determine frac_relays and frac_cw based on fingerprint.
+
+*Definitions*
+
+Let Y be the consensus listed (now) and X the consensus some hours ago (now - hours).
+
+Let intersection(X,Y) be the routers in both X and Y based on fingerprint.
+
+frac_relays is count(intersection(X,Y))/count(Y).
+
+frac_cw is the sum of the consensus weights in Y of the routers in intersection(X,Y), divided by the sum of all consensus weights in Y.
+
+*Notes*
+
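+Output is in CSV format and does not include a header. Fields are consensus, hour, frac_relays, frac_cw, month, day, and day of week (0-6, where 0 is Sunday). Note that plot.R reads its input with header=TRUE and refers to the hour field as "hours", so a matching header line must be prepended before plotting.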
+
+Network consensus documents that cannot be opened at their expected path are skipped.
+
+Change initial_time_info_bound, final_time_info_bound, initial_time_data_bound, and final_time_data_bound to explore different time ranges. initial_time_data_bound should be at least 168 hours (the largest comparison interval) before initial_time_info_bound, so that every comparison consensus has been loaded.
+
+Four months of hourly data (fingerprint, consensus weights) uses about 1.5GB of space.
+
diff --git a/task-7241/first_pass.py b/task-7241/first_pass.py
new file mode 100644
index 0000000..f3f9707
--- /dev/null
+++ b/task-7241/first_pass.py
@@ -0,0 +1,109 @@
+import sys
+from datetime import datetime, timedelta
+
+from stem.descriptor.networkstatus import NetworkStatusDocumentV3
+
+# http://stackoverflow.com/questions/82831/how-do-i-check-if-a-file-exists-using-python
+def file_check(file_path):
+	try:
+		with open(file_path) as f:
+			return True
+	except IOError:
+		return False
+
+def filepath_from_time(cur_datetime):
+	consensus_path = 'consensuses-'
+	consensus_path += cur_datetime.strftime('%Y-%m')
+	consensus_path += '/'
+	consensus_path += cur_datetime.strftime('%d')
+	consensus_path += '/'
+	consensus_path += cur_datetime.strftime('%Y-%m-%d-%H-%M-%S')
+	consensus_path += '-consensus'
+
+	return consensus_path
+
+def filename_from_time(cur_datetime):
+	consensus_filename = cur_datetime.strftime('%Y-%m-%d-%H-%M-%S')
+	consensus_filename += '-consensus'
+
+	return consensus_filename
+
+time_interval = timedelta(0, 60*60) # one hour
+
+# base consensuses for examination
+initial_time_info_bound = datetime(2012, 1, 1) # inclusive
+final_time_info_bound = datetime(2013, 1, 1) # exclusive
+
+router_data = {}
+
+# data range for consensuses
+initial_time_data_bound = datetime(2011, 12, 1) # inclusive
+final_time_data_bound = datetime(2013, 1, 1) # exclusive
+
+# load information
+cur_datetime = initial_time_data_bound - time_interval
+while cur_datetime < final_time_data_bound - time_interval:
+	cur_datetime += time_interval
+
+	cur_filepath = filepath_from_time(cur_datetime)
+	cur_filename = filename_from_time(cur_datetime)	
+
+	if file_check(cur_filepath) == True:
+		consensus_file = open(cur_filepath, 'r')
+		consensus_file.readline()
+		consensus = NetworkStatusDocumentV3(consensus_file.read())
+		consensus_file.close()
+
+		routers = {}
+		for router in consensus.routers:
+			routers[router.fingerprint] = router.bandwidth
+
+		router_data[cur_filename] = routers
+
+# interval multipliers
+time_interval_list = [1,2,3,4,5,6,12,24,36,48,72,96,120,144,168] # hours
+
+# iterate over base consensuses
+cur_datetime = initial_time_info_bound - time_interval
+while cur_datetime < final_time_info_bound - time_interval:
+	cur_datetime += time_interval
+
+	cur_filepath = filepath_from_time(cur_datetime) # current
+	cur_filename = filename_from_time(cur_datetime) # current	
+
+	if file_check(cur_filepath) == True:
+		base_routers = router_data[cur_filename]
+		base_router_count = 0
+		base_router_bandwidth = 0
+		for fingerprint in router_data[cur_filename].keys():
+			base_router_count += 1
+			base_router_bandwidth += router_data[cur_filename][fingerprint]
+
+		for comparison_time_interval_multiplier in time_interval_list:
+			comparison_time_interval = timedelta(0, comparison_time_interval_multiplier*60*60)
+			comparison_datetime = cur_datetime - comparison_time_interval
+
+			comparison_filepath = filepath_from_time(comparison_datetime) # comparison
+			comparison_filename = filename_from_time(comparison_datetime) # comparison
+
+			if file_check(comparison_filepath) == True:
+				comparison_router_count = 0
+				comparison_router_bandwidth = 0
+				comparison_router_overlap_bandwidth = 0
+				base_router_overlap_bandwidth = 0
+				comparison_router_overlap_count = 0
+		
+				for fingerprint in router_data[comparison_filename].keys():
+					comparison_router_count += 1
+					comparison_router_bandwidth += router_data[comparison_filename][fingerprint]
+
+					if fingerprint in base_routers:
+						base_router_overlap_bandwidth += base_routers[fingerprint]
+						comparison_router_overlap_count += 1
+						comparison_router_overlap_bandwidth += router_data[comparison_filename][fingerprint]
+
+				frac_relays = float(comparison_router_overlap_count)/float(base_router_count)
+				frac_cw = float(base_router_overlap_bandwidth)/float(base_router_bandwidth)
+
+				print '%s,%d,%f,%f,%d,%d,%s' % (cur_filename, comparison_time_interval_multiplier, frac_relays, frac_cw, cur_datetime.month, cur_datetime.day, cur_datetime.strftime('%w'))
+
diff --git a/task-7241/plot.R b/task-7241/plot.R
new file mode 100644
index 0000000..66d39d9
--- /dev/null
+++ b/task-7241/plot.R
@@ -0,0 +1,13 @@
+require(ggplot2)
+data <- read.csv("s2012.csv", header=TRUE)
+
+# frac_relays
+p <- ggplot(data, aes(factor(hours), frac_relays))
+p + geom_boxplot() + ylab("frac_relays") + xlab("time interval") + ggtitle(2012)
+ggsave("2012_frac_relays.png", height=6, width=6)
+
+# frac_cw
+p <- ggplot(data, aes(factor(hours), frac_cw))
+p + geom_boxplot() + ylab("frac_cw") + xlab("time interval") + ggtitle(2012)
+ggsave("2012_frac_cw.png", height=6, width=6)
+



More information about the tor-commits mailing list