commit 0e4f962092b1e4c5545136485bb338cd839401f2 Author: peer peer@lavabit.com Date: Thu Jan 17 14:41:54 2013 +0000
Calculate and graph frac_relays and frac_cw using stem (#7241). --- task-7241/README | 22 +++++++++ task-7241/first_pass.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++ task-7241/plot.R | 13 ++++++ 3 files changed, 144 insertions(+), 0 deletions(-)
diff --git a/task-7241/README b/task-7241/README new file mode 100644 index 0000000..80a710f --- /dev/null +++ b/task-7241/README @@ -0,0 +1,22 @@ +Uses stem to parse network consensus documents to determine frac_relays and frac_cw based on fingerprint. + +*Definitions* + +Let Y be the consensus listed (now) and X the consensus some hours ago (now - hours). + +Let intersection(X,Y) be the routers in both X and Y based on fingerprint. + +frac_relays is count(intersection(X,Y))/count(Y). + +frac_cw is the sum of consensus weights in Y over intersection(X,Y) divided by the sum of consensus weights in Y. + +*Notes* + +Output is in CSV format and does not include a header. Fields are consensus, hour, frac_relays, frac_cw, month, day, and day of week. plot.R reads its input with header=TRUE and refers to the columns hours, frac_relays, and frac_cw, so add a matching header line to the CSV before plotting. + +Network consensus documents that cannot be found at their expected path are ignored. + +Change initial_time_info_bound, final_time_info_bound, initial_time_data_bound, and final_time_data_bound to explore different time ranges. initial_time_data_bound should be at least 168 hours before initial_time_info_bound. + +Four months of hourly data (fingerprint, consensus weights) uses about 1.5GB of space. + diff --git a/task-7241/first_pass.py b/task-7241/first_pass.py new file mode 100644 index 0000000..f3f9707 --- /dev/null +++ b/task-7241/first_pass.py @@ -0,0 +1,109 @@ +import sys +from datetime import datetime, timedelta + +from stem.descriptor.networkstatus import NetworkStatusDocumentV3 + +# http://stackoverflow.com/questions/82831/how-do-i-check-if-a-file-exists-usi... 
+def file_check(file_path): + try: + with open(file_path) as f: + return True + except IOError: + return False + +def filepath_from_time(cur_datetime): + consensus_path = 'consensuses-' + consensus_path += cur_datetime.strftime('%Y-%m') + consensus_path += '/' + consensus_path += cur_datetime.strftime('%d') + consensus_path += '/' + consensus_path += cur_datetime.strftime('%Y-%m-%d-%H-%M-%S') + consensus_path += '-consensus' + + return consensus_path + +def filename_from_time(cur_datetime): + consensus_filename = cur_datetime.strftime('%Y-%m-%d-%H-%M-%S') + consensus_filename += '-consensus' + + return consensus_filename + +time_interval = timedelta(0, 60*60) # one hour + +# base consensuses for examination +initial_time_info_bound = datetime(2012, 1, 1) # inclusive +final_time_info_bound = datetime(2013, 1, 1) # exclusive + +router_data = {} + +# data range for consensuses +initial_time_data_bound = datetime(2011, 12, 1) # inclusive +final_time_data_bound = datetime(2013, 1, 1) # exclusive + +# load information +cur_datetime = initial_time_data_bound - time_interval +while cur_datetime < final_time_data_bound - time_interval: + cur_datetime += time_interval + + cur_filepath = filepath_from_time(cur_datetime) + cur_filename = filename_from_time(cur_datetime) + + if file_check(cur_filepath) == True: + consensus_file = open(cur_filepath, 'r') + consensus_file.readline() + consensus = NetworkStatusDocumentV3(consensus_file.read()) + consensus_file.close() + + routers = {} + for router in consensus.routers: + routers[router.fingerprint] = router.bandwidth + + router_data[cur_filename] = routers + +# interval multipliers +time_interval_list = [1,2,3,4,5,6,12,24,36,48,72,96,120,144,168] # hours + +# iterate over base consensuses +cur_datetime = initial_time_info_bound - time_interval +while cur_datetime < final_time_info_bound - time_interval: + cur_datetime += time_interval + + cur_filepath = filepath_from_time(cur_datetime) # current + cur_filename = 
filename_from_time(cur_datetime) # current + + if file_check(cur_filepath) == True: + base_routers = router_data[cur_filename] + base_router_count = 0 + base_router_bandwidth = 0 + for fingerprint in router_data[cur_filename].keys(): + base_router_count += 1 + base_router_bandwidth += router_data[cur_filename][fingerprint] + + for comparison_time_interval_multiplier in time_interval_list: + comparison_time_interval = timedelta(0, comparison_time_interval_multiplier*60*60) + comparison_datetime = cur_datetime - comparison_time_interval + + comparison_filepath = filepath_from_time(comparison_datetime) # comparison + comparison_filename = filename_from_time(comparison_datetime) # comparison + + if file_check(comparison_filepath) == True: + comparison_router_count = 0 + comparison_router_bandwidth = 0 + comparison_router_overlap_bandwidth = 0 + base_router_overlap_bandwidth = 0 + comparison_router_overlap_count = 0 + + for fingerprint in router_data[comparison_filename].keys(): + comparison_router_count += 1 + comparison_router_bandwidth += router_data[comparison_filename][fingerprint] + + if fingerprint in base_routers: + base_router_overlap_bandwidth += base_routers[fingerprint] + comparison_router_overlap_count += 1 + comparison_router_overlap_bandwidth += router_data[comparison_filename][fingerprint] + + frac_relays = float(comparison_router_overlap_count)/float(base_router_count) + frac_cw = float(base_router_overlap_bandwidth)/float(base_router_bandwidth) + + print '%s,%d,%f,%f,%d,%d,%s' % (cur_filename, comparison_time_interval_multiplier, frac_relays, frac_cw, cur_datetime.month, cur_datetime.day, cur_datetime.strftime('%w')) + diff --git a/task-7241/plot.R b/task-7241/plot.R new file mode 100644 index 0000000..66d39d9 --- /dev/null +++ b/task-7241/plot.R @@ -0,0 +1,13 @@ +require(ggplot2) +data <- read.csv("s2012.csv", header=TRUE) + +# frac_relays +p <- ggplot(data, aes(factor(hours), frac_relays)) +p + geom_boxplot() + ylab("frac_relays") + xlab("time 
interval") + ggtitle(2012) +ggsave("2012_frac_relays.png", height=6, width=6) + +# frac_cw +p <- ggplot(data, aes(factor(hours), frac_cw)) +p + geom_boxplot() + ylab("frac_cw") + xlab("time interval") + ggtitle(2012) +ggsave("2012_frac_cw.png", height=6, width=6) +