commit ad848ad07f14ed75b5e464b20a069928076f8e36 Author: Sathyanarayanan Gunasekaran gsathya.ceg@gmail.com Date: Sat Sep 15 17:49:56 2012 +0530
Add entropy code for #1854 --- task-1854/pyentropy.py | 262 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 262 insertions(+), 0 deletions(-)
# Recovered from: diff --git a/task-1854/pyentropy.py b/task-1854/pyentropy.py
# (new file mode 100644, index 0000000..4ee003c)
"""
Usage - python pyentropy.py -h
Output - A CSV file of the format (without newlines):
           <valid-after>,
           <min consensus weight>,
           <number of relays>,
           <entropy for all nodes>,
           <max entropy for all nodes>,
           <entropy for exit nodes>,
           <max entropy for exit nodes>,
           <entropy for guard nodes>,
           <max entropy for guard nodes>,
           <entropy for countries>,
           <max entropy for countries>,
           <entropy for AS>,
           <max entropy for AS>
rsync -arz --delete metrics.torproject.org::metrics-recent/relay-descriptors/consensuses in
"""

import itertools
import sys
import math
import os

from optparse import OptionParser
from binascii import b2a_hex, a2b_base64, a2b_hex

try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

# NOTE: pygeoip and stem are imported where they are used (main() and
# Router.get_advertised_bw) so that the parsing/entropy code can be imported
# and exercised without the GeoIP databases or descriptor archives.

# Module-level handles populated by main(); Router reads them directly.
options = None  # parsed command-line options (see parse_args)
gi_db = None    # pygeoip country database
as_db = None    # pygeoip AS-number database


class Router:
    """A single relay entry collected from a consensus document.

    Attributes are filled in incrementally as the relay's 'r', 's' and 'w'
    lines are parsed; anything that was never seen stays None.
    """

    def __init__(self):
        self.bandwidth = None      # value of the first key=value item on the 'w' line
        self.advertised_bw = None  # min(average, burst, observed) from the server descriptor
        self.country = None        # country code looked up in gi_db
        self.as_no = None          # first token of the AS organisation string from as_db
        self.is_exit = None        # True when flagged Exit and not BadExit
        self.is_guard = None       # True when flagged Guard
        self.cw = None             # consensus weight, assigned by entropy_rows()

    def add_router_info(self, values):
        """Record descriptor bandwidth, country and AS from an 'r' line.

        values -- the whitespace-split fields following the 'r' keyword;
                  field 2 is used as the base64 descriptor digest (with its
                  padding '=' stripped) and field 5 as the relay's IP address.
        """
        # b2a_hex returns bytes on Python 3; decode so it can be used in a path.
        hex_digest = b2a_hex(a2b_base64(values[2] + "=")).decode("ascii")
        self.advertised_bw = self.get_advertised_bw(hex_digest)
        ip = values[5]
        self.country = gi_db.country_code_by_addr(ip)
        self.as_no = self.get_as_details(ip)

    def add_weights(self, values):
        """Store the integer after '=' in the first field of a 'w' line."""
        self.bandwidth = int(values[0].split('=')[1])

    def add_flags(self, values):
        """Set is_exit / is_guard from the flag names on an 's' line."""
        if "Exit" in values and "BadExit" not in values:
            self.is_exit = True
        if "Guard" in values:
            self.is_guard = True

    def get_as_details(self, ip):
        """Return the first token of the AS lookup for ip, or "" on failure."""
        try:
            return as_db.org_by_addr(str(ip)).split()[0]
        except Exception:
            # Best effort: an unknown address, a missing database or an empty
            # answer all mean "no AS information".
            return ""

    def get_advertised_bw(self, hex_digest):
        """Return the advertised bandwidth from the relay's server descriptor.

        The descriptor is read from <options.server_desc>/<hex_digest> and the
        smallest of its average, burst and observed bandwidth is returned.
        Returns 0 when the descriptor cannot be read or parsed.
        """
        # Imported outside the try block so that a missing stem installation
        # is reported instead of silently yielding 0 for every relay.
        import stem.descriptor.server_descriptor

        try:
            path = os.path.join(options.server_desc, hex_digest)
            with open(path) as f:
                data = f.read()

            desc_iter = stem.descriptor.server_descriptor.parse_file(StringIO(data))
            desc = list(desc_iter)[0]
            return min(desc.average_bandwidth, desc.burst_bandwidth, desc.observed_bandwidth)
        except Exception:
            # Best effort: missing or malformed descriptors count as 0.
            return 0


def parse_bw_weights(values):
    """Parse 'key=value' items into a dict of floats scaled by 1/10000.

    Returns None if any item is not of the form key=<number>.
    """
    data = {}
    try:
        for item in values:
            key, raw = item.split("=")
            data[key] = float(raw) / 10000
    except ValueError:
        return None
    return data


def shannon_entropy(weights):
    """Return the Shannon entropy (in bits) of the distribution given by weights.

    Each weight is normalised by the sum of all weights; zero weights
    contribute nothing.  An empty input or a zero total yields 0.0 instead of
    raising ZeroDivisionError.
    """
    weights = list(weights)
    total = float(sum(weights))
    if total == 0:
        return 0.0
    entropy = 0.0
    for weight in weights:
        p = float(weight) / total
        if p != 0:
            entropy += -(p * math.log(p, 2))
    return entropy


def parse_consensus(file_name):
    """Read a consensus file and collect its relays.

    Returns a tuple (routers, valid_after, weights) where routers is a list
    of Router objects in file order, valid_after is the text following the
    'valid-after' keyword ("" if absent) and weights maps 'Wed', 'Wee', 'Wgd'
    and 'Wgg' to the parsed bandwidth-weights (each defaulting to 1).
    """
    routers = []
    router = None
    valid_after = ""
    weights = {"Wed": 1, "Wee": 1, "Wgd": 1, "Wgg": 1}

    with open(file_name, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines carry no keyword.
                continue
            key, values = fields[0], fields[1:]
            if key == 'r':
                router = Router()
                routers.append(router)
                router.add_router_info(values)
            elif key == 's' and router is not None:
                router.add_flags(values)
            elif key == 'w' and router is not None:
                router.add_weights(values)
            elif key == 'valid-after':
                valid_after = ' '.join(values)
            elif key == 'bandwidth-weights':
                parsed = parse_bw_weights(values)
                if parsed:
                    for name in list(weights):
                        weights[name] = parsed.get(name, weights[name])

    return routers, valid_after, weights


def entropy_rows(routers, valid_after, Wed=1, Wee=1, Wgd=1, Wgg=1):
    """Compute the CSV rows described in the module docstring.

    Each router's consensus weight (cw) is assigned in place, the routers are
    ordered by cw, and one row is produced for the full set and then for each
    smaller set obtained by dropping all routers tied for the lowest cw, until
    at most one router remains.

    routers     -- iterable of Router objects (their cw attribute is updated)
    valid_after -- string written as the first column of every row
    Wed, Wee, Wgd, Wgg -- bandwidth weights applied to exit/guard bandwidth

    Returns a list of comma-joined row strings.
    """
    # calculate consensus weight for each relay
    for router in routers:
        if not router.bandwidth:
            # should we consider relays with 'None' bandwidth?
            continue
        if router.is_guard and router.is_exit:
            router.cw = Wgd * Wed * router.bandwidth
        elif router.is_guard:
            router.cw = Wgg * router.bandwidth
        elif router.is_exit:
            router.cw = Wee * router.bandwidth
        # NOTE(review): relays that are neither guard nor exit keep cw = None;
        # confirm whether they should receive a weight of their own.

    # Sort by consensus weight.  Relays without a cw go first, which is the
    # order Python 2 produced when comparing None with numbers; spelling it
    # out keeps the ordering defined on Python 3 as well.
    routers = sorted(routers, key=lambda r: (r.cw is not None, r.cw or 0))

    rows = []
    while len(routers) > 1:
        # first relay has smallest cw
        min_cw = routers[0].cw

        # Relays without a bandwidth take no part in any distribution.
        weighted = [r for r in routers if r.bandwidth]
        total_bw = sum(r.bandwidth for r in weighted)
        if total_bw == 0:
            # Nothing left to measure; keep the rows gathered so far.
            break

        guard_weights, exit_weights = [], []
        bw_countries, bw_as = {}, {}
        for router in weighted:
            bw = router.bandwidth
            if router.is_guard and router.is_exit:
                guard_weights.append(Wgd * bw)
                exit_weights.append(Wed * bw)
            elif router.is_guard:
                guard_weights.append(Wgg * bw)
            elif router.is_exit:
                exit_weights.append(Wee * bw)
            bw_countries[router.country] = bw_countries.get(router.country, 0) + bw
            bw_as[router.as_no] = bw_as.get(router.as_no, 0) + bw

        entropy = shannon_entropy(r.bandwidth for r in weighted)
        entropy_exit = shannon_entropy(exit_weights)
        entropy_guard = shannon_entropy(guard_weights)
        entropy_country = shannon_entropy(bw_countries.values())
        entropy_as = shannon_entropy(bw_as.values())

        # Entropy of uniform distribution of 'n' possible values: log(n)
        max_entropy = math.log(len(routers), 2)
        max_entropy_guard = math.log(len(guard_weights), 2) if guard_weights else 0.0
        max_entropy_exit = math.log(len(exit_weights), 2) if exit_weights else 0.0
        max_entropy_country = math.log(len(bw_countries), 2) if bw_countries else 0.0
        max_entropy_as = math.log(len(bw_as), 2) if bw_as else 0.0

        rows.append(','.join(str(column) for column in (
            valid_after,
            min_cw,
            len(routers),
            entropy,
            max_entropy,
            entropy_exit,
            max_entropy_exit,
            entropy_guard,
            max_entropy_guard,
            entropy_country,
            max_entropy_country,
            entropy_as,
            max_entropy_as,
        )))

        # Remove routers with min cw.  The list is sorted, so they form a
        # prefix; dropping the prefix avoids deleting from the list while
        # iterating over it, which skipped every other tied relay.
        routers = list(itertools.dropwhile(lambda r: r.cw == min_cw, routers))

    return rows


def run(file_name):
    """Process one consensus file and return its CSV rows joined by newlines.

    Returns None when the file lists no relays, and an empty string when no
    row could be computed (fewer than two relays, or no relay with a
    bandwidth).
    """
    routers, valid_after, weights = parse_consensus(file_name)
    if not routers:
        return None

    rows = entropy_rows(routers, valid_after,
                        weights["Wed"], weights["Wee"],
                        weights["Wgd"], weights["Wgg"])
    return '\n'.join(rows)


def parse_args():
    """Parse the command line and return the optparse options object."""
    usage = "Usage - python pyentropy.py [options]"
    parser = OptionParser(usage)

    parser.add_option("-g", "--geoip", dest="gi_db", default="GeoIP.dat",
                      help="Input GeoIP database")
    parser.add_option("-a", "--as", dest="as_db", default="GeoIPASNum.dat",
                      help="Input AS GeoIP database")
    parser.add_option("-s", "--server_desc", dest="server_desc",
                      default="data/relay-descriptors/server-descriptors/", help="Server descriptors directory")
    parser.add_option("-o", "--output", dest="output", default="entropy.csv",
                      help="Output filename")
    parser.add_option("-c", "--consensus", dest="consensus", default="in/consensus",
                      help="Input consensus dir")

    (parsed_options, args) = parser.parse_args()

    return parsed_options


def main():
    """Open the GeoIP databases and write the rows of every consensus file."""
    global options, gi_db, as_db

    import pygeoip

    options = parse_args()
    gi_db = pygeoip.GeoIP(options.gi_db)
    as_db = pygeoip.GeoIP(options.as_db)

    with open(options.output, 'w') as f:
        # Sorted so the output order does not depend on directory order.
        for file_name in sorted(os.listdir(options.consensus)):
            string = run(os.path.join(options.consensus, file_name))
            if string:
                f.write("%s\n" % (string))


if __name__ == "__main__":
    main()
tor-commits@lists.torproject.org