commit ad848ad07f14ed75b5e464b20a069928076f8e36
Author: Sathyanarayanan Gunasekaran <gsathya.ceg(a)gmail.com>
Date: Sat Sep 15 17:49:56 2012 +0530
Add entropy code for #1854
---
task-1854/pyentropy.py | 262 ++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 262 insertions(+), 0 deletions(-)
diff --git a/task-1854/pyentropy.py b/task-1854/pyentropy.py
new file mode 100644
index 0000000..4ee003c
--- /dev/null
+++ b/task-1854/pyentropy.py
@@ -0,0 +1,262 @@
+"""
+Usage - python pyentropy.py -h
+Output - A CSV file of the format (without newlines):
+ <valid-after>,
+ <min consensus weight>,
+ <number of relays>,
+ <entropy for all nodes>,
+ <max entropy for all nodes>,
+ <entropy for exit nodes>,
+ <max entropy for exit nodes>,
+ <entropy for guard nodes>,
+ <max entropy for guard nodes>,
+ <entropy for countries>,
+ <max entropy for countries>,
+ <entropy for AS>,
+ <max entropy for AS>
+
+Consensuses to use as input can be fetched with:
+rsync -arz --delete metrics.torproject.org::metrics-recent/relay-descriptors/consensuses in
+"""
+
+import sys
+import math
+import os
+import pygeoip
+import StringIO
+import stem.descriptor
+
+from optparse import OptionParser
+from binascii import b2a_hex, a2b_base64, a2b_hex
+from stem.descriptor.server_descriptor import RelayDescriptor, BridgeDescriptor
+
+class Router:
+    """A single relay taken from a consensus document.
+
+    The attributes are filled in piecemeal by run() as it meets the 'r',
+    's' and 'w' lines of a router status entry.  The lookups rely on the
+    module-level globals gi_db, as_db and options that the script's
+    __main__ section creates.
+    """
+
+    def __init__(self):
+        self.bandwidth = None       # int from the 'w' line, None until seen
+        self.advertised_bw = None   # from the server descriptor, 0 if unknown
+        self.country = None         # result of gi_db.country_code_by_addr()
+        self.as_no = None           # first word of as_db.org_by_addr(), or ""
+        self.is_exit = False        # "Exit" flag present and "BadExit" absent
+        self.is_guard = False       # "Guard" flag present
+        self.cw = None              # consensus weight, assigned by run()
+
+    def add_router_info(self, values):
+        """Record descriptor-derived data from the fields of an 'r' line.
+
+        values -- the whitespace-split 'r' line without the leading "r".
+                  Field 2 is used as the base64 descriptor digest and
+                  field 5 as the relay's IP address.
+        """
+        # The digest is stored without base64 padding; restore the single
+        # '=' before decoding it.
+        hex_digest = b2a_hex(a2b_base64(values[2] + "="))
+        self.advertised_bw = self.get_advertised_bw(hex_digest)
+        ip = values[5]
+        self.country = gi_db.country_code_by_addr(ip)
+        self.as_no = self.get_as_details(ip)
+
+    def add_weights(self, values):
+        """Store the bandwidth from the fields of a 'w' line.
+
+        values -- list whose first item has the form "<name>=<integer>".
+        """
+        self.bandwidth = int(values[0].split('=')[1])
+
+    def add_flags(self, values):
+        """Set is_exit / is_guard from the list of flags on an 's' line."""
+        if "Exit" in values and "BadExit" not in values:
+            self.is_exit = True
+        if "Guard" in values:
+            self.is_guard = True
+
+    def get_as_details(self, ip):
+        """Return the first word of the AS database entry for ip, or ""."""
+        # presumably the entry looks like "AS<number> <name>" -- TODO confirm
+        try:
+            return as_db.org_by_addr(str(ip)).split()[0]
+        except Exception:
+            # Best effort: unknown address, empty entry or missing database.
+            return ""
+
+    def get_advertised_bw(self, hex_digest):
+        """Return the advertised bandwidth of the descriptor hex_digest.
+
+        The descriptor is read from the file named hex_digest inside the
+        options.server_desc directory.  The result is the minimum of its
+        average, burst and observed bandwidth, or 0 if the descriptor
+        cannot be read or parsed.
+        """
+        try:
+            # os.path.join copes with a directory given without a
+            # trailing slash.
+            path = os.path.join(options.server_desc, hex_digest)
+            with open(path) as f:
+                data = f.read()
+
+            desc_iter = stem.descriptor.server_descriptor.parse_file(
+                StringIO.StringIO(data))
+            desc = list(desc_iter)[0]
+            return min(desc.average_bandwidth, desc.burst_bandwidth,
+                       desc.observed_bandwidth)
+        except Exception:
+            # Best effort: a missing or malformed descriptor counts as 0.
+            return 0
+
+def parse_bw_weights(values):
+    """Parse the items of a 'bandwidth-weights' consensus line.
+
+    values -- list of "<key>=<number>" strings.
+
+    Returns a dict mapping every key to its number divided by 10000 (as a
+    float), or None if values cannot be parsed in that form.
+    """
+    data = {}
+    try:
+        for item in values:
+            key, raw = item.split("=")
+            data[key] = float(raw) / 10000
+    except (ValueError, AttributeError, TypeError):
+        # ValueError: not exactly one '=' or a non-numeric value;
+        # the other two cover items / containers of the wrong type.
+        return None
+    return data
+
+def _parse_consensus(file_name):
+    """Read a consensus file; return (routers, valid_after, weights)."""
+    routers = []
+    router = None
+    valid_after = ''
+    # Every weight defaults to 1 when the consensus does not supply it.
+    weights = {'Wed': 1, 'Wee': 1, 'Wgd': 1, 'Wgg': 1}
+    with open(file_name, 'r') as f:
+        for line in f:
+            fields = line.split()
+            if not fields:
+                # blank line: nothing to dispatch on
+                continue
+            key, values = fields[0], fields[1:]
+            if key == 'r':
+                router = Router()
+                routers.append(router)
+                router.add_router_info(values)
+            elif key == 's' and router is not None:
+                router.add_flags(values)
+            elif key == 'w' and router is not None:
+                router.add_weights(values)
+            elif key == 'valid-after':
+                valid_after = ' '.join(values)
+            elif key == 'bandwidth-weights':
+                parsed = parse_bw_weights(values) or {}
+                for name in weights:
+                    weights[name] = parsed.get(name, weights[name])
+    return routers, valid_after, weights
+
+
+def _assign_consensus_weights(routers, weights):
+    """Set router.cw on every guard and/or exit relay that has bandwidth."""
+    for router in routers:
+        if not router.bandwidth:
+            # should we consider relays with 'None' bandwidth?
+            continue
+        # NOTE(review): relays that are neither guard nor exit keep
+        # cw = None, as in the original code -- confirm this is intended.
+        if router.is_guard and router.is_exit:
+            router.cw = weights['Wgd'] * weights['Wed'] * router.bandwidth
+        elif router.is_guard:
+            router.cw = weights['Wgg'] * router.bandwidth
+        elif router.is_exit:
+            router.cw = weights['Wee'] * router.bandwidth
+
+
+def _shannon_entropy(shares):
+    """Return the base-2 entropy of the distribution given by shares."""
+    total = sum(shares)
+    if not total:
+        # nothing to distribute (e.g. the applicable weight is 0)
+        return 0.0
+    entropy = 0.0
+    for share in shares:
+        p = float(share) / float(total)
+        if p > 0:
+            entropy -= p * math.log(p, 2)
+    return entropy
+
+
+def _max_entropy(count):
+    """Entropy of a uniform distribution over count values: log2(count)."""
+    return math.log(count, 2) if count else 0.0
+
+
+def _entropy_columns(routers, weights):
+    """Return the ten entropy columns for routers, or None without bandwidth.
+
+    Column order: all relays, exits, guards, countries, AS -- each as an
+    (entropy, max entropy) pair.
+    """
+    all_bw, guard_bw, exit_bw = [], [], []
+    bw_countries, bw_as = {}, {}
+    for router in routers:
+        bw = router.bandwidth
+        if not bw:
+            continue
+        all_bw.append(bw)
+        if router.is_guard and router.is_exit:
+            guard_bw.append(weights['Wgd'] * bw)
+            exit_bw.append(weights['Wed'] * bw)
+        elif router.is_guard:
+            guard_bw.append(weights['Wgg'] * bw)
+        elif router.is_exit:
+            exit_bw.append(weights['Wee'] * bw)
+        bw_countries[router.country] = bw_countries.get(router.country, 0) + bw
+        bw_as[router.as_no] = bw_as.get(router.as_no, 0) + bw
+
+    if sum(all_bw) == 0:
+        return None
+
+    columns = [
+        _shannon_entropy(all_bw),
+        # the "all nodes" maximum counts every remaining relay, including
+        # those without bandwidth, as the original code did
+        math.log(len(routers), 2),
+        _shannon_entropy(exit_bw),
+        _max_entropy(len(exit_bw)),
+        _shannon_entropy(guard_bw),
+        _max_entropy(len(guard_bw)),
+        _shannon_entropy(list(bw_countries.values())),
+        _max_entropy(len(bw_countries)),
+        _shannon_entropy(list(bw_as.values())),
+        _max_entropy(len(bw_as)),
+    ]
+    return [str(column) for column in columns]
+
+
+def run(file_name):
+    """Compute entropy rows for the consensus stored in file_name.
+
+    The relays are sorted by consensus weight.  One CSV row is produced
+    for the full relay list, then the relays sharing the smallest
+    consensus weight are dropped and the next row is produced, and so on
+    until at most one relay is left.  Each row holds valid-after, the
+    minimum consensus weight, the number of relays, and the entropy /
+    maximum entropy pairs for all relays, exits, guards, countries and AS.
+
+    Returns the rows joined by newlines, or None if the file contains no
+    'r' lines.  The result is an empty string when no relay has any
+    bandwidth.
+    """
+    routers, valid_after, weights = _parse_consensus(file_name)
+    if not routers:
+        return
+
+    _assign_consensus_weights(routers, weights)
+
+    # Sort by consensus weight, relays without one first.  The tuple key
+    # avoids comparing None with numbers, which only Python 2 permits.
+    routers.sort(key=lambda router: (router.cw is not None, router.cw or 0))
+
+    result_string = []
+    while len(routers) > 1:
+        # first relay has smallest cw
+        min_cw = routers[0].cw
+
+        columns = _entropy_columns(routers, weights)
+        if columns is None:
+            # no bandwidth left; keep the rows gathered so far
+            break
+        result_string.append(
+            ','.join([valid_after, str(min_cw), str(len(routers))] + columns))
+
+        # Remove every relay with the minimum cw.  They form a prefix of
+        # the sorted list, so cut it off in one go rather than deleting
+        # while iterating (which skips every other element).
+        cut = 1
+        while cut < len(routers) and routers[cut].cw == min_cw:
+            cut += 1
+        del routers[:cut]
+
+    return '\n'.join(result_string)
+
+def parse_args():
+    """Parse the command line (sys.argv) and return the options object.
+
+    The returned object carries the attributes gi_db, as_db, server_desc,
+    output and consensus; positional arguments are ignored.
+    """
+    option_table = (
+        (("-g", "--geoip"),
+         dict(dest="gi_db", default="GeoIP.dat",
+              help="Input GeoIP database")),
+        (("-a", "--as"),
+         dict(dest="as_db", default="GeoIPASNum.dat",
+              help="Input AS GeoIP database")),
+        (("-s", "--server_desc"),
+         dict(dest="server_desc",
+              default="data/relay-descriptors/server-descriptors/",
+              help="Server descriptors directory")),
+        (("-o", "--output"),
+         dict(dest="output", default="entropy.csv",
+              help="Output filename")),
+        (("-c", "--consensus"),
+         dict(dest="consensus", default="in/consensus",
+              help="Input consensus dir")),
+    )
+
+    parser = OptionParser("Usage - python pyentropy.py [options]")
+    for flags, settings in option_table:
+        parser.add_option(*flags, **settings)
+
+    # parse_args() yields (options, leftover positional args)
+    return parser.parse_args()[0]
+
+if __name__ == "__main__":
+    # options, gi_db and as_db are deliberately module-level globals:
+    # the Router methods look them up by name.
+    options = parse_args()
+    gi_db = pygeoip.GeoIP(options.gi_db)
+    as_db = pygeoip.GeoIP(options.as_db)
+
+    with open(options.output, 'w') as f:
+        # os.listdir() returns entries in arbitrary order; sort them so
+        # the rows of the output file come out in a reproducible order.
+        for file_name in sorted(os.listdir(options.consensus)):
+            rows = run(os.path.join(options.consensus, file_name))
+            if rows:
+                f.write("%s\n" % (rows))