commit e456caea6134de661d24be8a253394e323ac025a
Author: Karsten Loesing <karsten.loesing@gmx.net>
Date:   Wed Jul 4 23:11:06 2012 +0200
Add #6232 code written by gsathya and others.
Extract consensus weights from consensuses and calculate the Shannon
Entropy for them. Most of this code was written by gsathya and later
refined by asn and phw.
---
 task-6232/.gitignore       |    3 ++
 task-6232/plot-entropy.R   |   10 +++++
 task-6232/pyentropy.py     |   79 ++++++++++++++++++++++++++++++++++++++++++++
 task-6232/run-pyentropy.py |    5 +++
 4 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/task-6232/.gitignore b/task-6232/.gitignore
new file mode 100644
index 0000000..8a0c627
--- /dev/null
+++ b/task-6232/.gitignore
@@ -0,0 +1,3 @@
+in/
+entropy.csv
+
diff --git a/task-6232/plot-entropy.R b/task-6232/plot-entropy.R
new file mode 100644
index 0000000..1334b88
--- /dev/null
+++ b/task-6232/plot-entropy.R
@@ -0,0 +1,10 @@
+library(ggplot2)
+d <- read.csv("entropy.csv", header = FALSE,
+  col.names = c("validafter", "entropy"))
+ggplot(d, aes(x = as.POSIXct(validafter), y = entropy)) +
+geom_line() +
+scale_x_datetime(name = "\nDate") +
+scale_y_continuous(name = "Entropy\n")
+ggsave("entropy.png", width = 8, height = 6, dpi = 100)
+
+
diff --git a/task-6232/pyentropy.py b/task-6232/pyentropy.py
new file mode 100644
index 0000000..f13f709
--- /dev/null
+++ b/task-6232/pyentropy.py
@@ -0,0 +1,79 @@
+"""
+Usage - python pyentropy.py <consensus-dir> <output-file>
+Output - A CSV file of the format <valid-after>,<entropy>
+rsync -arz --delete metrics.torproject.org::metrics-recent/relay-descriptors/consensuses in
+"""
+
+import sys
+import math
+import os
+from decimal import *
+
+RESULTS = []
+KEYS = ['r','s','v','w','p','m']
+
+
+class Router:
+    def __init__(self):
+        self.lines = []
+        self.nick = None
+        self.bandwidth = None
+        self.flags = None
+        self.probability = None
+
+    def add(self, key, values):
+        if key == 'r':
+            self.nick = values[0]
+        if key == 'w':
+            self.bandwidth = int(values[0].split('=')[1])
+        if key == 's':
+            self.flags = values
+
+
+def run(file_name):
+    routers = []
+    # parse consensus
+    with open(file_name, 'r') as f:
+        for line in f.readlines():
+            key = line.split()[0]
+            values = line.split()[1:]
+            if key =='r':
+                router = Router()
+                router.add(key, values)
+            elif key == 'p':
+                router.add(key, values)
+                routers.append(router)
+            elif key == 'valid-after':
+                valid_after = ' '.join(values)
+            elif key in KEYS:
+                router.add(key, values)
+
+    # build hash table with freq. distribution
+    # key: bandwidth
+    # value: number of bandwidth's observations
+    bw_dist = {}
+    for router in routers:
+        if bw_dist.has_key(router.bandwidth):
+            bw_dist[router.bandwidth] += 1
+        else:
+            bw_dist[router.bandwidth] = 1
+
+    if len(routers) <= 0:
+        print "Error: amount of routers must be > 0."
+        return;
+
+    print "calculating entropy"
+    entropy = 0.0
+    for bw in bw_dist.iterkeys():
+        # p = probability of one particular bandwidth
+        p = float(bw_dist[bw]) / len(routers)
+        entropy += -(p * math.log(p, 2))
+
+    return ",".join([valid_after, str(entropy)])
+
+
+if __name__ == "__main__":
+    with open(sys.argv[2], 'w') as f:
+        for file_name in os.listdir(sys.argv[1]):
+            string = run(os.path.join(sys.argv[1], file_name))
+            f.write("%s\n" % (string))
diff --git a/task-6232/run-pyentropy.py b/task-6232/run-pyentropy.py
new file mode 100755
index 0000000..a94a7d6
--- /dev/null
+++ b/task-6232/run-pyentropy.py
@@ -0,0 +1,5 @@
+#!/bin/bash
+#### Uncomment to use most recent data instead of extracted tarballs
+###rsync -arz --delete metrics.torproject.org::metrics-recent/relay-descriptors/consensuses in
+python pyentropy.py in/consensuses/ entropy.csv
+