[tor-commits] [metrics-tasks/master] Add probability graphs (#1854).

karsten at torproject.org
Tue Nov 27 01:42:29 UTC 2012


commit ad05f28662ddb22c44c88111315ae5b3f2ae7f66
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Nov 26 20:41:29 2012 -0500

    Add probability graphs (#1854).
---
 task-1854/plot-entropy.R |   22 ++++++++++++++-
 task-1854/pyextract.py   |   10 +++++-
 task-1854/pylinf.py      |   66 +++++++++++++++++++++++++++++++++------------
 3 files changed, 77 insertions(+), 21 deletions(-)
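
The data flow added here: pylinf.py writes linf.csv and prob.csv, pyextract.py reduces them to linf-extracted.csv and prob-extracted.csv, and plot-entropy.R draws the graphs. A minimal sketch of reading the probability file in Python, with the column layout taken from the read.csv call below (the snippet is illustrative, not part of the commit):

    import csv

    # Columns as implied by plot-entropy.R: consensus valid-after,
    # advertised-bandwidth cutoff in B/s, advertised bandwidth in B/s,
    # and cumulative probability of selecting a relay up to that bandwidth.
    with open('prob-extracted.csv') as csv_fh:
        for validafter, minadvbw, advbw, cumprob in csv.reader(csv_fh):
            print('%s: P(adv bw <= %s | cutoff %s) = %s'
                  % (validafter, advbw, minadvbw, cumprob))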

diff --git a/task-1854/plot-entropy.R b/task-1854/plot-entropy.R
index 95b8b18..d62657e 100644
--- a/task-1854/plot-entropy.R
+++ b/task-1854/plot-entropy.R
@@ -2,7 +2,27 @@ library(ggplot2)
 library(reshape)
 library(scales)
 
-e <- read.csv("extracted.csv", header = FALSE,
+p <- read.csv("prob-extracted.csv", header = FALSE,
+  col.names = c("validafter", "minadvbw", "advbw", "cumprob"),
+  stringsAsFactors = FALSE)
+p <- p[p$minadvbw >= 20480, ]
+c <- data.frame(x = p$advbw, y = p$cumprob,
+  colour = as.factor(p$minadvbw))
+ggplot(c, aes(x = x, y = y, colour = colour)) +
+geom_line() +
+scale_x_log10(name = "\nAdvertised bandwidth in B/s (log scale)") +
+scale_y_continuous(name = "Cumulative probability\n") +
+scale_colour_hue(name = "Adv. bw. cutoff in B/s") +
+opts(legend.position = "top")
+
+ggplot(c, aes(x = x, y = y, colour = colour)) +
+geom_line() +
+scale_x_log10(name = "\nAdvertised bandwidth in B/s (log scale)") +
+scale_y_log10(name = "Cumulative probability (log scale)\n") +
+scale_colour_hue(name = "Adv. bw. cutoff in B/s") +
+opts(legend.position = "top")
+
+e <- read.csv("linf-extracted.csv", header = FALSE,
   col.names = c("validafter", "min_adv_bw", "relays", "linf",
   "excl_adv_bw", "graph"), stringsAsFactor = FALSE)
 
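The two ggplot calls above draw the same cumulative distribution twice, once with a linear y axis and once log-log. For readers more at home in Python, a rough matplotlib equivalent (matplotlib is an assumption; the commit itself only plots in R):

    import csv
    from collections import defaultdict
    import matplotlib.pyplot as plt

    # Group (advertised bandwidth, cumulative probability) points by
    # cutoff, applying the same >= 20480 B/s filter as the R code above.
    series = defaultdict(list)
    with open('prob-extracted.csv') as csv_fh:
        for validafter, minadvbw, advbw, cumprob in csv.reader(csv_fh):
            if int(minadvbw) >= 20480:
                series[int(minadvbw)].append((int(advbw), float(cumprob)))

    for cutoff in sorted(series):
        xs, ys = zip(*sorted(series[cutoff]))
        plt.plot(xs, ys, label='%d' % cutoff)
    plt.xscale('log')     # for the log-log variant, also plt.yscale('log')
    plt.xlabel('Advertised bandwidth in B/s (log scale)')
    plt.ylabel('Cumulative probability')
    plt.legend(title='Adv. bw. cutoff in B/s')
    plt.show()
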
diff --git a/task-1854/pyextract.py b/task-1854/pyextract.py
index 33614e2..bd11ea4 100644
--- a/task-1854/pyextract.py
+++ b/task-1854/pyextract.py
@@ -2,11 +2,11 @@ import os
 import sys
 
 def main():
-    out_file = open('extracted.csv', 'w')
+    out_file = open('linf-extracted.csv', 'w')
     prev_validafter, max_validafter = '', ''
     max_lines = []
     prev_relays, prev_min_adv_bw = 0, 0
-    for line in open('entropy.csv'):
+    for line in open('linf.csv'):
         parts = line.strip().split(',')
         validafter = parts[0]
         min_adv_bw = int(parts[1])
@@ -32,6 +32,12 @@ def main():
         prev_min_adv_bw = min_adv_bw
     for line in max_lines:
         out_file.write(line + ",last\n")
+    out_file.close()
+    prob_out_file = open('prob-extracted.csv', 'w')
+    for line in open('prob.csv'):
+        if line.startswith(max_validafter):
+            prob_out_file.write(line.strip() + '\n')
+    prob_out_file.close()
 
 if __name__ == '__main__':
     main()
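
The block added to pyextract.py makes a second pass over prob.csv and keeps only rows whose valid-after timestamp matches the newest consensus found in the first pass. A toy run of the same startswith() filter (rows and timestamp format are made up for illustration):

    # Keep only the rows belonging to the newest consensus.
    max_validafter = '2012-11-26-00-00-00'
    prob_rows = ['2012-11-25-23-00-00,10240,20480,0.12',
                 '2012-11-26-00-00-00,10240,20480,0.15',
                 '2012-11-26-00-00-00,20480,51200,0.40']
    latest = [row for row in prob_rows if row.startswith(max_validafter)]
    print('\n'.join(latest))   # prints the two rows of the newest consensus
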
diff --git a/task-1854/pylinf.py b/task-1854/pylinf.py
index 88cc773..3bdd8a2 100644
--- a/task-1854/pylinf.py
+++ b/task-1854/pylinf.py
@@ -102,9 +102,12 @@ def load_server_desc(tar_file_path):
         tar_fh.close()
 
 def run(data):
+    """ Return tuple of two strings, one string containing linf values for
+        all possible advertised bandwidth cutoffs, and one containing
+        probability distributions for predefined cutoffs. """
     routers = []
     router = None
-    result_string = []
+    linf_string, prob_string = [], []
     Wed, Wee, Wgd, Wgg = 1, 1, 1, 1
 
     # parse consensus
@@ -151,6 +154,8 @@ def run(data):
     omitted_routers = 0
     min_adv_bw = routers[0].advertised_bw
 
+    cutoffs = [10240, 20480, 51200, 102400, 204800, 512000, 1048576]
+
     while(omitted_routers<len(routers)):
         total_bw = 0
 
@@ -170,11 +175,29 @@ def run(data):
             diff = abs(new_prob - router.prob)
             prob_diff.append(diff)
 
-        result_string.append(','.join([valid_after,
+        linf_string.append(','.join([valid_after,
                                       str(min_adv_bw),
                                       str(len(routers)-omitted_routers),
                                       str(max(prob_diff))]))
 
+        while len(cutoffs) > 0 and min_adv_bw > cutoffs[0]:
+            cumulated_prob = 0.0
+            prev_advertised_bw = 0
+            for router in routers:
+                if router.advertised_bw > cutoffs[0] and \
+                        prev_advertised_bw != router.advertised_bw:
+                    prob_string.append(','.join([valid_after,
+                                                 str(cutoffs[0]),
+                                                 str(prev_advertised_bw),
+                                                 str(cumulated_prob)]))
+                prev_advertised_bw = router.advertised_bw
+                cumulated_prob += float(router.bandwidth)/float(total_bw)
+            prob_string.append(','.join([valid_after,
+                                         str(cutoffs[0]),
+                                         str(prev_advertised_bw),
+                                         str(cumulated_prob)]))
+            cutoffs.pop(0)
+
         # remove routers with min adv_bw
         for router in routers:
             if router.advertised_bw == min_adv_bw:
@@ -184,7 +207,7 @@ def run(data):
                 min_adv_bw = router.advertised_bw
                 break
 
-    return '\n'.join(result_string)
+    return ('\n'.join(linf_string), '\n'.join(prob_string))
 
 def parse_args():
     usage = "Usage - python pyentropy.py [options]"
@@ -196,8 +219,10 @@ def parse_args():
                       help="Input AS GeoIP database")
     parser.add_option("-s", "--server_desc", dest="server_desc",
                       default=False, help="Server descriptors directory")
-    parser.add_option("-o", "--output", dest="output", default="entropy.csv",
-                      help="Output filename")
+    parser.add_option("-l", "--linf-output", dest="linf", default="linf.csv",
+                      help="linf output filename")
+    parser.add_option("-r", "--prob-output", dest="prob", default="prob.csv",
+                      help="Probabilities output filename")
     parser.add_option("-c", "--consensus", dest="consensus", default="in/consensus",
                       help="Input consensus dir")
     parser.add_option("-p", "--pickled_data", dest="pickled_data", default=False,
@@ -227,16 +252,21 @@ if __name__ == "__main__":
         with open('data.pkl', 'wb') as output:
             pickle.dump(descriptors, output)
 
-    with open(options.output, 'w') as out_fh:
-        for file_name in os.listdir(options.consensus):
-            file_path = os.path.join(options.consensus, file_name)
-            tar_fh = tarfile.open(file_path)
-            for member in tar_fh:
-                if not member.isfile():
-                    continue
-                tar_file_data=tar_fh.extractfile(member)
-                data=tar_file_data.read()
-                output_string = run(data)
-                if output_string:
-                    out_fh.write("%s\n" % (output_string))
-            tar_fh.close()
+    linf_fh = open(options.linf, 'w')
+    prob_fh = open(options.prob, 'w')
+    for file_name in os.listdir(options.consensus):
+        file_path = os.path.join(options.consensus, file_name)
+        tar_fh = tarfile.open(file_path)
+        for member in tar_fh:
+            if not member.isfile():
+                continue
+            tar_file_data = tar_fh.extractfile(member)
+            data = tar_file_data.read()
+            (linf_string, prob_string) = run(data)
+            if linf_string:
+                linf_fh.write("%s\n" % (linf_string))
+            if prob_string:
+                prob_fh.write("%s\n" % (prob_string))
+        tar_fh.close()
+    linf_fh.close()
+    prob_fh.close()
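
The new cutoff loop in run() emits one line of the cumulative distribution per distinct advertised-bandwidth value, as soon as the minimum advertised bandwidth climbs past the next predefined cutoff. A self-contained sketch of that computation on a toy router list (bandwidth values are invented; routers are sorted by advertised bandwidth as in the script, and the valid-after column is omitted):

    # Each (advertised_bw, consensus_bw) pair stands in for a relay;
    # the list must be sorted by advertised bandwidth, smallest first.
    routers = [(25000, 10), (51200, 30), (51200, 20), (204800, 40)]
    total_bw = sum(bw for _, bw in routers)
    cutoff = 20480
    cumulated_prob = 0.0
    prev_advertised_bw = 0
    for advertised_bw, bandwidth in routers:
        # Emit a CDF point whenever the advertised bandwidth changes.
        if advertised_bw > cutoff and prev_advertised_bw != advertised_bw:
            print('%d,%d,%.4f' % (cutoff, prev_advertised_bw, cumulated_prob))
        prev_advertised_bw = advertised_bw
        cumulated_prob += float(bandwidth) / float(total_bw)
    print('%d,%d,%.4f' % (cutoff, prev_advertised_bw, cumulated_prob))

With the renamed options, a full run would look like python pylinf.py -l linf.csv -r prob.csv -c in/consensus, after which pyextract.py and plot-entropy.R pick up the two output files.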


