[tor-commits] [metrics-tasks/master] Add documentation.

karsten at torproject.org karsten at torproject.org
Wed Jun 6 13:10:43 UTC 2012


commit ad7d50bd436e2994c34fd454e969704b4902e418
Author: George Kadianakis <desnacked at riseup.net>
Date:   Tue Jun 5 18:09:16 2012 +0300

    Add documentation.
---
 task-2718/detector.py |   76 ++++++++++++++++++++++++++++++++++--------------
 1 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/task-2718/detector.py b/task-2718/detector.py
index a3d073c..1d6b4c2 100644
--- a/task-2718/detector.py
+++ b/task-2718/detector.py
@@ -38,14 +38,14 @@
 ##  anomalies that might be indicative of censorship.
 
 # Dep: matplotlib
-from pylab import * 
+from pylab import *
 import matplotlib
 
 # Dep: numpy
-import numpy 
+import numpy
 
 # Dep: scipy
-import scipy.stats 
+import scipy.stats
 from scipy.stats.distributions import norm
 from scipy.stats.distributions import poisson
 
@@ -56,7 +56,18 @@ import os.path
 
 days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
 
-# read the .csv file
+"""
+Represents a .csv file containing information on the number of
+connecting Tor users per country.
+
+'store': Dictionary with (<country code>, <counter>) as key, and the number of users as value.
+         <country code> can also be "date"...
+'all_dates': List of the data intervals (with default timedelta: 1 day).
+'country_codes': List of all relevant country codes.
+'MAX_INDEX': Length of store, number of country codes etc.
+'date_min': The oldest date found in the .csv.
+'date_max': The latest date found in the .csv.
+"""
 class torstatstore:
   def __init__(self, file_name):
     f = file(file_name)
@@ -72,13 +83,13 @@ class torstatstore:
             processed_val = None
             if ccode == "date":
                 try:
-                    year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])                
+                    year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
                     processed_val = date(year, month, day)
                 except Exception, e:
                     print "Parsing error (ignoring line %s):" % j
                     print "%s" % val,e
-                    break            
-            
+                    break
+
             elif val != "NA":
                 processed_val = int(val)
             store[(ccode, i)] = processed_val
@@ -91,7 +102,7 @@ class torstatstore:
     d = date_min
     dt = timedelta(days=1)
     while d <= date_max:
-        all_dates += [d]    
+        all_dates += [d]
         d = d + dt
 
     # Save for later
@@ -102,6 +113,9 @@ class torstatstore:
     self.date_min = date_min
     self.date_max = date_max
 
+  """Return a list representing a time series of 'ccode' with respect
+  to the number of connected users.
+  """
   def get_country_series(self, ccode):
     assert ccode in self.country_codes
     series = {}
@@ -114,6 +128,10 @@ class torstatstore:
         sx += [series[d]]
     return sx
 
+  """Return an ordered list containing tuples of the form (<number of
+  users>, <country code>). The list is ordered with respect to the
+  number of users for each country.
+  """
   def get_largest(self, number):
     exclude = set(["all", "??", "date"])
     l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
@@ -121,6 +139,9 @@ class torstatstore:
     l.reverse()
     return l[:number]
 
+  """Return a dictionary, with <country code> as key, and the time
+  series of the country code as the value.
+  """
   def get_largest_locations(self, number):
     l = self.get_largest(number)
     res = {}
@@ -128,14 +149,16 @@ class torstatstore:
       res[ccode] = self.get_country_series(ccode)
     return res
 
-# Computes the difference between today and a number of days in the past
+"""Return a list containing lists (?) where each such list contains
+the difference in users for a time delta of 'days'
+"""
 def n_day_rel(series, days):
   rel = []
   for i, v in enumerate(series):
     if series[i] is None:
       rel += [None]
       continue
-    
+
     if i - days < 0 or series[i-days] is None or series[i-days] == 0:
       rel += [None]
     else:
@@ -175,7 +198,7 @@ def make_tendencies_minmax(l, INTERVAL = 1):
   return minx, maxx
 
 # Makes pretty plots
-def raw_plot(series, minc, maxc, labels, xtitle):    
+def raw_plot(series, minc, maxc, labels, xtitle):
     assert len(xtitle) == 3
     fname, stitle, slegend = xtitle
 
@@ -185,19 +208,19 @@ def raw_plot(series, minc, maxc, labels, xtitle):
     matplotlib.rc('font', **font)
 
     ylim( (-max(series)*0.1, max(series)*1.1) )
-    plot(labels, series, linewidth=1.0, label="Users")    
+    plot(labels, series, linewidth=1.0, label="Users")
 
     wherefill = []
     for mm,mx in zip(minc, maxc):
-      wherefill += [not (mm == None and mx == None)] 
+      wherefill += [not (mm == None and mx == None)]
       assert mm < mx or (mm == None and mx == None)
-          
+
     fill_between(labels, minc, maxc, where=wherefill, color="gray", label="Prediction")
 
     vdown = []
     vup = []
     for i,v in enumerate(series):
-      if minc[i] != None and v < minc[i]: 
+      if minc[i] != None and v < minc[i]:
         vdown += [v]
         vup += [None]
       elif maxc[i] != None and v > maxc[i]:
@@ -206,7 +229,7 @@ def raw_plot(series, minc, maxc, labels, xtitle):
       else:
         vup += [None]
         vdown += [None]
-    
+
     plot(labels, vdown, 'o', ms=10, lw=2, alpha=0.5, mfc='orange', label="Downturns")
     plot(labels, vup, 'o', ms=10, lw=2, alpha=0.5, mfc='green', label="Upturns")
 
@@ -235,9 +258,15 @@ def absolute_plot(series, minc, maxc, labels,INTERVAL, xtitle):
     else:
       in_minc += [None]
       in_maxc += [None]
-  raw_plot(series, in_minc, in_maxc, labels, xtitle)    
+  raw_plot(series, in_minc, in_maxc, labels, xtitle)
 
-# Censorship score by jurisdiction
+"""Return the number of downscores and upscores of a time series
+'series', given tendencies 'minc' and 'maxc' for the time interval
+'INTERVAL'.
+
+If 'scoring_interval' is specified we only consider upscore/downscore
+that happened in the latest 'scoring_interval' days.
+"""
 def censor_score(series, minc, maxc, INTERVAL):
   upscore = 0
   downscore = 0
@@ -263,17 +292,17 @@ def plot_all(tss, minx, maxx, INTERV, DAYS=None, rdir="img"):
     return
 
   summary_file = file(os.path.join(rdir, "summary.txt"), "w")
-  
+
   if DAYS == None:
     DAYS = 6*31
-    
+
   s = tss.get_largest(200)
   scores = []
   for num, li in s:
     print ".",
     ds,us = censor_score(tss.get_country_series(li)[-DAYS:], minx[-DAYS:], maxx[-DAYS:], INTERV)
     # print ds, us
-    scores += [(ds,num, us, li)]  
+    scores += [(ds,num, us, li)]
   scores.sort()
   scores.reverse()
   s = "\n=======================\n"
@@ -290,6 +319,7 @@ def plot_all(tss, minx, maxx, INTERV, DAYS=None, rdir="img"):
       plot_target(tss, c,xtitle, minx, maxx, DAYS, INTERV)
   summary_file.close()
 
+"""Write a CSV report on the minimum/maximum users of each country per date."""
 def write_all(tss, minc, maxc, INTERVAL=7):
   ranges_file = file("direct-users-ranges.csv", "w")
   ranges_file.write("date,country,minusers,maxusers\n")
@@ -312,9 +342,11 @@ def main():
   # Change these to customize script
   CSV_FILE = "direct-users.csv"
   GRAPH_DIR = "img"
+  # Time interval to model connection rates.
   INTERV = 7
+  # Consider maximum DAYS days back.
   DAYS= 6 * 31
-  
+
   tss = torstatstore(CSV_FILE)
   l = tss.get_largest_locations(50)
   minx, maxx = make_tendencies_minmax(l, INTERV)





More information about the tor-commits mailing list