[tor-commits] [metrics-tasks/master] Add George's censorship detector script.

Fri Mar 25 09:34:19 UTC 2011

commit f5d257dff6af41dbbe33ab20c5bbb218a38c8cd8
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Fri Mar 25 10:29:38 2011 +0100

    Add George's censorship detector script.
---
 task-2718/detector.py |  306 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 306 insertions(+), 0 deletions(-)

diff --git a/task-2718/detector.py b/task-2718/detector.py
new file mode 100644
index 0000000..0370d02
--- /dev/null
+++ b/task-2718/detector.py
@@ -0,0 +1,306 @@
+##  Copyright (c) 2011 George Danezis <gdane at microsoft.com>
+##
+##  All rights reserved.
+##
+##  Redistribution and use in source and binary forms, with or without
+##  modification, are permitted (subject to the limitations in the
+##  disclaimer below) provided that the following conditions are met:
+##
+##   * Redistributions of source code must retain the above copyright
+##     notice, this list of conditions and the following disclaimer.
+##
+##   * Redistributions in binary form must reproduce the above copyright
+##     notice, this list of conditions and the following disclaimer in the
+##     documentation and/or other materials provided with the
+##     distribution.
+##
+##   * Neither the name of <Owner Organization> nor the names of its
+##     contributors may be used to endorse or promote products derived
+##     from this software without specific prior written permission.
+##
+##  NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+##  GRANTED BY THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+##  HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+##  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+##  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+##  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+##  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+##  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+##  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+##  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+##  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+##  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+##  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+##
+##  (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
+
+##  This script reads a .csv file of the number of Tor users and finds
+##  anomalies that might be indicative of censorship.
+
+# Dep: matplotlib
+from pylab import * 
+import matplotlib
+
+# Dep: numpy
+import numpy 
+
+# Dep: scipy
+import scipy.stats 
+from scipy.stats.distributions import norm
+from scipy.stats.distributions import poisson
+
+# Std lib
+from datetime import date
+from datetime import timedelta
+import os.path
+
+days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+
+# read the .csv file
+class torstatstore:
+  def __init__(self, file_name):
+    f = file(file_name)
+    country_codes = f.readline()
+    country_codes = country_codes.strip().split(",")
+
+    store = {}
+    MAX_INDEX = 0
+    for i, line in enumerate(f):
+        MAX_INDEX += 1
+        line_parsed = line.strip().split(",")
+        for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)):
+            processed_val = None
+            if ccode == "date":
+                try:
+                    year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])                
+                    processed_val = date(year, month, day)
+                except Exception, e:
+                    print "Parsing error (ignoring line %s):" % j
+                    print "%s" % val,e
+                    break            
+            
+            elif val != "NA":
+                processed_val = int(val)
+            store[(ccode, i)] = processed_val
+
+    # min and max
+    date_min = store[("date", 0)]
+    date_max = store[("date", i)]
+
+    all_dates = []
+    d = date_min
+    dt = timedelta(days=1)
+    while d <= date_max:
+        all_dates += [d]    
+        d = d + dt
+
+    # Save for later
+    self.store = store
+    self.all_dates = all_dates
+    self.country_codes = country_codes
+    self.MAX_INDEX = MAX_INDEX
+    self.date_min = date_min
+    self.date_max = date_max
+
+  def get_country_series(self, ccode):
+    assert ccode in self.country_codes
+    series = {}
+    for d in self.all_dates:
+        series[d] = None
+    for i in range(self.MAX_INDEX):
+        series[self.store[("date", i)]] = self.store[(ccode, i)]
+    sx = []
+    for d in self.all_dates:
+        sx += [series[d]]
+    return sx
+
+  def get_largest(self, number):
+    exclude = set(["all", "??", "date"])
+    l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
+    l.sort()
+    l.reverse()
+    return l[:number]
+
+  def get_largest_locations(self, number):
+    l = self.get_largest(number)
+    res = {}
+    for _, ccode in l[:number]:
+      res[ccode] = self.get_country_series(ccode)
+    return res
+
+# Computes the difference between today and a number of days in the past
+def n_day_rel(series, days):
+  rel = []
+  for i, v in enumerate(series):
+    if series[i] is None:
+      rel += [None]
+      continue
+    
+    if i - days < 0 or series[i-days] is None or series[i-days] == 0:
+      rel += [None]
+    else:
+      rel += [ float(series[i]) / series[i-days]]
+  return rel
+
+# Main model: computes the expected min / max range of number of users
+def make_tendencies_minmax(l, INTERVAL = 1):
+  lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
+  c = lminus1[lminus1.keys()[0]]
+  dists = []
+  minx = []
+  maxx = []
+  for i in range(len(c)):
+    vals = [lminus1[ccode][i] for ccode in lminus1.keys() if lminus1[ccode][i] != None]
+    if len(vals) < 8:
+      dists += [None]
+      minx += [None]
+      maxx += [None]
+    else:
+      vals.sort()
+      median = vals[len(vals)/2]
+      q1 = vals[len(vals)/4]
+      q2 = vals[(3*len(vals))/4]
+      qd = q2 - q1
+      vals = [v for v in vals if median - qd*4 < v and  v < median + qd*4]
+      if len(vals) < 8:
+        dists += [None]
+        minx += [None]
+        maxx += [None]
+        continue
+      mu, signma = norm.fit(vals)
+      dists += [(mu, signma)]
+      maxx += [norm.ppf(0.9999, mu, signma)]
+      minx += [norm.ppf(1 - 0.9999, mu, signma)]
+  ## print minx[-1], maxx[-1]
+  return minx, maxx
+
+# Makes pretty plots
+def raw_plot(series, minc, maxc, labels, xtitle):    
+    assert len(xtitle) == 3
+    fname, stitle, slegend = xtitle
+
+    font = {'family' : 'Bitstream Vera Sans',
+        'weight' : 'normal',
+        'size'   : 8}
+    matplotlib.rc('font', **font)
+
+    ylim( (-max(series)*0.1, max(series)*1.1) )
+    plot(labels, series, linewidth=1.0, label="Users")    
+
+    wherefill = []
+    for mm,mx in zip(minc, maxc):
+      wherefill += [not (mm == None and mx == None)] 
+      assert mm < mx or (mm == None and mx == None)
+          
+    fill_between(labels, minc, maxc, where=wherefill, color="gray", label="Prediction")
+
+    vdown = []
+    vup = []
+    for i,v in enumerate(series):
+      if minc[i] != None and v < minc[i]: 
+        vdown += [v]
+        vup += [None]
+      elif maxc[i] != None and v > maxc[i]:
+        vdown += [None]
+        vup += [v]
+      else:
+        vup += [None]
+        vdown += [None]
+    
+    plot(labels, vdown, 'o', ms=10, lw=2, alpha=0.5, mfc='orange', label="Downturns")
+    plot(labels, vup, 'o', ms=10, lw=2, alpha=0.5, mfc='green', label="Upturns")
+
+    legend(loc=2)
+
+    xlabel('Time (days)')
+    ylabel('Users')
+    title(stitle)
+    grid(True)
+    F = gcf()
+
+    F.set_size_inches(10,5)
+    F.savefig(fname,  format="png", dpi = (150))
+    close()
+
+def absolute_plot(series, minc, maxc, labels,INTERVAL, xtitle):
+  in_minc = []
+  in_maxc = []
+  for i, v in enumerate(series):
+    if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None:
+      in_minc += [minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])]
+      in_maxc += [maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])]
+      if not in_minc[-1] < in_maxc[-1]:
+        print in_minc[-1], in_maxc[-1], series[i-INTERVAL], minc[i], maxc[i]
+      assert in_minc[-1] < in_maxc[-1]
+    else:
+      in_minc += [None]
+      in_maxc += [None]
+  raw_plot(series, in_minc, in_maxc, labels, xtitle)    
+
+# Censorship score by jurisdiction
+def censor_score(series, minc, maxc, INTERVAL):
+  upscore = 0
+  downscore = 0
+  for i, v in enumerate(series):
+    if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None:
+      in_minc = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
+      in_maxc = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
+      downscore += 1 if minc[i] != None and v < in_minc else 0
+      upscore += 1 if maxc[i] != None and v > in_maxc else 0
+  return downscore, upscore
+
+def plot_target(tss, TARGET, xtitle, minx, maxx, DAYS=365, INTERV = 7):
+  ctarget = tss.get_country_series(TARGET)
+  c = n_day_rel(ctarget, INTERV)
+  absolute_plot(ctarget[-DAYS:], minx[-DAYS:], maxx[-DAYS:], tss.all_dates[-DAYS:],INTERV, xtitle = xtitle)
+
+
+## Make a league table of censorship + nice graphs
+def plot_all(tss, minx, maxx, INTERV, DAYS=None, rdir="img"):
+  rdir = os.path.realpath(rdir)
+  if not os.path.exists(rdir) or not os.path.isdir(rdir):
+    print "ERROR: %s does not exist or is not a directory." % rdir
+    return
+
+  summary_file = file(os.path.join(rdir, "summary.txt"), "w")
+  
+  if DAYS == None:
+    DAYS = 6*31
+    
+  s = tss.get_largest(200)
+  scores = []
+  for num, li in s:
+    print ".",
+    ds,us = censor_score(tss.get_country_series(li)[-DAYS:], minx[-DAYS:], maxx[-DAYS:], INTERV)
+    # print ds, us
+    scores += [(ds,num, us, li)]  
+  scores.sort()
+  scores.reverse()
+  s = "\n=======================\n"
+  s+= "Report for %s to %s\n" % (tss.all_dates[-DAYS], tss.all_dates[-1])
+  s+= "=======================\n"
+  print s
+  summary_file.write(s)
+  for a,nx, b,c in scores:
+    if a > 0:
+      s = "%s -- down: %2d (up: %2d affected: %s)" % (c, a, b, nx)
+      print s
+      summary_file.write(s + "\n")
+      xtitle = (os.path.join(rdir, "%03d-%s-censor.png" % (a,c)), "Tor report for %s -- down: %2d (up: %2d affected: %s)" % (c, a, b, nx),"")
+      plot_target(tss, c,xtitle, minx, maxx, DAYS, INTERV)
+  summary_file.close()
+
+def main():
+  # Change these to customize script
+  CSV_FILE = "direct-users.csv"
+  GRAPH_DIR = "img"
+  INTERV = 7
+  DAYS= 6 * 31
+  
+  tss = torstatstore(CSV_FILE)
+  l = tss.get_largest_locations(50)
+  minx, maxx = make_tendencies_minmax(l, INTERV)
+  plot_all(tss, minx, maxx, INTERV, DAYS, rdir=GRAPH_DIR)
+
+if __name__ == "__main__":
+    main()