commit 13a1a7911d8a84923cd071160775eccc24d8b47e
Author: Karsten Loesing <karsten.loesing@gmx.net>
Date:   Wed Oct 5 08:37:06 2011 +0200
    Add George's v2 detector script (#2718).
---
 task-2718/.gitignore    |    2 +
 task-2718/detectorv2.py |  531 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 533 insertions(+), 0 deletions(-)
diff --git a/task-2718/.gitignore b/task-2718/.gitignore
index 917bb1b..0c965b5 100644
--- a/task-2718/.gitignore
+++ b/task-2718/.gitignore
@@ -1,3 +1,5 @@
 *.csv
 *.pdf
+*.aux
+*.log
diff --git a/task-2718/detectorv2.py b/task-2718/detectorv2.py
new file mode 100755
index 0000000..5dce711
--- /dev/null
+++ b/task-2718/detectorv2.py
@@ -0,0 +1,531 @@
+## Copyright (c) 2011 George Danezis <gdane@microsoft.com>
+##
+## All rights reserved.
+##
+## Redistribution and use in source and binary forms, with or without
+## modification, are permitted (subject to the limitations in the
+## disclaimer below) provided that the following conditions are met:
+##
+##  * Redistributions of source code must retain the above copyright
+##    notice, this list of conditions and the following disclaimer.
+##
+##  * Redistributions in binary form must reproduce the above copyright
+##    notice, this list of conditions and the following disclaimer in the
+##    documentation and/or other materials provided with the
+##    distribution.
+##
+##  * Neither the name of <Owner Organization> nor the names of its
+##    contributors may be used to endorse or promote products derived
+##    from this software without specific prior written permission.
+##
+## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+##
+## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
+
+## This script reads a .csv file of the number of Tor users and finds
+## anomalies that might be indicative of censorship.
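+##
+## For reference, the expected input format (inferred from the parser
+## below; the values shown are purely illustrative) is a header row of
+## column names -- "date" plus per-country codes and the special columns
+## "all" and "??" -- followed by one row per day, with "NA" for missing
+## counts, e.g.:
+##
+##   date,all,??,cn,de,eg,gr, ...
+##   2011-06-01,350000,2000,8000,30000,5000,4000, ...
+##   2011-06-02,NA,2100,NA,29500,5100,3900, ...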
+
+# Dep: matplotlib
+from pylab import *
+import matplotlib
+
+# Dep: numpy
+import numpy
+
+# Dep: scipy
+import scipy.stats
+from scipy.stats.distributions import norm
+from scipy.stats.distributions import poisson
+from scipy.stats.distributions import gamma
+
+# Std lib
+from datetime import date
+from datetime import timedelta
+import os.path
+
+import random
+
+days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+
+# Read the .csv file and keep the per-country user-count time series.
+class torstatstore:
+    def __init__(self, file_name, DAYS):
+        self.DAYS = DAYS
+        f = file(file_name)
+        country_codes = f.readline()
+        country_codes = country_codes.strip().split(",")
+
+        store = {}
+        MAX_INDEX = 0
+        for i, line in enumerate(f):
+            MAX_INDEX += 1
+            line_parsed = line.strip().split(",")
+            for j, (ccode, val) in enumerate(zip(country_codes, line_parsed)):
+                processed_val = None
+                if ccode == "date":
+                    try:
+                        year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
+                        processed_val = date(year, month, day)
+                    except Exception, e:
+                        print "Parsing error (ignoring line %s):" % i
+                        print "%s" % val, e
+                        break
+
+                elif val != "NA":
+                    processed_val = int(val)
+                store[(ccode, i)] = processed_val
+
+        # Min and max dates in the file
+        date_min = store[("date", 0)]
+        date_max = store[("date", i)]
+
+        all_dates = []
+        d = date_min
+        dt = timedelta(days=1)
+        while d <= date_max:
+            all_dates += [d]
+            d = d + dt
+
+        # Save for later
+        self.store = store
+        self.all_dates = all_dates
+        self.country_codes = country_codes
+        self.MAX_INDEX = MAX_INDEX
+        self.date_min = date_min
+        self.date_max = date_max
+
+    def get_country_series(self, ccode):
+        assert ccode in self.country_codes
+        series = {}
+        for d in self.all_dates:
+            series[d] = None
+        for i in range(self.MAX_INDEX):
+            series[self.store[("date", i)]] = self.store[(ccode, i)]
+        sx = []
+        for d in self.all_dates:
+            sx += [series[d]]
+        return sx[-self.DAYS:]
+
+    def get_dates(self):
+        return self.all_dates[-self.DAYS:]
+
+    def get_largest(self, number):
+        exclude = set(["all", "??", "date"])
+        l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
+        l.sort()
+        l.reverse()
+        return [c for _, c in l][:number]
+
+    def get_largest_locations(self, number):
+        l = self.get_largest(number)
+        res = {}
+        for ccode in l[:number]:
+            res[ccode] = self.get_country_series(ccode)
+        return res
+
+    def get_codes(self):
+        return self.country_codes + []
+
+## Run a particle-filter-based inference algorithm
+## given a data series and a model of traffic over time
+def particle_filter_detector(ser1, taps, models):
+    # particle: (id, rate, censor, prev_particle, round, tap, censor_model)
+
+    # Model parameters
+    normal_std_factor = 4
+    censorship_std_factor = 7
+    censorship_prior_model = 0.01
+    change_tap_prior_model = 0.1
+
+    # Sampling parameters
+    change_tap_sample = 0.2
+    censorship_prior_sample = 0.3
+    particle_number = 1000
+    mult_particles = 1
+
+    # Check consistency once
+    for t in models:
+        assert len(ser1) == len(models[t])
+
+    # Clean up the data a bit: carry missing values forward
+    series2 = []
+    last = None
+    first = None
+    # Process series
+    for s in ser1:
+        if s == None:
+            series2 += [last]
+        else:
+            if first == None:
+                first = s
+            series2 += [s]
+            last = s
+
+    series2 = [s if s != None else first for s in series2]
+    series = series2
+
+    # Data structures to keep logs
+    particles = {}
+    outputlog = [(series[0], series[0])]
+
+    # Initial particles:
+    particles[0] = []
+    G = gamma(max(1, series[0]), 1)
+    for pi, r in enumerate(G.rvs(particle_number)):
+        particles[0] += [(pi, r, False, None, 0, random.choice(taps), False)]
+
+    # Now run the sampler for all times
+    for pi in range(1, len(series)):
+        assert models != None
+        assert taps != None
+
+        # Normal distributions per tap, with the model standard deviation
+        # scaled for the normal and censorship cases
+        round_models = {}
+        for ti in taps:
+            NoCensor = norm(models[ti][pi][0], (models[ti][pi][1] * normal_std_factor)**2)
+            Censor = norm(models[ti][pi][0], (models[ti][pi][1] * censorship_std_factor)**2)
+            round_models[ti] = (NoCensor, Censor)
+
+        # Store for expanded pool of particles
+        temporary_particles = []
+
+        # Expand the distribution
+        for p in particles[pi-1]:
+            p_old, C_old, j = tracebackp(particles, p, pi-1, p[5] - 1) # taps[0] - 1)
+
+            # Serial number of old particle
+            p_old_num = None
+            if p_old != None:
+                p_old_num = p_old[0]
+
+            # Create a number of candidate particles from each previous particle
+            for _ in range(mult_particles):
+
+                # Sample a new tap for the candidate particle
+                new_tap = p[5]
+                if random.random() < change_tap_sample:
+                    new_tap = random.choice(taps)
+
+                # Update this censorship flag
+                C = False
+                if random.random() < censorship_prior_sample:
+                    C = True
+
+                # Determine new rate
+                new_p = None
+                if p_old == None:
+                    new_p = p[1] # continue as before
+                if C | C_old:
+                    while new_p == None or new_p < 0:
+                        new_p = p_old[1] * (1 + round_models[new_tap][1].rvs(1)[0]) ## censor model
+                else:
+                    while new_p == None or new_p < 0:
+                        new_p = p_old[1] * (1 + round_models[new_tap][0].rvs(1)[0]) ## no censor model
+
+                # Build and register new particle
+                newpi = (None, new_p, C, p[0], pi, new_tap, C | C_old)
+                temporary_particles += [newpi]
+
+        # Assign a weight to each sampled candidate particle
+        weights = []
+        for px in temporary_particles:
+            wx = 1.0
+
+            # Adjust weight to observation
+            if not series[pi] == None:
+                poisson_prob = poisson.pmf(series[pi], px[1])
+                # print poisson_prob, px
+                wx *= poisson_prob
+
+            # Adjust the probability of censorship
+            if px[2]:
+                wx *= censorship_prior_model / censorship_prior_sample
+            else:
+                wx *= (1 - censorship_prior_model) / (1 - censorship_prior_sample)
+
+            # Adjust the probability of changing the tap
+            if px[5] == particles[pi-1][px[3]][5]:
+                wx *= (1 - change_tap_prior_model) / (((1-change_tap_sample) + change_tap_sample*(1.0 / len(taps))))
+            else:
+                wx *= (change_tap_prior_model) / (1 - (((1-change_tap_sample) + change_tap_sample*(1.0 / len(taps)))))
+
+            weights += [wx]
+
+        weights_sum = sum(weights)
+
+        ## Resample according to weight
+        particles[pi] = []
+        for pid in range(particle_number):
+            px = samplep(weights, weights_sum, temporary_particles)
+            px = (pid, px[1], px[2], px[3], px[4], px[5], px[6])
+            particles[pi] += [px]
+
+        ## Collect some statistics
+        Ci = 0
+        mean = 0
+        for px in particles[pi]:
+            if px[2]:
+                Ci += 1
+            mean += px[1]
+        mean = mean / len(particles[pi])
+
+        # Diversity
+        Div = len(set([pv[3] for pv in particles[pi]]))
+
+        # Range of values
+        range_normal = sorted([pn[1] for pn in temporary_particles if not pn[2]])
+        Base = range_normal[len(range_normal)/2]
+        Mn = range_normal[len(range_normal)*1/100]
+        Mx = range_normal[len(range_normal)*99/100]
+        outputlog += [(Mn, Mx)]
+
+        # How many are using the censorship model at any time?
+        censor_model_stat = len([1 for pn in particles[pi] if pn[6]]) * 100 / len(particles[pi])
+
+        # Build histogram of taps
+        tap_hist = {}
+        for px in particles[pi]:
+            tap_hist[px[5]] = tap_hist.get(px[5], 0) + 1
+
+        print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (pi, Ci, mean, series[pi], tap_hist, Base, Mn, Mx, Div, censor_model_stat)
+        # print " [%s - %s]" % (key_series_point*(1+NoCensor.ppf(0.00001)), key_series_point*(1+NoCensor.ppf(0.99999)))
+
+    return particles, outputlog
+
+## Get number of censorship particles, particles that use previous censorship models,
+## and total number of particles over time.
+def get_events(particles):
+    events = []
+    for ps in sorted(particles):
+        censor_model_stat = len([1 for pn in particles[ps] if pn[6]])
+        events += [(len([1 for p in particles[ps] if p[2]]), censor_model_stat, len(particles[ps]))]
+    return events
+
+## Make pretty graphs of the data and censorship events
+def plotparticles(series, particles, outputlog, labels, xtitle, events):
+    assert len(xtitle) == 3
+    fname, stitle, slegend = xtitle
+
+    font = {'family' : 'Bitstream Vera Sans',
+            'weight' : 'normal',
+            'size' : 8}
+    matplotlib.rc('font', **font)
+
+    mmx = max(series)
+    if mmx == None:
+        return # There is no data here!
+    diff = abs(-mmx*0.1 - mmx*1.1)
+
+    ylim( (-mmx*0.1, 1+mmx*1.1) )
+    plot(labels, series, linewidth=1.0, label="Users")
+
+    F = gcf()
+
+    wherefill = []
+    minc, maxc = [], []
+    for mm, mx in outputlog:
+        wherefill += [not (mm == None and mx == None)]
+        assert mm <= mx or (mm == None and mx == None)
+        minc += [mm]
+        maxc += [mx]
+
+    fill_between(labels, minc, maxc, where=wherefill, color="gray", label="Prediction")
+
+    vdown = []
+    vup = []
+    active_region_20 = []
+    active_region_50 = []
+    for i, v in enumerate(series):
+        if minc[i] == None or maxc[i] == None:
+            continue
+
+        mean = (minc[i] + maxc[i]) / 2
+
+        v2 = v
+        if v2 == None:
+            v2 = 0
+
+        if events[i][0] * 100 / events[i][2] > 10:
+            if v2 <= mean:
+                vdown += [(labels[i], v2, events[i][0], events[i][2])]
+                # print vdown[-1]
+            else:
+                vup += [(labels[i], v2, events[i][0], events[i][2])]
+
+        active_region_20 += [events[i][1] * 100 / events[i][2] > 20]
+        active_region_50 += [events[i][1] * 100 / events[i][2] > 50]
+
+    fill_between(labels, -mmx*0.1*ones(len(labels)), (1+mmx*1.1)*ones(len(labels)), where=active_region_20, color="r", alpha=0.15)
+    fill_between(labels, -mmx*0.1*ones(len(labels)), (1+mmx*1.1)*ones(len(labels)), where=active_region_50, color="r", alpha=0.15)
+
+    x = [p[0] for p in vdown]
+    y = [p[1] for p in vdown]
+    s = [20 + p[2]*100 / p[3] for p in vdown]
+    if len(x) > 0:
+        scatter(x, y, s=s, marker='v', c='r')
+    for xi, yi, score, total in vdown:
+        if 100 * score / total > 10:
+            text(xi, yi - diff*5 / 100, "%2d%%" % (100 * float(score) / total), color="r")
+
+    x = [p[0] for p in vup]
+    y = [p[1] for p in vup]
+    s = [20 + p[2]*100 / p[3] for p in vup]
+    if len(x) > 0:
+        scatter(x, y, s=s, marker='^', c='g')
+    for xi, yi, score, total in vup:
+        if 100 * score / total > 10:
+            text(xi, yi + diff*5 / 100, "%2d%%" % (100 * float(score) / total), color="g")
+
+    legend(loc=2)
+
+    xlabel('Time (days)')
+    ylabel('Users')
+    title(stitle)
+    grid(True)
+
+    F.set_size_inches(10, 5)
+    F.savefig(fname, format="png", dpi=150)
+    close()
+
+## Get a particle from a trace at time current_round - delay
+def tracebackp(particles, start_particle, current_round, delay):
+    if current_round - delay < 0:
+        return None, False, 0
+
+    j = current_round
+    this_particle = start_particle
+    C = False
+    r = None
+    # print "-----"
+    while not j < current_round - delay:
+        # print this_particle
+        C |= this_particle[2] # set the censorship flag
+        j = j - 1
+        if not (not j < current_round - delay):
+            break
+        this_particle = particles[j][this_particle[3]]
+    assert j+1 == this_particle[4] == current_round - delay
+    return (this_particle, C, j+1)
+
+# Sample a number of items according to their weights
+def samplep(weights, total, samples):
+    rx = random.random() * total
+    stotal = 0.0
+    for i, w in enumerate(weights):
+        stotal += w
+        if stotal >= rx:
+            return samples[i]
+
+    assert False
+
+# Makes an estimate of the rate from sample observations
+def infer_sample_rate(series):
+    seriesNone = series + []
+    series = series + [] # we need a fresh copy!
+    for i, s in enumerate(series):
+        if seriesNone[i] == None:
+            series[i] = 0
+        if series[i] == 0:
+            series[i] = 0.001
+    rates = list(gamma.rvs(series, 1))
+    for i, r in enumerate(rates):
+        if seriesNone[i] == None or seriesNone[i] == 0:
+            rates[i] = None
+    return rates
+
+# Get (mean, std) for the top-50 series and different day delays (e.g. taps=[1,7])
+def make_daytoday_normal_models(tss, taps):
+    codes = tss.get_largest_locations(50)
+    series = {}
+    L = len(codes.values()[0])
+    for c in codes:
+        series[(c, 0)] = infer_sample_rate(tss.get_country_series(c))
+        assert len(series[(c, 0)]) == L
+        for d in taps:
+            series[(c, d)] = historic_frac(series[(c, 0)], d)
+            assert len(series[(c, d)]) == L
+
+    models = {}
+    for d in taps:
+        models[d] = []
+        for i in range(L):
+            v = []
+            for c in codes:
+                vi = series[(c, d)][i]
+                if not vi == None:
+                    v += [vi]
+            if len(v) > 1:
+                v.sort()
+                v = v[len(v)*5/100:len(v)*95/100:]
+                if (numpy.mean(v) > 10) or (numpy.mean(v) < -10):
+                    # models[d] += [(0.0, 0.02)]
+                    print i, [(numpy.mean(v), numpy.std(v))]
+                    models[d] += [(numpy.mean(v), numpy.std(v))]
+                else:
+                    models[d] += [(numpy.mean(v), numpy.std(v))]
+            else:
+                models[d] += [(0.0, 0.02)]
+            # print d, i, models[d][-1] # , v[:5]
+
+    return models
+
+# Get historic fractions of traffic to train models
+def historic_frac(rates, delay):
+    assert delay > 0
+    diff = []
+    for i, r in enumerate(rates):
+        if i - delay < 0 or rates[i-delay] == None or rates[i] == None or rates[i-delay] == 0:
+            diff += [None]
+        else:
+            diff += [(rates[i] - rates[i-delay]) / rates[i-delay]]
+    return diff
+
+def plot_country(tss, models, taps, country_code, GRAPH_DIR):
+    series = tss.get_country_series(country_code)
+    particles, outputlog = particle_filter_detector(series, taps, models) ## Run the inference algorithm
+    events = get_events(particles) ## Extract events from particles -- % censorship over time
+    labels = tss.get_dates()
+    xtitle = (os.path.join(GRAPH_DIR, "%s-censor.png" % country_code), "Tor report for %s" % country_code, "")
+    plotparticles(series, particles, outputlog, labels, xtitle, events)
+
+
+#def main():
+if True:
+    # Change these to customize the script
+    ## (Model parameters are still in the particle filter function)
+    CSV_FILE = "direct-users.csv"
+    GRAPH_DIR = "img2"
+    DAYS = 4 * 31
+
+    tss = torstatstore(CSV_FILE, DAYS)
+    gr = tss.get_country_series('gr')
+    rates_gr = infer_sample_rate(gr)
+    # print historic_frac(rates_gr, 1)
+
+    models = make_daytoday_normal_models(tss, [1, 7])
+
+    # for country_code in tss.get_largest(250):
+    # country_code = "kr"
+    for country_code in ["cn", "eg", "ly", "kr", "de", "mm"]:
+        print country_code
+        plot_country(tss, models, [1, 7], country_code, GRAPH_DIR)
+
+#if __name__ == "__main__":
+#    main()
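As committed, the driver block at the bottom runs at import time; the commented-out main() and __main__ guard indicate the wrapping that was apparently intended. A minimal sketch of that wrapping, reusing the same hard-coded CSV_FILE, GRAPH_DIR, and country list from the script above, would be:

    def main():
        CSV_FILE = "direct-users.csv"
        GRAPH_DIR = "img2"
        DAYS = 4 * 31

        tss = torstatstore(CSV_FILE, DAYS)
        models = make_daytoday_normal_models(tss, [1, 7])
        for country_code in ["cn", "eg", "ly", "kr", "de", "mm"]:
            print country_code
            plot_country(tss, models, [1, 7], country_code, GRAPH_DIR)

    if __name__ == "__main__":
        main()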