commit a367168a782e864bdacb610857b1dc5d58fd192d Author: Karsten Loesing karsten.loesing@gmx.net Date: Sun Dec 9 12:02:42 2018 +0100
Rewrite censorship detector in Java.
This allows us to remove the last remaining Python parts from the daily updater.
Implements #21588. --- build.xml | 26 -- .../torproject/metrics/stats/clients/Detector.java | 433 +++++++++++++++++++++ .../org/torproject/metrics/stats/clients/Main.java | 5 + src/main/python/clients/country_info.py | 255 ------------ src/main/python/clients/detector.py | 242 ------------ 5 files changed, 438 insertions(+), 523 deletions(-)
diff --git a/build.xml b/build.xml index 6736e19..93eda7b 100644 --- a/build.xml +++ b/build.xml @@ -23,7 +23,6 @@ <property name="tardepends" value="war" />
<property name="Rsources" value="${basedir}/src/main/R" /> - <property name="pysources" value="${basedir}/src/main/python" />
<property name="specdir" value="${basedir}/generated/spec" />
@@ -360,32 +359,7 @@
<target name="clients" > <property name="module.name" value="clients" /> - <property name="localmoddir" value="${modulebase}/${module.name}" /> - - <property name="statsdir" - value="${localmoddir}/stats" /> - <mkdir dir="${statsdir}" /> - <antcall target="run-java" /> - - <antcall target="run-R" > - <param name="module.Rscript" value="userstats-detector.R" /> - </antcall> - - <exec executable="python" - dir="${localmoddir}" - failonerror="true" > - <arg value="${pysources}/${module.name}/detector.py" /> - <arg value="userstats-detector.csv" /> - <arg value="userstats-ranges.csv" /> - </exec> - - <antcall target="run-R" > - <param name="module.Rscript" value="merge-clients.R" /> - </antcall> - - <copy file="${localmoddir}/clients.csv" todir="${statsdir}" /> - <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" /> </target>
<target name="servers" > diff --git a/src/main/java/org/torproject/metrics/stats/clients/Detector.java b/src/main/java/org/torproject/metrics/stats/clients/Detector.java new file mode 100644 index 0000000..1a523c2 --- /dev/null +++ b/src/main/java/org/torproject/metrics/stats/clients/Detector.java @@ -0,0 +1,433 @@ +/* Copyright 2011 George Danezis gdane@microsoft.com + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted (subject to the limitations in the + * disclaimer below) provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of <Owner Organization> nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE + * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT + * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * (Clear BSD license: + * http://labs.metacarta.com/license-explanation.html#license) + * + * Copyright 2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.stats.clients; + +import org.apache.commons.math3.distribution.NormalDistribution; +import org.apache.commons.math3.distribution.PoissonDistribution; +import org.apache.commons.math3.stat.descriptive.moment.Mean; +import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation; +import org.apache.commons.math3.stat.descriptive.rank.Percentile; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.LineNumberReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.LocalDate; +import java.time.format.DateTimeParseException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.stream.Collectors; + +/** Censorship detector that reads a .csv file of the number of Tor clients and + * finds anomalies that might be indicative of censorship. */ +public class Detector { + + /** Input file. */ + private static final Path INPUT_PATH = Paths.get("stats", "userstats.csv"); + + /** Output file. 
*/ + private static final Path OUTPUT_PATH = Paths.get("stats", "clients.csv"); + + /** Number of largest locations to be included in the detection algorithm. */ + private static final int NUM_LARGEST_LOCATIONS = 50; + + /** Time interval in days to model connection rates. */ + private static final int INTERV = 7; + + /** Compound key under which client estimates are stored in both input and + * output files. */ + private static class ClientsKey implements Comparable<ClientsKey> { + + /** Date when clients connected to the Tor network. */ + private LocalDate date; + + /** Whether clients connected via relays (true) or bridges (false). */ + private boolean nodeIsRelay; + + /** Two-letter lower-case country code of the country from which clients + * connected, "??" if the country could not be resolved, or left empty for + * all countries together. */ + private String country; + + /** Name of the transport used by clients to connect using bridges, or left + * empty for all transports together. */ + private String transport = ""; + + /** IP version used by clients to connect using bridges, or left empty for + * all IP versions together. 
*/ + private String version = ""; + + ClientsKey(LocalDate date, boolean nodeIsRelay, String country) { + this.date = date; + this.nodeIsRelay = nodeIsRelay; + this.country = country; + } + + ClientsKey(LocalDate date, boolean nodeIsRelay, String country, + String transport, String version) { + this(date, nodeIsRelay, country); + this.transport = transport; + this.version = version; + } + + @Override + public int compareTo(ClientsKey other) { + if (!this.date.equals(other.date)) { + return this.date.compareTo(other.date); + } else if (!this.nodeIsRelay && other.nodeIsRelay) { + return -1; + } else if (this.nodeIsRelay && !other.nodeIsRelay) { + return 1; + } else if (!this.country.equals(other.country)) { + return this.country.compareTo(other.country); + } else if (!this.transport.equals(other.transport)) { + return this.transport.compareTo(other.transport); + } else if (!this.version.equals(other.version)) { + return this.version.compareTo(other.version); + } else { + return 0; + } + } + + @Override + public boolean equals(Object otherObject) { + if (!(otherObject instanceof ClientsKey)) { + return false; + } else { + ClientsKey other = (ClientsKey) otherObject; + return this.date.equals(other.date) + && this.nodeIsRelay == other.nodeIsRelay + && this.country.equals(other.country) + && this.transport.equals(other.transport) + && this.version.equals(other.version); + } + } + + @Override + public int hashCode() { + return 3 * this.date.hashCode() + (this.nodeIsRelay ? 5 : 0) + + 7 * this.country.hashCode() + 11 * this.transport.hashCode() + + 13 * this.version.hashCode(); + } + + @Override + public String toString() { + return String.format("%s,%s,%s,%s,%s", + this.date.toString(), this.nodeIsRelay ? "relay" : "bridge", + this.country, this.transport, this.version); + } + } + + /** Value class that stores everything we already knew about a specific + * subset of clients from the input file. 
*/ + private static class ClientsEstimates { + + /** Estimated number of clients. */ + private int clients; + + /** Fraction of relays or bridges in percent that the estimate is based on, + * between 0 and 100. */ + private int frac; + + ClientsEstimates(int clients, int frac) { + this.clients = clients; + this.frac = frac; + } + + @Override + public String toString() { + return String.format("%d,%d", this.clients, this.frac); + } + } + + /** Value class that stores everything we're computing here about a specific + * subset of clients from the input file. */ + private static class ClientsRanges { + + /** Lower number of expected clients under the assumption that there has + * been no censorship event, as computed here. */ + private int lower; + + /** Upper number of expected clients under the assumption that there has + * been no release of censorship, as computed here. */ + private int upper; + + ClientsRanges(int lower, int upper) { + this.lower = lower; + this.upper = upper; + } + + @Override + public String toString() { + return String.format("%d,%d", this.lower, this.upper); + } + } + + /** Run censorship detection. */ + public void detect() throws IOException { + SortedMap<ClientsKey, ClientsEstimates> estimates = readInputFile(); + Set<String> largestLocations = findLargestLocations(estimates); + Map<LocalDate, List<Double>> ratios = computeRatiosOfLargestLocations( + estimates, largestLocations); + Map<LocalDate, List<Double>> ratiosWithoutOutliers = removeOutliers(ratios); + SortedMap<ClientsKey, ClientsRanges> ranges = computeRanges(estimates, + ratiosWithoutOutliers); + writeOutputFile(estimates, ranges); + } + + /** Read and return the parsed input file containing comma-separated estimates + * of client numbers. 
*/ + private static SortedMap<ClientsKey, ClientsEstimates> readInputFile() + throws IOException { + SortedMap<ClientsKey, ClientsEstimates> estimates = new TreeMap<>(); + File inputFile = INPUT_PATH.toFile(); + if (!inputFile.exists()) { + throw new IOException(String.format("Input file %s does not exist.", + inputFile)); + } + try (LineNumberReader lnr = new LineNumberReader( + new FileReader(inputFile))) { + String line = lnr.readLine(); + if (!"date,node,country,transport,version,frac,users".equals(line)) { + throw new IOException(String.format("Unable to read input file %s with " + + "unrecognized header line '%s'. Not running detector.", inputFile, + line)); + } + while ((line = lnr.readLine()) != null) { + ClientsKey key = null; + ClientsEstimates value = null; + boolean invalidLine = false; + String[] lineParts = line.split(","); + if (lineParts.length == 7) { + try { + LocalDate date = LocalDate.parse(lineParts[0]); + boolean nodeIsRelay = false; + if ("relay".equals(lineParts[1])) { + nodeIsRelay = true; + } else if (!"bridge".equals(lineParts[1])) { + invalidLine = true; + } + String country = lineParts[2].replaceAll("\"", ""); + String transport = lineParts[3].replaceAll("\"", ""); + String version = lineParts[4].replaceAll("\"", ""); + key = new ClientsKey(date, nodeIsRelay, country, transport, + version); + } catch (DateTimeParseException e) { + invalidLine = true; + } + try { + int frac = Integer.parseInt(lineParts[5]); + int clients = Integer.parseInt(lineParts[6]); + value = new ClientsEstimates(clients, frac); + } catch (NumberFormatException e) { + invalidLine = true; + } + } else { + invalidLine = true; + } + if (invalidLine) { + throw new IOException(String.format( + "Invalid line %d '%s' in input file %s.", lnr.getLineNumber(), + line, inputFile)); + } else { + estimates.put(key, value); + } + } + } + return estimates; + } + + /** Return the NUM_LARGEST_LOCATIONS countries (except for "??") with the + * largest number of estimated clients on the 
last known date in the input + * data set. + * + * <p>Note that this implies that lower/upper values are going to change, + * depending on which countries had most clients on the last known date in the + * input data set.</p> */ + private static Set<String> findLargestLocations( + SortedMap<ClientsKey, ClientsEstimates> clients) throws IOException { + LocalDate lastKnownDate = clients.keySet().stream() + .filter(c -> c.nodeIsRelay) + .map(c -> c.date) + .max(LocalDate::compareTo) + .orElseThrow(() -> new IOException("Unable to find maximum date. Was " + + "the input file empty or otherwise corrupt?")); + return clients.entrySet().stream() + .filter(c -> lastKnownDate.equals(c.getKey().date)) + .filter(c -> c.getKey().nodeIsRelay) + .filter(c -> !"".equals(c.getKey().country)) + .filter(c -> !"??".equals(c.getKey().country)) + .sorted((c1, c2) -> Integer.compare(c2.getValue().clients, + c1.getValue().clients)) + .map(c -> c.getKey().country) + .limit(NUM_LARGEST_LOCATIONS) + .collect(Collectors.toSet()); + } + + /** Compute the ratio of the client number estimate for a given date and + * country as compared to 1 week before, for all dates, for relay users, and + * for the largest locations. 
*/ + private static Map<LocalDate, List<Double>> computeRatiosOfLargestLocations( + SortedMap<ClientsKey, ClientsEstimates> estimates, + Set<String> largestLocations) { + Map<LocalDate, List<Double>> ratios = new HashMap<>(); + for (Map.Entry<ClientsKey, ClientsEstimates> numerator + : estimates.entrySet()) { + if (!numerator.getKey().nodeIsRelay + || !largestLocations.contains(numerator.getKey().country)) { + continue; + } + ClientsEstimates denominator = estimates.get(new ClientsKey( + numerator.getKey().date.minusDays(INTERV), true, + numerator.getKey().country)); + if (null == denominator || denominator.clients == 0) { + continue; + } + if (!ratios.containsKey(numerator.getKey().date)) { + ratios.put(numerator.getKey().date, new ArrayList<>()); + } + ratios.get(numerator.getKey().date).add( + ((double) numerator.getValue().clients) + / (double) denominator.clients); + } + return ratios; + } + + /** Exclude outliers from the given ratios by date that fall outside four + * inter-quartile ranges of the median and make sure that at least 8 ratio + * values remain. 
*/ + private static SortedMap<LocalDate, List<Double>> removeOutliers( + Map<LocalDate, List<Double>> ratios) { + SortedMap<LocalDate, List<Double>> ratiosWithoutOutliers = new TreeMap<>(); + for (Map.Entry<LocalDate, List<Double>> e : ratios.entrySet()) { + double[] values = e.getValue().stream().mapToDouble(Double::doubleValue) + .toArray(); + Percentile percentile = new Percentile() + .withEstimationType(Percentile.EstimationType.R_7); + percentile.setData(values); + double median = percentile.evaluate(50.0); + double firstQuarter = percentile.evaluate(25.0); + double thirdQuarter = percentile.evaluate(75.0); + double interQuartileRange = thirdQuarter - firstQuarter; + List<Double> valuesWithoutOutliers = new ArrayList<>(); + for (double value : values) { + if (value > median - 4 * interQuartileRange + && value < median + 4 * interQuartileRange) { + valuesWithoutOutliers.add(value); + } + } + if (valuesWithoutOutliers.size() < 8) { + continue; + } + LocalDate date = e.getKey(); + ratiosWithoutOutliers.put(date, valuesWithoutOutliers); + } + return ratiosWithoutOutliers; + } + + /** Compute ranges as the expected minimum and maximum number of users. 
*/ + private static SortedMap<ClientsKey, ClientsRanges> computeRanges( + SortedMap<ClientsKey, ClientsEstimates> estimates, + Map<LocalDate, List<Double>> ratiosWithoutOutliers) { + SortedMap<ClientsKey, ClientsRanges> ranges = new TreeMap<>(); + for (Map.Entry<ClientsKey, ClientsEstimates> estimatesEntry + : estimates.entrySet()) { + LocalDate date = estimatesEntry.getKey().date; + if (!estimatesEntry.getKey().nodeIsRelay + || "".equals(estimatesEntry.getKey().country) + || "??".equals(estimatesEntry.getKey().country) + || !ratiosWithoutOutliers.containsKey(date)) { + continue; + } + ClientsEstimates referenceEstimate = estimates.get( + new ClientsKey(date.minusDays(INTERV), + true, estimatesEntry.getKey().country)); + if (null == referenceEstimate || referenceEstimate.clients == 0) { + continue; + } + double[] values = ratiosWithoutOutliers.get(date).stream() + .mapToDouble(Double::doubleValue).toArray(); + double mean = new Mean().evaluate(values); + double std = new StandardDeviation(false).evaluate(values); + NormalDistribution normalDistribution = new NormalDistribution(mean, std); + PoissonDistribution poissonDistribution + = new PoissonDistribution(referenceEstimate.clients); + int lower = Math.max(0, + (int) (normalDistribution.inverseCumulativeProbability(0.0001) + * poissonDistribution.inverseCumulativeProbability(0.0001))); + int upper = + (int) (normalDistribution.inverseCumulativeProbability(0.9999) + * poissonDistribution.inverseCumulativeProbability(0.9999)); + ranges.put(estimatesEntry.getKey(), new ClientsRanges(lower, upper)); + } + return ranges; + } + + /** Write client number estimates together with lower and upper bounds as + * comma-separated values to the output file. 
*/ + private static void writeOutputFile( + SortedMap<ClientsKey, ClientsEstimates> estimates, + SortedMap<ClientsKey, ClientsRanges> ranges) throws IOException { + try (BufferedWriter bw = new BufferedWriter( + new FileWriter(OUTPUT_PATH.toFile()))) { + bw.write( + "date,node,country,transport,version,lower,upper,clients,frac\n"); + for (Map.Entry<ClientsKey, ClientsEstimates> e : estimates.entrySet()) { + String rangesString = ","; + if (ranges.containsKey(e.getKey())) { + rangesString = ranges.get(e.getKey()).toString(); + } + bw.write(String.format("%s,%s,%s%n", e.getKey().toString(), + rangesString, e.getValue().toString())); + } + } + } +} + diff --git a/src/main/java/org/torproject/metrics/stats/clients/Main.java b/src/main/java/org/torproject/metrics/stats/clients/Main.java index 48d8d8d..0f1087b 100644 --- a/src/main/java/org/torproject/metrics/stats/clients/Main.java +++ b/src/main/java/org/torproject/metrics/stats/clients/Main.java @@ -59,6 +59,11 @@ public class Main {
log.info("Disconnecting from database."); database.close(); + + log.info("Running detector."); + new Detector().detect(); + + log.info("Terminating clients module."); }
private static final long ONE_HOUR_MILLIS = 60L * 60L * 1000L; diff --git a/src/main/python/clients/country_info.py b/src/main/python/clients/country_info.py deleted file mode 100644 index 1a505d0..0000000 --- a/src/main/python/clients/country_info.py +++ /dev/null @@ -1,255 +0,0 @@ -# -*- coding: utf-8 -*- - -countries = { - "ad" : "Andorra", - "ae" : "the United Arab Emirates", - "af" : "Afghanistan", - "ag" : "Antigua and Barbuda", - "ai" : "Anguilla", - "al" : "Albania", - "am" : "Armenia", - "an" : "the Netherlands Antilles", - "ao" : "Angola", - "aq" : "Antarctica", - "ar" : "Argentina", - "as" : "American Samoa", - "at" : "Austria", - "au" : "Australia", - "aw" : "Aruba", - "ax" : "the Aland Islands", - "az" : "Azerbaijan", - "ba" : "Bosnia and Herzegovina", - "bb" : "Barbados", - "bd" : "Bangladesh", - "be" : "Belgium", - "bf" : "Burkina Faso", - "bg" : "Bulgaria", - "bh" : "Bahrain", - "bi" : "Burundi", - "bj" : "Benin", - "bl" : "Saint Bartelemey", - "bm" : "Bermuda", - "bn" : "Brunei", - "bo" : "Bolivia", - "bq" : "Bonaire, Sint Eustatius and Saba", - "br" : "Brazil", - "bs" : "the Bahamas", - "bt" : "Bhutan", - "bv" : "the Bouvet Island", - "bw" : "Botswana", - "by" : "Belarus", - "bz" : "Belize", - "ca" : "Canada", - "cc" : "the Cocos (Keeling) Islands", - "cd" : "the Democratic Republic of the Congo", - "cf" : "Central African Republic", - "cg" : "Congo", - "ch" : "Switzerland", - "ci" : u"Côte d'Ivoire", - "ck" : "the Cook Islands", - "cl" : "Chile", - "cm" : "Cameroon", - "cn" : "China", - "co" : "Colombia", - "cr" : "Costa Rica", - "cu" : "Cuba", - "cv" : "Cape Verde", - "cw" : u"Curaçao", - "cx" : "the Christmas Island", - "cy" : "Cyprus", - "cz" : "the Czech Republic", - "de" : "Germany", - "dj" : "Djibouti", - "dk" : "Denmark", - "dm" : "Dominica", - "do" : "the Dominican Republic", - "dz" : "Algeria", - "ec" : "Ecuador", - "ee" : "Estonia", - "eg" : "Egypt", - "eh" : "the Western Sahara", - "er" : "Eritrea", - "es" : "Spain", - "et" : 
"Ethiopia", - "fi" : "Finland", - "fj" : "Fiji", - "fk" : "the Falkland Islands (Malvinas)", - "fm" : "the Federated States of Micronesia", - "fo" : "the Faroe Islands", - "fr" : "France", - "ga" : "Gabon", - "gb" : "the United Kingdom", - "gd" : "Grenada", - "ge" : "Georgia", - "gf" : "French Guiana", - "gg" : "Guernsey", - "gh" : "Ghana", - "gi" : "Gibraltar", - "gl" : "Greenland", - "gm" : "Gambia", - "gn" : "Guinea", - "gp" : "Guadeloupe", - "gq" : "Equatorial Guinea", - "gr" : "Greece", - "gs" : "South Georgia and the South Sandwich Islands", - "gt" : "Guatemala", - "gu" : "Guam", - "gw" : "Guinea-Bissau", - "gy" : "Guyana", - "hk" : "Hong Kong", - "hm" : "Heard Island and McDonald Islands", - "hn" : "Honduras", - "hr" : "Croatia", - "ht" : "Haiti", - "hu" : "Hungary", - "id" : "Indonesia", - "ie" : "Ireland", - "il" : "Israel", - "im" : "the Isle of Man", - "in" : "India", - "io" : "the British Indian Ocean Territory", - "iq" : "Iraq", - "ir" : "Iran", - "is" : "Iceland", - "it" : "Italy", - "je" : "Jersey", - "jm" : "Jamaica", - "jo" : "Jordan", - "jp" : "Japan", - "ke" : "Kenya", - "kg" : "Kyrgyzstan", - "kh" : "Cambodia", - "ki" : "Kiribati", - "km" : "Comoros", - "kn" : "Saint Kitts and Nevis", - "kp" : "North Korea", - "kr" : "the Republic of Korea", - "kw" : "Kuwait", - "ky" : "the Cayman Islands", - "kz" : "Kazakhstan", - "la" : "Laos", - "lb" : "Lebanon", - "lc" : "Saint Lucia", - "li" : "Liechtenstein", - "lk" : "Sri Lanka", - "lr" : "Liberia", - "ls" : "Lesotho", - "lt" : "Lithuania", - "lu" : "Luxembourg", - "lv" : "Latvia", - "ly" : "Libya", - "ma" : "Morocco", - "mc" : "Monaco", - "md" : "the Republic of Moldova", - "me" : "Montenegro", - "mf" : "Saint Martin", - "mg" : "Madagascar", - "mh" : "the Marshall Islands", - "mk" : "Macedonia", - "ml" : "Mali", - "mm" : "Burma", - "mn" : "Mongolia", - "mo" : "Macau", - "mp" : "the Northern Mariana Islands", - "mq" : "Martinique", - "mr" : "Mauritania", - "ms" : "Montserrat", - "mt" : "Malta", - "mu" : 
"Mauritius", - "mv" : "the Maldives", - "mw" : "Malawi", - "mx" : "Mexico", - "my" : "Malaysia", - "mz" : "Mozambique", - "na" : "Namibia", - "nc" : "New Caledonia", - "ne" : "Niger", - "nf" : "Norfolk Island", - "ng" : "Nigeria", - "ni" : "Nicaragua", - "nl" : "the Netherlands", - "no" : "Norway", - "np" : "Nepal", - "nr" : "Nauru", - "nu" : "Niue", - "nz" : "New Zealand", - "om" : "Oman", - "pa" : "Panama", - "pe" : "Peru", - "pf" : "French Polynesia", - "pg" : "Papua New Guinea", - "ph" : "the Philippines", - "pk" : "Pakistan", - "pl" : "Poland", - "pm" : "Saint Pierre and Miquelon", - "pn" : "the Pitcairn Islands", - "pr" : "Puerto Rico", - "ps" : "the Palestinian Territory", - "pt" : "Portugal", - "pw" : "Palau", - "py" : "Paraguay", - "qa" : "Qatar", - "re" : "Reunion", - "ro" : "Romania", - "rs" : "Serbia", - "ru" : "Russia", - "rw" : "Rwanda", - "sa" : "Saudi Arabia", - "sb" : "the Solomon Islands", - "sc" : "the Seychelles", - "sd" : "Sudan", - "se" : "Sweden", - "sg" : "Singapore", - "sh" : "Saint Helena", - "si" : "Slovenia", - "sj" : "Svalbard and Jan Mayen", - "sk" : "Slovakia", - "sl" : "Sierra Leone", - "sm" : "San Marino", - "sn" : "Senegal", - "so" : "Somalia", - "sr" : "Suriname", - "ss" : "South Sudan", - "st" : u"São Tomé and Príncipe", - "sv" : "El Salvador", - "sx" : "Sint Maarten", - "sy" : "the Syrian Arab Republic", - "sz" : "Swaziland", - "tc" : "Turks and Caicos Islands", - "td" : "Chad", - "tf" : "the French Southern Territories", - "tg" : "Togo", - "th" : "Thailand", - "tj" : "Tajikistan", - "tk" : "Tokelau", - "tl" : "East Timor", - "tm" : "Turkmenistan", - "tn" : "Tunisia", - "to" : "Tonga", - "tr" : "Turkey", - "tt" : "Trinidad and Tobago", - "tv" : "Tuvalu", - "tw" : "Taiwan", - "tz" : "the United Republic of Tanzania", - "ua" : "Ukraine", - "ug" : "Uganda", - "um" : "the United States Minor Outlying Islands", - "us" : "the United States", - "uy" : "Uruguay", - "uz" : "Uzbekistan", - "va" : "Vatican City", - "vc" : "Saint Vincent 
and the Grenadines", - "ve" : "Venezuela", - "vg" : "the British Virgin Islands", - "vi" : "the United States Virgin Islands", - "vn" : "Vietnam", - "vu" : "Vanuatu", - "wf" : "Wallis and Futuna", - "ws" : "Samoa", - "xk" : "Kosovo", - "ye" : "Yemen", - "yt" : "Mayotte", - "za" : "South Africa", - "zm" : "Zambia", - "zw" : "Zimbabwe" - } diff --git a/src/main/python/clients/detector.py b/src/main/python/clients/detector.py deleted file mode 100644 index b0a98af..0000000 --- a/src/main/python/clients/detector.py +++ /dev/null @@ -1,242 +0,0 @@ -## Copyright (c) 2011 George Danezis gdane@microsoft.com -## -## All rights reserved. -## -## Redistribution and use in source and binary forms, with or without -## modification, are permitted (subject to the limitations in the -## disclaimer below) provided that the following conditions are met: -## -## * Redistributions of source code must retain the above copyright -## notice, this list of conditions and the following disclaimer. -## -## * Redistributions in binary form must reproduce the above copyright -## notice, this list of conditions and the following disclaimer in the -## documentation and/or other materials provided with the -## distribution. -## -## * Neither the name of <Owner Organization> nor the names of its -## contributors may be used to endorse or promote products derived -## from this software without specific prior written permission. -## -## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE -## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT -## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED -## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -## DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR -## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE -## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN -## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -## -## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license) - -## This script reads a .csv file of the number of Tor users and finds -## anomalies that might be indicative of censorship. - -# Dep: numpy -import numpy -from numpy import mean, std, percentile - -# Dep: scipy -import scipy.stats -from scipy.stats.distributions import norm -from scipy.stats.distributions import poisson - -# Std lib -from datetime import date -from datetime import timedelta -import os.path - -# Country code -> Country names -import country_info - -# write utf8 to file -import codecs - -def get_country_name_from_cc(country_code): - if (country_code.lower() in country_info.countries): - return country_info.countries[country_code.lower()] - return country_code # if we didn't find the cc in our map - -""" -Represents a .csv file containing information on the number of -connecting Tor users per country. - -'store': Dictionary with (<country code>, <counter>) as key, and the number of users as value. - <country code> can also be "date"... -'all_dates': List of the data intervals (with default timedelta: 1 day). -'country_codes': List of all relevant country codes. -'MAX_INDEX': Length of store, number of country codes etc. -'date_min': The oldest date found in the .csv. -'date_min': The latest date found in the .csv. 
-""" -class torstatstore: - def __init__(self, file_name): - f = file(file_name) - country_codes = f.readline() - country_codes = country_codes.strip().split(",") - - store = {} - MAX_INDEX = 0 - for i, line in enumerate(f): - MAX_INDEX += 1 - line_parsed = line.strip().split(",") - for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)): - processed_val = None - if ccode == "date": - try: - year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10]) - processed_val = date(year, month, day) - except Exception, e: - print "Parsing error (ignoring line %s):" % j - print "%s" % val,e - break - - elif val != "NA": - processed_val = int(val) - store[(ccode, i)] = processed_val - - # min and max - date_min = store[("date", 0)] - date_max = store[("date", i)] - - all_dates = [] - d = date_min - dt = timedelta(days=1) - while d <= date_max: - all_dates += [d] - d = d + dt - - # Save for later - self.store = store - self.all_dates = all_dates - self.country_codes = country_codes - self.MAX_INDEX = MAX_INDEX - self.date_min = date_min - self.date_max = date_max - - """Return a list representing a time series of 'ccode' with respect - to the number of connected users. - """ - def get_country_series(self, ccode): - assert ccode in self.country_codes - series = {} - for d in self.all_dates: - series[d] = None - for i in range(self.MAX_INDEX): - series[self.store[("date", i)]] = self.store[(ccode, i)] - sx = [] - for d in self.all_dates: - sx += [series[d]] - return sx - - """Return an ordered list containing tuples of the form (<number of - users>, <country code>). The list is ordered with respect to the - number of users for each country. - """ - def get_largest(self, number): - exclude = set(["all", "??", "date"]) - l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude] - l.sort() - l.reverse() - return l[:number] - - """Return a dictionary, with <country code> as key, and the time - series of the country code as the value. 
- """ - def get_largest_locations(self, number): - l = self.get_largest(number) - res = {} - for _, ccode in l[:number]: - res[ccode] = self.get_country_series(ccode) - return res - -"""Return a list containing lists (?) where each such list contains -the difference in users for a time delta of 'days' -""" -def n_day_rel(series, days): - rel = [] - for i, v in enumerate(series): - if series[i] is None: - rel += [None] - continue - - if i - days < 0 or series[i-days] is None or series[i-days] == 0: - rel += [None] - else: - rel += [ float(series[i]) / series[i-days]] - return rel - -# Main model: computes the expected min / max range of number of users -def make_tendencies_minmax(l, INTERVAL = 1): - lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l]) - c = lminus1[lminus1.keys()[0]] - dists = [] - minx = [] - maxx = [] - for i in range(len(c)): - vals = [lminus1[ccode][i] for ccode in lminus1.keys() if lminus1[ccode][i] != None] - if len(vals) < 8: - dists += [None] - minx += [None] - maxx += [None] - else: - vals.sort() - median = percentile(vals, 50) - q1 = percentile(vals, 25) - q2 = percentile(vals, 75) - qd = q2 - q1 - vals = [v for v in vals if median - qd*4 < v and v < median + qd*4] - if len(vals) < 8: - dists += [None] - minx += [None] - maxx += [None] - continue - mu = mean(vals) - signma = std(vals) - dists += [(mu, signma)] - maxx += [norm.ppf(0.9999, mu, signma)] - minx += [norm.ppf(1 - 0.9999, mu, signma)] - ## print minx[-1], maxx[-1] - return minx, maxx - -"""Write a CSV report on the minimum/maximum users of each country per date.""" -def write_all(tss, minc, maxc, RANGES_FILE, INTERVAL=7): - ranges_file = file(RANGES_FILE, "w") - ranges_file.write("date,country,minusers,maxusers\n") - exclude = set(["all", "??", "date"]) - for c in tss.country_codes: - if c in exclude: - continue - series = tss.get_country_series(c) - for i, v in enumerate(series): - if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != 
None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None: - minv = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL]) - maxv = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL]) - if not minv < maxv: - print minv, maxv, series[i-INTERVAL], minc[i], maxc[i] - assert minv < maxv - if minv < 0.0: - minv = 0.0 - ranges_file.write("%s,%s,%s,%s\n" % (tss.all_dates[i], c, minv, maxv)) - ranges_file.close() - -# INTERV is the time interval to model connection rates; -# consider maximum DAYS days back. -def detect(CSV_FILE = "userstats-detector.csv", - RANGES_FILE = "userstats-ranges.csv", - INTERV = 7, DAYS = 6 * 31): - tss = torstatstore(CSV_FILE) - l = tss.get_largest_locations(50) - minx, maxx = make_tendencies_minmax(l, INTERV) - write_all(tss, minx, maxx, RANGES_FILE, INTERV) - -def main(): - detect() - -if __name__ == "__main__": - main()
tor-commits@lists.torproject.org