[tor-commits] [metrics-web/master] Rewrite censorship detector in Java.

karsten at torproject.org karsten at torproject.org
Thu Dec 20 11:21:22 UTC 2018


commit a367168a782e864bdacb610857b1dc5d58fd192d
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Sun Dec 9 12:02:42 2018 +0100

    Rewrite censorship detector in Java.
    
    This allows us to remove the last remaining Python parts from the daily
    updater.
    
    Implements #21588.
---
 build.xml                                          |  26 --
 .../torproject/metrics/stats/clients/Detector.java | 433 +++++++++++++++++++++
 .../org/torproject/metrics/stats/clients/Main.java |   5 +
 src/main/python/clients/country_info.py            | 255 ------------
 src/main/python/clients/detector.py                | 242 ------------
 5 files changed, 438 insertions(+), 523 deletions(-)

diff --git a/build.xml b/build.xml
index 6736e19..93eda7b 100644
--- a/build.xml
+++ b/build.xml
@@ -23,7 +23,6 @@
   <property name="tardepends" value="war" />
 
   <property name="Rsources" value="${basedir}/src/main/R" />
-  <property name="pysources" value="${basedir}/src/main/python" />
 
   <property name="specdir" value="${basedir}/generated/spec" />
 
@@ -360,32 +359,7 @@
 
   <target name="clients" >
     <property name="module.name" value="clients" />
-    <property name="localmoddir" value="${modulebase}/${module.name}" />
-
-    <property name="statsdir"
-              value="${localmoddir}/stats" />
-    <mkdir dir="${statsdir}" />
-
     <antcall target="run-java" />
-
-    <antcall target="run-R" >
-      <param name="module.Rscript" value="userstats-detector.R" />
-    </antcall>
-
-    <exec executable="python"
-          dir="${localmoddir}"
-          failonerror="true" >
-      <arg value="${pysources}/${module.name}/detector.py" />
-      <arg value="userstats-detector.csv" />
-      <arg value="userstats-ranges.csv" />
-    </exec>
-
-    <antcall target="run-R" >
-      <param name="module.Rscript" value="merge-clients.R" />
-    </antcall>
-
-    <copy file="${localmoddir}/clients.csv" todir="${statsdir}" />
-    <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" />
   </target>
 
   <target name="servers" >
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Detector.java b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
new file mode 100644
index 0000000..1a523c2
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
@@ -0,0 +1,433 @@
+/* Copyright 2011 George Danezis <gdane at microsoft.com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the
+ *    distribution.
+ *
+ *  * Neither the name of <Owner Organization> nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (Clear BSD license:
+ * http://labs.metacarta.com/license-explanation.html#license)
+ *
+ * Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.clients;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.PoissonDistribution;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+/** Censorship detector that reads a .csv file of the number of Tor clients and
+ * finds anomalies that might be indicative of censorship. */
+public class Detector {
+
+  /** Input file. */
+  private static final Path INPUT_PATH = Paths.get("stats", "userstats.csv");
+
+  /** Output file. */
+  private static final Path OUTPUT_PATH = Paths.get("stats", "clients.csv");
+
+  /** Number of largest locations to be included in the detection algorithm. */
+  private static final int NUM_LARGEST_LOCATIONS = 50;
+
+  /** Time interval in days to model connection rates. */
+  private static final int INTERV = 7;
+
+  /** Compound key under which client estimates are stored in both input and
+   * output files. */
+  private static class ClientsKey implements Comparable<ClientsKey> {
+
+    /** Date when clients connected to the Tor network. */
+    private LocalDate date;
+
+    /** Whether clients connected via relays (true) or bridges (false). */
+    private boolean nodeIsRelay;
+
+    /** Two-letter lower-case country code of the country from which clients
+     * connected, "??" if the country could not be resolved, or left empty for
+     * all countries together. */
+    private String country;
+
+    /** Name of the transport used by clients to connect using bridges, or left
+     * empty for all transports together. */
+    private String transport = "";
+
+    /** IP version used by clients to connect using bridges, or left empty for
+     * all IP versions together. */
+    private String version = "";
+
+    ClientsKey(LocalDate date, boolean nodeIsRelay, String country) {
+      this.date = date;
+      this.nodeIsRelay = nodeIsRelay;
+      this.country = country;
+    }
+
+    ClientsKey(LocalDate date, boolean nodeIsRelay, String country,
+        String transport, String version) {
+      this(date, nodeIsRelay, country);
+      this.transport = transport;
+      this.version = version;
+    }
+
+    @Override
+    public int compareTo(ClientsKey other) {
+      if (!this.date.equals(other.date)) {
+        return this.date.compareTo(other.date);
+      } else if (!this.nodeIsRelay && other.nodeIsRelay) {
+        return -1;
+      } else if (this.nodeIsRelay && !other.nodeIsRelay) {
+        return 1;
+      } else if (!this.country.equals(other.country)) {
+        return this.country.compareTo(other.country);
+      } else if (!this.transport.equals(other.transport)) {
+        return this.transport.compareTo(other.transport);
+      } else if (!this.version.equals(other.version)) {
+        return this.version.compareTo(other.version);
+      } else {
+        return 0;
+      }
+    }
+
+    @Override
+    public boolean equals(Object otherObject) {
+      if (!(otherObject instanceof ClientsKey)) {
+        return false;
+      } else {
+        ClientsKey other = (ClientsKey) otherObject;
+        return this.date.equals(other.date)
+            && this.nodeIsRelay == other.nodeIsRelay
+            && this.country.equals(other.country)
+            && this.transport.equals(other.transport)
+            && this.version.equals(other.version);
+      }
+    }
+
+    @Override
+    public int hashCode() {
+      return 3 * this.date.hashCode() + (this.nodeIsRelay ? 5 : 0)
+          + 7 * this.country.hashCode() + 11 * this.transport.hashCode()
+          + 13 * this.version.hashCode();
+    }
+
+    @Override
+    public String toString() {
+      return String.format("%s,%s,%s,%s,%s",
+          this.date.toString(), this.nodeIsRelay ? "relay" : "bridge",
+          this.country, this.transport, this.version);
+    }
+  }
+
+  /** Value class that stores everything we already knew about a specific
+   * subset of clients from the input file. */
+  private static class ClientsEstimates {
+
+    /** Estimated number of clients. */
+    private int clients;
+
+    /** Fraction of relays or bridges in percent that the estimate is based on,
+     * between 0 and 100. */
+    private int frac;
+
+    ClientsEstimates(int clients, int frac) {
+      this.clients = clients;
+      this.frac = frac;
+    }
+
+    @Override
+    public String toString() {
+      return String.format("%d,%d", this.clients, this.frac);
+    }
+  }
+
+  /** Value class that stores everything we're computing here about a specific
+   * subset of clients from the input file. */
+  private static class ClientsRanges {
+
+    /** Lower number of expected clients under the assumption that there has
+     * been no censorship event, as computed here. */
+    private int lower;
+
+    /** Upper number of expected clients under the assumption that there has
+     * been no release of censorship, as computed here. */
+    private int upper;
+
+    ClientsRanges(int lower, int upper) {
+      this.lower = lower;
+      this.upper = upper;
+    }
+
+    @Override
+    public String toString() {
+      return String.format("%d,%d", this.lower, this.upper);
+    }
+  }
+
+  /** Run censorship detection. */
+  public void detect() throws IOException {
+    SortedMap<ClientsKey, ClientsEstimates> estimates = readInputFile();
+    Set<String> largestLocations = findLargestLocations(estimates);
+    Map<LocalDate, List<Double>> ratios = computeRatiosOfLargestLocations(
+        estimates, largestLocations);
+    Map<LocalDate, List<Double>> ratiosWithoutOutliers = removeOutliers(ratios);
+    SortedMap<ClientsKey, ClientsRanges> ranges = computeRanges(estimates,
+        ratiosWithoutOutliers);
+    writeOutputFile(estimates, ranges);
+  }
+
+  /** Read and return the parsed input file containing comma-separated estimates
+   * of client numbers. */
+  private static SortedMap<ClientsKey, ClientsEstimates> readInputFile()
+      throws IOException {
+    SortedMap<ClientsKey, ClientsEstimates> estimates = new TreeMap<>();
+    File inputFile = INPUT_PATH.toFile();
+    if (!inputFile.exists()) {
+      throw new IOException(String.format("Input file %s does not exist.",
+          inputFile));
+    }
+    try (LineNumberReader lnr = new LineNumberReader(
+        new FileReader(inputFile))) {
+      String line = lnr.readLine();
+      if (!"date,node,country,transport,version,frac,users".equals(line)) {
+        throw new IOException(String.format("Unable to read input file %s with "
+            + "unrecognized header line '%s'. Not running detector.", inputFile,
+            line));
+      }
+      while ((line = lnr.readLine()) != null) {
+        ClientsKey key = null;
+        ClientsEstimates value = null;
+        boolean invalidLine = false;
+        String[] lineParts = line.split(",");
+        if (lineParts.length == 7) {
+          try {
+            LocalDate date = LocalDate.parse(lineParts[0]);
+            boolean nodeIsRelay = false;
+            if ("relay".equals(lineParts[1])) {
+              nodeIsRelay = true;
+            } else if (!"bridge".equals(lineParts[1])) {
+              invalidLine = true;
+            }
+            String country = lineParts[2].replaceAll("\"", "");
+            String transport = lineParts[3].replaceAll("\"", "");
+            String version = lineParts[4].replaceAll("\"", "");
+            key = new ClientsKey(date, nodeIsRelay, country, transport,
+                version);
+          } catch (DateTimeParseException e) {
+            invalidLine = true;
+          }
+          try {
+            int frac = Integer.parseInt(lineParts[5]);
+            int clients = Integer.parseInt(lineParts[6]);
+            value = new ClientsEstimates(clients, frac);
+          } catch (NumberFormatException e) {
+            invalidLine = true;
+          }
+        } else {
+          invalidLine = true;
+        }
+        if (invalidLine) {
+          throw new IOException(String.format(
+              "Invalid line %d '%s' in input file %s.", lnr.getLineNumber(),
+              line, inputFile));
+        } else {
+          estimates.put(key, value);
+        }
+      }
+    }
+    return estimates;
+  }
+
+  /** Return the NUM_LARGEST_LOCATIONS countries (except for "??") with the
+   * largest number of estimated clients on the last known date in the input
+   * data set.
+   *
+   * <p>Note that this implies that lower/upper values are going to change,
+   * depending on which countries had most clients on the last known date in the
+   * input data set.</p> */
+  private static Set<String> findLargestLocations(
+      SortedMap<ClientsKey, ClientsEstimates> clients) throws IOException {
+    LocalDate lastKnownDate = clients.keySet().stream()
+        .filter(c -> c.nodeIsRelay)
+        .map(c -> c.date)
+        .max(LocalDate::compareTo)
+        .orElseThrow(() -> new IOException("Unable to find maximum date. Was "
+            + "the input file empty or otherwise corrupt?"));
+    return clients.entrySet().stream()
+        .filter(c -> lastKnownDate.equals(c.getKey().date))
+        .filter(c -> c.getKey().nodeIsRelay)
+        .filter(c -> !"".equals(c.getKey().country))
+        .filter(c -> !"??".equals(c.getKey().country))
+        .sorted((c1, c2) -> Integer.compare(c2.getValue().clients,
+            c1.getValue().clients))
+        .map(c -> c.getKey().country)
+        .limit(NUM_LARGEST_LOCATIONS)
+        .collect(Collectors.toSet());
+  }
+
+  /** Compute the ratio of the client number estimate for a given date and
+   * country as compared to 1 week before, for all dates, for relay users, and
+   * for the largest locations. */
+  private static Map<LocalDate, List<Double>> computeRatiosOfLargestLocations(
+      SortedMap<ClientsKey, ClientsEstimates> estimates,
+      Set<String> largestLocations) {
+    Map<LocalDate, List<Double>> ratios = new HashMap<>();
+    for (Map.Entry<ClientsKey, ClientsEstimates> numerator
+        : estimates.entrySet()) {
+      if (!numerator.getKey().nodeIsRelay
+          || !largestLocations.contains(numerator.getKey().country)) {
+        continue;
+      }
+      ClientsEstimates denominator = estimates.get(new ClientsKey(
+          numerator.getKey().date.minusDays(INTERV), true,
+          numerator.getKey().country));
+      if (null == denominator || denominator.clients == 0) {
+        continue;
+      }
+      if (!ratios.containsKey(numerator.getKey().date)) {
+        ratios.put(numerator.getKey().date, new ArrayList<>());
+      }
+      ratios.get(numerator.getKey().date).add(
+          ((double) numerator.getValue().clients)
+              / (double) denominator.clients);
+    }
+    return ratios;
+  }
+
+  /** Exclude outliers from the given ratios by date that fall outside four
+   * inter-quartile ranges of the median and make sure that at least 8 ratio
+   * values remain. */
+  private static SortedMap<LocalDate, List<Double>> removeOutliers(
+      Map<LocalDate, List<Double>> ratios) {
+    SortedMap<LocalDate, List<Double>> ratiosWithoutOutliers = new TreeMap<>();
+    for (Map.Entry<LocalDate, List<Double>> e : ratios.entrySet()) {
+      double[] values = e.getValue().stream().mapToDouble(Double::doubleValue)
+          .toArray();
+      Percentile percentile = new Percentile()
+          .withEstimationType(Percentile.EstimationType.R_7);
+      percentile.setData(values);
+      double median = percentile.evaluate(50.0);
+      double firstQuarter = percentile.evaluate(25.0);
+      double thirdQuarter = percentile.evaluate(75.0);
+      double interQuartileRange = thirdQuarter - firstQuarter;
+      List<Double> valuesWithoutOutliers = new ArrayList<>();
+      for (double value : values) {
+        if (value > median - 4 * interQuartileRange
+            && value < median + 4 * interQuartileRange) {
+          valuesWithoutOutliers.add(value);
+        }
+      }
+      if (valuesWithoutOutliers.size() < 8) {
+        continue;
+      }
+      LocalDate date = e.getKey();
+      ratiosWithoutOutliers.put(date, valuesWithoutOutliers);
+    }
+    return ratiosWithoutOutliers;
+  }
+
+  /** Compute ranges as the expected minimum and maximum number of users. */
+  private static SortedMap<ClientsKey, ClientsRanges> computeRanges(
+      SortedMap<ClientsKey, ClientsEstimates> estimates,
+      Map<LocalDate, List<Double>> ratiosWithoutOutliers) {
+    SortedMap<ClientsKey, ClientsRanges> ranges = new TreeMap<>();
+    for (Map.Entry<ClientsKey, ClientsEstimates> estimatesEntry
+        : estimates.entrySet()) {
+      LocalDate date = estimatesEntry.getKey().date;
+      if (!estimatesEntry.getKey().nodeIsRelay
+          || "".equals(estimatesEntry.getKey().country)
+          || "??".equals(estimatesEntry.getKey().country)
+          || !ratiosWithoutOutliers.containsKey(date)) {
+        continue;
+      }
+      ClientsEstimates referenceEstimate = estimates.get(
+          new ClientsKey(date.minusDays(INTERV),
+          true, estimatesEntry.getKey().country));
+      if (null == referenceEstimate || referenceEstimate.clients == 0) {
+        continue;
+      }
+      double[] values = ratiosWithoutOutliers.get(date).stream()
+          .mapToDouble(Double::doubleValue).toArray();
+      double mean = new Mean().evaluate(values);
+      double std = new StandardDeviation(false).evaluate(values);
+      NormalDistribution normalDistribution = new NormalDistribution(mean, std);
+      PoissonDistribution poissonDistribution
+          = new PoissonDistribution(referenceEstimate.clients);
+      int lower = Math.max(0,
+          (int) (normalDistribution.inverseCumulativeProbability(0.0001)
+              * poissonDistribution.inverseCumulativeProbability(0.0001)));
+      int upper =
+          (int) (normalDistribution.inverseCumulativeProbability(0.9999)
+              * poissonDistribution.inverseCumulativeProbability(0.9999));
+      ranges.put(estimatesEntry.getKey(), new ClientsRanges(lower, upper));
+    }
+    return ranges;
+  }
+
+  /** Write client number estimates together with lower and upper bounds as
+   * comma-separated values to the output file. */
+  private static void writeOutputFile(
+      SortedMap<ClientsKey, ClientsEstimates> estimates,
+      SortedMap<ClientsKey, ClientsRanges> ranges) throws IOException {
+    try (BufferedWriter bw = new BufferedWriter(
+        new FileWriter(OUTPUT_PATH.toFile()))) {
+      bw.write(
+          "date,node,country,transport,version,lower,upper,clients,frac\n");
+      for (Map.Entry<ClientsKey, ClientsEstimates> e : estimates.entrySet()) {
+        String rangesString = ",";
+        if (ranges.containsKey(e.getKey())) {
+          rangesString = ranges.get(e.getKey()).toString();
+        }
+        bw.write(String.format("%s,%s,%s%n", e.getKey().toString(),
+            rangesString, e.getValue().toString()));
+      }
+    }
+  }
+}
+
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Main.java b/src/main/java/org/torproject/metrics/stats/clients/Main.java
index 48d8d8d..0f1087b 100644
--- a/src/main/java/org/torproject/metrics/stats/clients/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/clients/Main.java
@@ -59,6 +59,11 @@ public class Main {
 
     log.info("Disconnecting from database.");
     database.close();
+
+    log.info("Running detector.");
+    new Detector().detect();
+
+    log.info("Terminating clients module.");
   }
 
   private static final long ONE_HOUR_MILLIS = 60L * 60L * 1000L;
diff --git a/src/main/python/clients/country_info.py b/src/main/python/clients/country_info.py
deleted file mode 100644
index 1a505d0..0000000
--- a/src/main/python/clients/country_info.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# -*- coding: utf-8 -*-
-
-countries = {
-    "ad" : "Andorra",
-    "ae" : "the United Arab Emirates",
-    "af" : "Afghanistan",
-    "ag" : "Antigua and Barbuda",
-    "ai" : "Anguilla",
-    "al" : "Albania",
-    "am" : "Armenia",
-    "an" : "the Netherlands Antilles",
-    "ao" : "Angola",
-    "aq" : "Antarctica",
-    "ar" : "Argentina",
-    "as" : "American Samoa",
-    "at" : "Austria",
-    "au" : "Australia",
-    "aw" : "Aruba",
-    "ax" : "the Aland Islands",
-    "az" : "Azerbaijan",
-    "ba" : "Bosnia and Herzegovina",
-    "bb" : "Barbados",
-    "bd" : "Bangladesh",
-    "be" : "Belgium",
-    "bf" : "Burkina Faso",
-    "bg" : "Bulgaria",
-    "bh" : "Bahrain",
-    "bi" : "Burundi",
-    "bj" : "Benin",
-    "bl" : "Saint Bartelemey",
-    "bm" : "Bermuda",
-    "bn" : "Brunei",
-    "bo" : "Bolivia",
-    "bq" : "Bonaire, Sint Eustatius and Saba",
-    "br" : "Brazil",
-    "bs" : "the Bahamas",
-    "bt" : "Bhutan",
-    "bv" : "the Bouvet Island",
-    "bw" : "Botswana",
-    "by" : "Belarus",
-    "bz" : "Belize",
-    "ca" : "Canada",
-    "cc" : "the Cocos (Keeling) Islands",
-    "cd" : "the Democratic Republic of the Congo",
-    "cf" : "Central African Republic",
-    "cg" : "Congo",
-    "ch" : "Switzerland",
-    "ci" :  u"Côte d'Ivoire",
-    "ck" : "the Cook Islands",
-    "cl" : "Chile",
-    "cm" : "Cameroon",
-    "cn" : "China",
-    "co" : "Colombia",
-    "cr" : "Costa Rica",
-    "cu" : "Cuba",
-    "cv" : "Cape Verde",
-    "cw" : u"Curaçao",
-    "cx" : "the Christmas Island",
-    "cy" : "Cyprus",
-    "cz" : "the Czech Republic",
-    "de" : "Germany",
-    "dj" : "Djibouti",
-    "dk" : "Denmark",
-    "dm" : "Dominica",
-    "do" : "the Dominican Republic",
-    "dz" : "Algeria",
-    "ec" : "Ecuador",
-    "ee" : "Estonia",
-    "eg" : "Egypt",
-    "eh" : "the Western Sahara",
-    "er" : "Eritrea",
-    "es" : "Spain",
-    "et" : "Ethiopia",
-    "fi" : "Finland",
-    "fj" : "Fiji",
-    "fk" : "the Falkland Islands (Malvinas)",
-    "fm" : "the Federated States of Micronesia",
-    "fo" : "the Faroe Islands",
-    "fr" : "France",
-    "ga" : "Gabon",
-    "gb" : "the United Kingdom",
-    "gd" : "Grenada",
-    "ge" : "Georgia",
-    "gf" : "French Guiana",
-    "gg" : "Guernsey",
-    "gh" : "Ghana",
-    "gi" : "Gibraltar",
-    "gl" : "Greenland",
-    "gm" : "Gambia",
-    "gn" : "Guinea",
-    "gp" : "Guadeloupe",
-    "gq" : "Equatorial Guinea",
-    "gr" : "Greece",
-    "gs" : "South Georgia and the South Sandwich Islands",
-    "gt" : "Guatemala",
-    "gu" : "Guam",
-    "gw" : "Guinea-Bissau",
-    "gy" : "Guyana",
-    "hk" : "Hong Kong",
-    "hm" : "Heard Island and McDonald Islands",
-    "hn" : "Honduras",
-    "hr" : "Croatia",
-    "ht" : "Haiti",
-    "hu" : "Hungary",
-    "id" : "Indonesia",
-    "ie" : "Ireland",
-    "il" : "Israel",
-    "im" : "the Isle of Man",
-    "in" : "India",
-    "io" : "the British Indian Ocean Territory",
-    "iq" : "Iraq",
-    "ir" : "Iran",
-    "is" : "Iceland",
-    "it" : "Italy",
-    "je" : "Jersey",
-    "jm" : "Jamaica",
-    "jo" : "Jordan",
-    "jp" : "Japan",
-    "ke" : "Kenya",
-    "kg" : "Kyrgyzstan",
-    "kh" : "Cambodia",
-    "ki" : "Kiribati",
-    "km" : "Comoros",
-    "kn" : "Saint Kitts and Nevis",
-    "kp" : "North Korea",
-    "kr" : "the Republic of Korea",
-    "kw" : "Kuwait",
-    "ky" : "the Cayman Islands",
-    "kz" : "Kazakhstan",
-    "la" : "Laos",
-    "lb" : "Lebanon",
-    "lc" : "Saint Lucia",
-    "li" : "Liechtenstein",
-    "lk" : "Sri Lanka",
-    "lr" : "Liberia",
-    "ls" : "Lesotho",
-    "lt" : "Lithuania",
-    "lu" : "Luxembourg",
-    "lv" : "Latvia",
-    "ly" : "Libya",
-    "ma" : "Morocco",
-    "mc" : "Monaco",
-    "md" : "the Republic of Moldova",
-    "me" : "Montenegro",
-    "mf" : "Saint Martin",
-    "mg" : "Madagascar",
-    "mh" : "the Marshall Islands",
-    "mk" : "Macedonia",
-    "ml" : "Mali",
-    "mm" : "Burma",
-    "mn" : "Mongolia",
-    "mo" : "Macau",
-    "mp" : "the Northern Mariana Islands",
-    "mq" : "Martinique",
-    "mr" : "Mauritania",
-    "ms" : "Montserrat",
-    "mt" : "Malta",
-    "mu" : "Mauritius",
-    "mv" : "the Maldives",
-    "mw" : "Malawi",
-    "mx" : "Mexico",
-    "my" : "Malaysia",
-    "mz" : "Mozambique",
-    "na" : "Namibia",
-    "nc" : "New Caledonia",
-    "ne" : "Niger",
-    "nf" : "Norfolk Island",
-    "ng" : "Nigeria",
-    "ni" : "Nicaragua",
-    "nl" : "the Netherlands",
-    "no" : "Norway",
-    "np" : "Nepal",
-    "nr" : "Nauru",
-    "nu" : "Niue",
-    "nz" : "New Zealand",
-    "om" : "Oman",
-    "pa" : "Panama",
-    "pe" : "Peru",
-    "pf" : "French Polynesia",
-    "pg" : "Papua New Guinea",
-    "ph" : "the Philippines",
-    "pk" : "Pakistan",
-    "pl" : "Poland",
-    "pm" : "Saint Pierre and Miquelon",
-    "pn" : "the Pitcairn Islands",
-    "pr" : "Puerto Rico",
-    "ps" : "the Palestinian Territory",
-    "pt" : "Portugal",
-    "pw" : "Palau",
-    "py" : "Paraguay",
-    "qa" : "Qatar",
-    "re" : "Reunion",
-    "ro" : "Romania",
-    "rs" : "Serbia",
-    "ru" : "Russia",
-    "rw" : "Rwanda",
-    "sa" : "Saudi Arabia",
-    "sb" : "the Solomon Islands",
-    "sc" : "the Seychelles",
-    "sd" : "Sudan",
-    "se" : "Sweden",
-    "sg" : "Singapore",
-    "sh" : "Saint Helena",
-    "si" : "Slovenia",
-    "sj" : "Svalbard and Jan Mayen",
-    "sk" : "Slovakia",
-    "sl" : "Sierra Leone",
-    "sm" : "San Marino",
-    "sn" : "Senegal",
-    "so" : "Somalia",
-    "sr" : "Suriname",
-    "ss" : "South Sudan",
-    "st" : u"São Tomé and Príncipe",
-    "sv" : "El Salvador",
-    "sx" : "Sint Maarten",
-    "sy" : "the Syrian Arab Republic",
-    "sz" : "Swaziland",
-    "tc" : "Turks and Caicos Islands",
-    "td" : "Chad",
-    "tf" : "the French Southern Territories",
-    "tg" : "Togo",
-    "th" : "Thailand",
-    "tj" : "Tajikistan",
-    "tk" : "Tokelau",
-    "tl" : "East Timor",
-    "tm" : "Turkmenistan",
-    "tn" : "Tunisia",
-    "to" : "Tonga",
-    "tr" : "Turkey",
-    "tt" : "Trinidad and Tobago",
-    "tv" : "Tuvalu",
-    "tw" : "Taiwan",
-    "tz" : "the United Republic of Tanzania",
-    "ua" : "Ukraine",
-    "ug" : "Uganda",
-    "um" : "the United States Minor Outlying Islands",
-    "us" : "the United States",
-    "uy" : "Uruguay",
-    "uz" : "Uzbekistan",
-    "va" : "Vatican City",
-    "vc" : "Saint Vincent and the Grenadines",
-    "ve" : "Venezuela",
-    "vg" : "the British Virgin Islands",
-    "vi" : "the United States Virgin Islands",
-    "vn" : "Vietnam",
-    "vu" : "Vanuatu",
-    "wf" : "Wallis and Futuna",
-    "ws" : "Samoa",
-    "xk" : "Kosovo",
-    "ye" : "Yemen",
-    "yt" : "Mayotte",
-    "za" : "South Africa",
-    "zm" : "Zambia",
-    "zw" : "Zimbabwe"
-    }
diff --git a/src/main/python/clients/detector.py b/src/main/python/clients/detector.py
deleted file mode 100644
index b0a98af..0000000
--- a/src/main/python/clients/detector.py
+++ /dev/null
@@ -1,242 +0,0 @@
-##  Copyright (c) 2011 George Danezis <gdane at microsoft.com>
-##
-##  All rights reserved.
-##
-##  Redistribution and use in source and binary forms, with or without
-##  modification, are permitted (subject to the limitations in the
-##  disclaimer below) provided that the following conditions are met:
-##
-##   * Redistributions of source code must retain the above copyright
-##     notice, this list of conditions and the following disclaimer.
-##
-##   * Redistributions in binary form must reproduce the above copyright
-##     notice, this list of conditions and the following disclaimer in the
-##     documentation and/or other materials provided with the
-##     distribution.
-##
-##   * Neither the name of <Owner Organization> nor the names of its
-##     contributors may be used to endorse or promote products derived
-##     from this software without specific prior written permission.
-##
-##  NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-##  GRANTED BY THIS LICENSE.  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
-##  HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-##  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-##  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-##  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-##  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-##  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-##  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-##  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-##  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-##  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-##  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-##
-##  (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
-
-##  This script reads a .csv file of the number of Tor users and finds
-##  anomalies that might be indicative of censorship.
-
-# Dep: numpy
-import numpy
-from numpy import mean, std, percentile
-
-# Dep: scipy
-import scipy.stats
-from scipy.stats.distributions import norm
-from scipy.stats.distributions import poisson
-
-# Std lib
-from datetime import date
-from datetime import timedelta
-import os.path
-
-# Country code -> Country names
-import country_info
-
-# write utf8 to file
-import codecs
-
-def get_country_name_from_cc(country_code):
-  if (country_code.lower() in country_info.countries):
-    return country_info.countries[country_code.lower()]
-  return country_code # if we didn't find the cc in our map
-
-"""
-Represents a .csv file containing information on the number of
-connecting Tor users per country.
-
-'store': Dictionary with (<country code>, <counter>) as key, and the number of users as value.
-         <country code> can also be "date"...
-'all_dates': List of the data intervals (with default timedelta: 1 day).
-'country_codes': List of all relevant country codes.
-'MAX_INDEX': Number of data lines (rows after the header line) read from the .csv.
-'date_min': The oldest date found in the .csv.
-'date_max': The latest date found in the .csv.
-"""
-class torstatstore:
-  def __init__(self, file_name):
-    f = file(file_name)
-    country_codes = f.readline()
-    country_codes = country_codes.strip().split(",")
-
-    store = {}
-    MAX_INDEX = 0
-    for i, line in enumerate(f):
-        MAX_INDEX += 1
-        line_parsed = line.strip().split(",")
-        for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)):
-            processed_val = None
-            if ccode == "date":
-                try:
-                    year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
-                    processed_val = date(year, month, day)
-                except Exception, e:
-                    print "Parsing error (ignoring line %s):" % j
-                    print "%s" % val,e
-                    break
-
-            elif val != "NA":
-                processed_val = int(val)
-            store[(ccode, i)] = processed_val
-
-    # min and max
-    date_min = store[("date", 0)]
-    date_max = store[("date", i)]
-
-    all_dates = []
-    d = date_min
-    dt = timedelta(days=1)
-    while d <= date_max:
-        all_dates += [d]
-        d = d + dt
-
-    # Save for later
-    self.store = store
-    self.all_dates = all_dates
-    self.country_codes = country_codes
-    self.MAX_INDEX = MAX_INDEX
-    self.date_min = date_min
-    self.date_max = date_max
-
-  """Return a list representing a time series of 'ccode' with respect
-  to the number of connected users.
-  """
-  def get_country_series(self, ccode):
-    assert ccode in self.country_codes
-    series = {}
-    for d in self.all_dates:
-        series[d] = None
-    for i in range(self.MAX_INDEX):
-        series[self.store[("date", i)]] = self.store[(ccode, i)]
-    sx = []
-    for d in self.all_dates:
-        sx += [series[d]]
-    return sx
-
-  """Return an ordered list containing tuples of the form (<number of
-  users>, <country code>). The list is ordered with respect to the
-  number of users for each country.
-  """
-  def get_largest(self, number):
-    exclude = set(["all", "??", "date"])
-    l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
-    l.sort()
-    l.reverse()
-    return l[:number]
-
-  """Return a dictionary, with <country code> as key, and the time
-  series of the country code as the value.
-  """
-  def get_largest_locations(self, number):
-    l = self.get_largest(number)
-    res = {}
-    for _, ccode in l[:number]:
-      res[ccode] = self.get_country_series(ccode)
-    return res
-
-"""Return a flat list in which each entry is the ratio of the number of
-users on a day to the number 'days' earlier, or None where it is undefined.
-"""
-def n_day_rel(series, days):
-  rel = []
-  for i, v in enumerate(series):
-    if series[i] is None:
-      rel += [None]
-      continue
-
-    if i - days < 0 or series[i-days] is None or series[i-days] == 0:
-      rel += [None]
-    else:
-      rel += [ float(series[i]) / series[i-days]]
-  return rel
-
-# Main model: computes the expected min / max range of number of users
-def make_tendencies_minmax(l, INTERVAL = 1):
-  lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
-  c = lminus1[lminus1.keys()[0]]
-  dists = []
-  minx = []
-  maxx = []
-  for i in range(len(c)):
-    vals = [lminus1[ccode][i] for ccode in lminus1.keys() if lminus1[ccode][i] != None]
-    if len(vals) < 8:
-      dists += [None]
-      minx += [None]
-      maxx += [None]
-    else:
-      vals.sort()
-      median = percentile(vals, 50)
-      q1 = percentile(vals, 25)
-      q2 = percentile(vals, 75)
-      qd = q2 - q1
-      vals = [v for v in vals if median - qd*4 < v and  v < median + qd*4]
-      if len(vals) < 8:
-        dists += [None]
-        minx += [None]
-        maxx += [None]
-        continue
-      mu = mean(vals)
-      signma = std(vals)
-      dists += [(mu, signma)]
-      maxx += [norm.ppf(0.9999, mu, signma)]
-      minx += [norm.ppf(1 - 0.9999, mu, signma)]
-  ## print minx[-1], maxx[-1]
-  return minx, maxx
-
-"""Write a CSV report on the minimum/maximum users of each country per date."""
-def write_all(tss, minc, maxc, RANGES_FILE, INTERVAL=7):
-  ranges_file = file(RANGES_FILE, "w")
-  ranges_file.write("date,country,minusers,maxusers\n")
-  exclude = set(["all", "??", "date"])
-  for c in tss.country_codes:
-    if c in exclude:
-      continue
-    series = tss.get_country_series(c)
-    for i, v in enumerate(series):
-      if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None:
-        minv = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
-        maxv = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
-        if not minv < maxv:
-          print minv, maxv, series[i-INTERVAL], minc[i], maxc[i]
-        assert minv < maxv
-        if minv < 0.0:
-          minv = 0.0
-        ranges_file.write("%s,%s,%s,%s\n" % (tss.all_dates[i], c, minv, maxv))
-  ranges_file.close()
-
-# INTERV is the time interval, in days, used to model connection rates;
-# DAYS (the maximum number of days to look back) is accepted but not used.
-def detect(CSV_FILE = "userstats-detector.csv",
-           RANGES_FILE = "userstats-ranges.csv",
-           INTERV = 7, DAYS = 6 * 31):
-  tss = torstatstore(CSV_FILE)
-  l = tss.get_largest_locations(50)
-  minx, maxx = make_tendencies_minmax(l, INTERV)
-  write_all(tss, minx, maxx, RANGES_FILE, INTERV)
-
-def main():
-  detect()
-
-if __name__ == "__main__":
-    main()



More information about the tor-commits mailing list