[tor-commits] [metrics-web/master] Extend censorship detector to new user estimates.

karsten at torproject.org karsten at torproject.org
Mon Sep 16 18:00:12 UTC 2013


commit 41565da93ef5cc3e562f21b8695dbf251b55459a
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Sep 16 19:17:27 2013 +0200

    Extend censorship detector to new user estimates.
---
 detector/detector.py                               |   30 ++++++-----
 detector/detector.sh                               |    1 +
 rserve/csv.R                                       |   16 ++++++
 rserve/graphs.R                                    |   57 +++++++++++++++-----
 rserve/tables.R                                    |   38 +++++++++++--
 .../ernie/web/graphs/GraphsSubpagesServlet.java    |    2 +-
 .../ernie/web/graphs/RObjectGenerator.java         |    5 +-
 web/WEB-INF/users.jsp                              |   45 ++++++++++++++++
 8 files changed, 160 insertions(+), 34 deletions(-)

diff --git a/detector/detector.py b/detector/detector.py
index 7f924db..2c51b5b 100644
--- a/detector/detector.py
+++ b/detector/detector.py
@@ -348,8 +348,8 @@ def plot_all(tss, minx, maxx, INTERV, DAYS=None, rdir="img"):
   summary_file.close()
 
 """Write a CSV report on the minimum/maximum users of each country per date."""
-def write_all(tss, minc, maxc, INTERVAL=7):
-  ranges_file = file("direct-users-ranges.csv", "w")
+def write_all(tss, minc, maxc, RANGES_FILE, INTERVAL=7):
+  ranges_file = file(RANGES_FILE, "w")
   ranges_file.write("date,country,minusers,maxusers\n")
   exclude = set(["all", "??", "date"])
   for c in tss.country_codes:
@@ -415,23 +415,25 @@ def write_ml_report(tss, minx, maxx, INTERV, DAYS, notification_period=None):
 
   report_file.close()
 
-def main():
-  # Change these to customize script
-  CSV_FILE = "direct-users.csv"
-  GRAPH_DIR = "img"
-  # Time interval to model connection rates.
-  INTERV = 7
-  # Consider maximum DAYS days back.
-  DAYS= 6 * 31
-
+# INTERV is the time interval to model connection rates;
+# consider maximum DAYS days back.
+def detect(CSV_FILE = "userstats-detector.csv",
+           RANGES_FILE = "userstats-ranges.csv", GRAPH_DIR = "img",
+           INTERV = 7, DAYS = 6 * 31, REPORT = True):
   tss = torstatstore(CSV_FILE)
   l = tss.get_largest_locations(50)
   minx, maxx = make_tendencies_minmax(l, INTERV)
   #plot_all(tss, minx, maxx, INTERV, DAYS, rdir=GRAPH_DIR)
-  write_all(tss, minx, maxx, INTERV)
+  write_all(tss, minx, maxx, RANGES_FILE, INTERV)
 
-  # Make our short report; only consider events of the last day
-  write_ml_report(tss, minx, maxx, INTERV, DAYS, 1)
+  if REPORT:
+    # Make our short report; only consider events of the last day
+    write_ml_report(tss, minx, maxx, INTERV, DAYS, 1)
+
+def main():
+  detect(CSV_FILE = "direct-users.csv",
+         RANGES_FILE = "direct-users-ranges.csv")
+  detect(REPORT = False)
 
 if __name__ == "__main__":
     main()
diff --git a/detector/detector.sh b/detector/detector.sh
index 8e2ea47..56f6886 100755
--- a/detector/detector.sh
+++ b/detector/detector.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 wget -qO direct-users.csv --no-check-certificate https://metrics.torproject.org/csv/direct-users.csv
+wget -qO userstats-detector.csv --no-check-certificate https://metrics.torproject.org/csv/userstats-detector.csv
 python detector.py
 cat short_censorship_report.txt | mail -E -s 'Possible censorship events' tor-censorship-events at lists.torproject.org
 
diff --git a/rserve/csv.R b/rserve/csv.R
index 531e73f..34a3f91 100644
--- a/rserve/csv.R
+++ b/rserve/csv.R
@@ -328,3 +328,19 @@ export_monthly_userstats_average <- function(path) {
   help_export_monthly_userstats(path, mean)
 }
 
+export_userstats_detector <- function(path) {
+  u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
+    "task-8462/userstats.csv", sep = ""),
+    stringsAsFactors = FALSE)
+  u <- u[u$country != '' & u$transport == '' & u$version == '' &
+         u$node == 'relay', c("country", "date", "users")]
+  u <- rbind(u, data.frame(country = "zy",
+                aggregate(list(users = u$users),
+                          by = list(date = u$date), sum)))
+  u <- data.frame(date = u$date, country = u$country,
+                  users = floor(u$users))
+  u <- cast(u, date ~ country, value = "users")
+  names(u)[names(u) == "zy"] <- "all"
+  write.csv(u, path, quote = FALSE, row.names = FALSE)
+}
+
diff --git a/rserve/graphs.R b/rserve/graphs.R
index 69f9aa9..da39327 100644
--- a/rserve/graphs.R
+++ b/rserve/graphs.R
@@ -690,8 +690,8 @@ plot_direct_users <- function(start, end, country, events, path) {
       if (length(r$maxusers) > 0)
         max_y <- max(max_y, max(r$maxusers, na.rm = TRUE))
       plot <- plot +
-        geom_ribbon(data = r, aes(ymin = minusers, ymax = maxusers),
-            fill = "gray")
+        geom_ribbon(data = r, aes(ymin = pmax(0, minusers),
+            ymax = maxusers), fill = "gray")  # pmax: per-row floor at 0
     }
     if (length(upturns$date) > 0)
       plot <- plot +
@@ -1022,7 +1022,8 @@ plot_bandwidth_flags <- function(start, end, path) {
   ggsave(filename = path, width = 8, height = 5, dpi = 72)
 }
 
-plot_userstats <- function(start, end, node, variable, value, path) {
+plot_userstats <- function(start, end, node, variable, value, events,
+                           path) {
   end <- min(end, as.character(Sys.Date()))
   u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
     "task-8462/userstats.csv", sep = ""),
@@ -1052,7 +1053,7 @@ plot_userstats <- function(start, end, node, variable, value, path) {
                      " (BETA)\n", sep = "")
     } else {
       u <- u[u$country == '' & u$transport == '' & u$version == '' &
-            u$node == 'bridge', ]
+             u$node == 'bridge', ]
       title <- "Bridge users (BETA)\n"
     }
   }
@@ -1068,32 +1069,62 @@ plot_userstats <- function(start, end, node, variable, value, path) {
   formatter <- function(x, ...) { format(x, scientific = FALSE, ...) }
   date_breaks <- date_breaks(
     as.numeric(max(u$date) - min(u$date)))
-  ggplot(u, aes(x = date, y = users)) +
+  max_y <- ifelse(length(na.omit(u$users)) == 0, 0,
+      max(u$users, na.rm = TRUE))
+  plot <- ggplot(u, aes(x = date, y = users))
+  if (length(na.omit(u$users)) > 0 & events != "off" &
+      variable == 'country' & value != "all") {
+    r <- read.csv(
+      "/srv/metrics.torproject.org/web/detector/userstats-ranges.csv",
+      stringsAsFactors = FALSE)
+    r <- r[r$date >= start & r$date <= end & r$country == value,
+        c("date", "minusers", "maxusers")]
+    r <- cast(rbind(melt(u, id.vars = "date"), melt(r, id.vars = "date")))
+    upturns <- r[r$users > r$maxusers, 1:2]
+    downturns <- r[r$users < r$minusers, 1:2]
+    if (events == "on") {
+      if (length(r$maxusers) > 0)
+        max_y <- max(max_y, max(r$maxusers, na.rm = TRUE))
+      plot <- plot +
+        geom_ribbon(data = r, aes(ymin = pmax(0, minusers),
+            ymax = maxusers), fill = "gray")  # pmax: per-row floor at 0
+    }
+    if (length(upturns$date) > 0)
+      plot <- plot +
+          geom_point(data = upturns, aes(x = date, y = users), size = 5,
+          colour = "dodgerblue2")
+    if (length(downturns$date) > 0)
+      plot <- plot +
+          geom_point(data = downturns, aes(x = date, y = users), size = 5,
+          colour = "firebrick2")
+  }
+  plot <- plot +
     geom_line(size = 1) +
     scale_x_date(name = paste("\nThe Tor Project - ",
         "https://metrics.torproject.org/", sep = ""),
         format = date_breaks$format, major = date_breaks$major,
         minor = date_breaks$minor) +
-    scale_y_continuous(name = "", limits = c(0,
-        ifelse(length(na.omit(u$users)) == 0, 0,
-        max(u$users, na.rm = TRUE))), formatter = formatter) +
+    scale_y_continuous(name = "", limits = c(0, max_y),
+        formatter = formatter) +  # keep chaining so opts(title) is applied
     opts(title = title)
   ggsave(filename = path, width = 8, height = 5, dpi = 72)
 }
 
-plot_userstats_relay_country <- function(start, end, country, path) {
-  plot_userstats(start, end, 'relay', 'country', country, path)
+plot_userstats_relay_country <- function(start, end, country, events,
+    path) {
+  plot_userstats(start, end, 'relay', 'country', country, events, path)
 }
 
 plot_userstats_bridge_country <- function(start, end, country, path) {
-  plot_userstats(start, end, 'bridge', 'country', country, path)
+  plot_userstats(start, end, 'bridge', 'country', country, 'off', path)
 }
 
 plot_userstats_bridge_transport <- function(start, end, transport, path) {
-  plot_userstats(start, end, 'bridge', 'transport', transport, path)
+  plot_userstats(start, end, 'bridge', 'transport', transport, 'off',
+    path)
 }
 
 plot_userstats_bridge_version <- function(start, end, version, path) {
-  plot_userstats(start, end, 'bridge', 'version', version, path)
+  plot_userstats(start, end, 'bridge', 'version', version, 'off', path)
 }
 
diff --git a/rserve/tables.R b/rserve/tables.R
index e0dc1e4..59593d6 100644
--- a/rserve/tables.R
+++ b/rserve/tables.R
@@ -42,9 +42,6 @@ write_censorship_events <- function(start, end, path) {
   u <- data.frame(date = u$date, country = u$country,
        users = u$r * (u$bwp * u$brn / u$bwn - u$brp) /
                (u$bwr * u$brn / u$bwn - u$brr) / 10)
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by="1 day")
-  missing <- setdiff(dates, u$date)
   r <- read.csv(
     "/srv/metrics.torproject.org/web/detector/direct-users-ranges.csv",
     stringsAsFactors = FALSE)
@@ -60,10 +57,10 @@ write_censorship_events <- function(start, end, path) {
     by = list(country = r$country), sum)
   r <- r[!(r$country %in% c("zy", "??", "a1", "a2", "o1", "ap", "eu")), ]
   r <- r[order(r$downturn, r$upturn, decreasing = TRUE), ]
-  r <- r[1:10, ] 
+  r <- head(r, 10)  # at most 10 rows; r[1:10, ] pads with NA rows
   r <- data.frame(cc = r$country,
     country = sub('the ', '', countrynames(as.character(r$country))),
-    downturns = r$downturn, 
+    downturns = r$downturn,
     upturns = r$upturn)
   write.csv(r, path, quote = FALSE, row.names = FALSE)
 }
@@ -122,3 +119,34 @@ write_userstats_bridge <- function(start, end, path) {
   write_userstats(start, end, 'bridge', path)
 }
 
+write_userstats_censorship_events <- function(start, end, path) {
+  end <- min(end, as.character(Sys.Date()))
+  u <- read.csv(paste("/srv/metrics.torproject.org/task-8462-graphs/",
+    "task-8462/userstats.csv", sep = ""),
+    stringsAsFactors = FALSE)
+  u <- u[u$date >= start & u$date <= end & u$country != '' &
+         u$transport == '' & u$version == '' & u$node == 'relay',
+         c("date", "country", "users")]
+  r <- read.csv(
+    "/srv/metrics.torproject.org/web/detector/userstats-ranges.csv",
+    stringsAsFactors = FALSE)
+  r <- r[r$date >= start & r$date <= end,
+      c("date", "country", "minusers", "maxusers")]
+  r <- cast(rbind(melt(u, id.vars = c("date", "country")),
+      melt(r, id.vars = c("date", "country"))))
+  r <- na.omit(r[r$users < r$minusers | r$users > r$maxusers, ])
+  r <- data.frame(date = r$date, country = r$country,
+    upturn = ifelse(r$users > r$maxusers, 1, 0),
+    downturn = ifelse(r$users < r$minusers, 1, 0))
+  r <- aggregate(r[, c("upturn", "downturn")],
+    by = list(country = r$country), sum)
+  r <- r[!(r$country %in% c("zy", "??", "a1", "a2", "o1", "ap", "eu")), ]
+  r <- r[order(r$downturn, r$upturn, decreasing = TRUE), ]
+  r <- head(r, 10)  # at most 10 rows; r[1:10, ] pads with NA rows
+  r <- data.frame(cc = r$country,
+    country = sub('the ', '', countrynames(as.character(r$country))),
+    downturns = r$downturn,
+    upturns = r$upturn)
+  write.csv(r, path, quote = FALSE, row.names = FALSE)
+}
+
diff --git a/src/org/torproject/ernie/web/graphs/GraphsSubpagesServlet.java b/src/org/torproject/ernie/web/graphs/GraphsSubpagesServlet.java
index 3ac99bb..c79f1e3 100644
--- a/src/org/torproject/ernie/web/graphs/GraphsSubpagesServlet.java
+++ b/src/org/torproject/ernie/web/graphs/GraphsSubpagesServlet.java
@@ -52,7 +52,7 @@ public class GraphsSubpagesServlet extends HttpServlet {
     this.availableGraphsSubpageTables.put("users.html",
         new HashSet<String>(Arrays.asList((
         "direct-users,censorship-events,bridge-users,userstats-relay,"
-        + "userstats-bridge").split(","))));
+        + "userstats-censorship-events,userstats-bridge").split(","))));
 
     this.knownCountries = Countries.getInstance().getCountryList();
   }
diff --git a/src/org/torproject/ernie/web/graphs/RObjectGenerator.java b/src/org/torproject/ernie/web/graphs/RObjectGenerator.java
index 84d61c6..2fa0cc6 100644
--- a/src/org/torproject/ernie/web/graphs/RObjectGenerator.java
+++ b/src/org/torproject/ernie/web/graphs/RObjectGenerator.java
@@ -76,6 +76,7 @@ public class RObjectGenerator implements ServletContextListener {
     this.availableCsvFiles.add("torperf");
     this.availableCsvFiles.add("torperf-failures");
     this.availableCsvFiles.add("userstats");
+    this.availableCsvFiles.add("userstats-detector");
     this.availableCsvFiles.add("versions");
 
     this.availableTables = new HashMap<String, String>();
@@ -84,6 +85,8 @@ public class RObjectGenerator implements ServletContextListener {
     this.availableTables.put("bridge-users", "start,end,filename");
     this.availableTables.put("userstats-relay", "start,end,filename");
     this.availableTables.put("userstats-bridge", "start,end,filename");
+    this.availableTables.put("userstats-censorship-events",
+        "start,end,filename");
     TableParameterChecker.getInstance().setAvailableTables(
         availableTables);
 
@@ -111,7 +114,7 @@ public class RObjectGenerator implements ServletContextListener {
     this.availableGraphs.put("fast-exits", "start,end,filename");
     this.availableGraphs.put("almost-fast-exits", "start,end,filename");
     this.availableGraphs.put("userstats-relay-country",
-        "start,end,country,filename");
+        "start,end,country,events,filename");
     this.availableGraphs.put("userstats-bridge-country",
         "start,end,country,filename");
     this.availableGraphs.put("userstats-bridge-transport",
diff --git a/web/WEB-INF/users.jsp b/web/WEB-INF/users.jsp
index 2a7bede..456aaab 100644
--- a/web/WEB-INF/users.jsp
+++ b/web/WEB-INF/users.jsp
@@ -208,6 +208,9 @@ Tor users (direct and bridge) per month by country.</p>
 daily Tor users (direct and bridge) per month by country.</p>
 <br>
 
+<hr>
+<hr>
+
 <a name="userstats"></a>
 <h3><a href="#userstats" class="anchor">New approach to estimating daily
 Tor users (BETA)</a></h3>
@@ -257,6 +260,14 @@ It's yet to be decided which approach is more correct.</font>
         </c:forEach>
       </select>
     </p><p>
+      Show possible censorship events if available (<a
+      href="http://research.torproject.org/techreports/detector-2011-09-09.pdf">BETA</a>)
+      <select name="events">
+        <option value="off">Off</option>
+        <option value="on"<c:if test="${userstats_relay_country_events[0] eq 'on'}"> selected</c:if>>On: both points and expected range</option>
+        <option value="points"<c:if test="${userstats_relay_country_events[0] eq 'points'}"> selected</c:if>>On: points only, no expected range</option>
+      </select>
+    </p><p>
     <input class="submit" type="submit" value="Update graph">
     </p>
   </div>
@@ -296,6 +307,40 @@ It's yet to be decided which approach is more correct.</font>
   </c:forEach>
 </table>
 <hr>
+<a name="userstats-censorship-events"></a>
+<p><b>Top-10 countries by possible censorship events (<a
+      href="http://research.torproject.org/techreports/detector-2011-09-09.pdf">BETA</a>):</b></p>
+<form action="users.html#userstats-censorship-events">
+  <div class="formrow">
+    <input type="hidden" name="table" value="userstats-censorship-events">
+    <p>
+    <label>Start date (yyyy-mm-dd):</label>
+      <input type="text" name="start" size="10"
+             value="<c:choose><c:when test="${fn:length(userstats_censorship_events_start) == 0}">${default_start_date}</c:when><c:otherwise>${userstats_censorship_events_start[0]}</c:otherwise></c:choose>">
+    <label>End date (yyyy-mm-dd):</label>
+      <input type="text" name="end" size="10"
+             value="<c:choose><c:when test="${fn:length(userstats_censorship_events_end) == 0}">${default_end_date}</c:when><c:otherwise>${userstats_censorship_events_end[0]}</c:otherwise></c:choose>">
+    </p><p>
+    <input class="submit" type="submit" value="Update table">
+    </p>
+  </div>
+</form>
+<br>
+<table>
+  <tr>
+    <th>Country</th>
+    <th>Downturns</th>
+    <th>Upturns</th>
+  </tr>
+  <c:forEach var="row" items="${userstats_censorship_events_tabledata}">
+    <tr>
+      <td><a href="users.html?graph=userstats-relay-country&country=${row['cc']}&events=on#userstats-relay-country">${row['country']}</a> </td>
+      <td>${row['downturns']}</td>
+      <td>${row['upturns']}</td>
+    </tr>
+  </c:forEach>
+</table>
+<hr>
 
 <a name="userstats-bridge-country"></a>
 <p><b>Bridge users by country (BETA):</b></p>



More information about the tor-commits mailing list