[tor-commits] [metrics-tasks/master] Add sources for bridge-blocking case study (#4030).

karsten at torproject.org karsten at torproject.org
Thu Sep 15 18:57:04 UTC 2011


commit 73f278eb1ab44d6691dbbe349f608a1cbb72b42b
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Sep 15 20:49:40 2011 +0200

    Add sources for bridge-blocking case study (#4030).
---
 task-4030/.gitignore                 |    7 +
 task-4030/DetectBridgeBlockings.java |  140 ++++++++++++++++++++
 task-4030/README                     |    9 ++
 task-4030/blocking.tex               |  237 ++++++++++++++++++++++++++++++++++
 task-4030/bridge-blockings.R         |   60 +++++++++
 5 files changed, 453 insertions(+), 0 deletions(-)

diff --git a/task-4030/.gitignore b/task-4030/.gitignore
new file mode 100644
index 0000000..94e7a87
--- /dev/null
+++ b/task-4030/.gitignore
@@ -0,0 +1,7 @@
+bridge-stats
+*.csv
+*.png
+*.aux
+*.log
+*.pdf
+
diff --git a/task-4030/DetectBridgeBlockings.java b/task-4030/DetectBridgeBlockings.java
new file mode 100644
index 0000000..ca9416a
--- /dev/null
+++ b/task-4030/DetectBridgeBlockings.java
@@ -0,0 +1,140 @@
+import java.io.*;
+import java.text.*;
+import java.util.*;
+
+/**
+ * Processes previously extracted bridge usage statistics to detect
+ * possible bridge blockings in a given country.
+ *
+ * Before running this tool, make sure you download (sanitized) bridge
+ * descriptors from https://metrics.torproject.org/data.html#bridgedesc,
+ * extract them to a local directory, and run the following command:
+ *
+ * $ grep -Rl "^bridge-ips [a-z]" bridge-descriptors-* | \
+ *   xargs -I {} grep -E "^extra-info|^bridge" {} > bridge-stats
+ **/
+
+public class DetectBridgeBlockings {
+  public static void main(String[] args) throws Exception {
+
+    /* Run the analysis for the country with this country code. */
+    final String COUNTRY = "cn";
+
+    /* Consider bridges with at most this many users as potentially
+     * blocked. */
+    final int BLOCKED_THRESHOLD = 36;
+
+    /* Flag reports of at most BLOCKED_THRESHOLD users as blocked if they
+     * lie between two reports exceeding BLOCKED_THRESHOLD that are more
+     * than this number of days apart. */
+    final int BLOCKED_DAYS = 7;
+
+    /* Only include bridges in the results that have reported at least
+     * this number of statistics. */
+    final int MIN_REPORTS = 30;
+
+    /* Start of the analysis interval (inclusive). */
+    final String INTERVAL_BEGIN = "2010-01-01";
+    
+    /* End of the analysis interval (inclusive). */
+    final String INTERVAL_END = "2010-07-31";
+
+    /* Check whether we have an input file. */
+    File inputFile = new File("bridge-stats");
+    if (!inputFile.exists()) {
+      System.out.println("File " + inputFile + " not found.  Please see "
+          + "the README.");
+      System.exit(1);
+    }
+
+    /* Read per-bridge, per-day user counts for COUNTRY into memory. */
+    BufferedReader br = new BufferedReader(new FileReader(inputFile));
+    String line, fingerprint = null, date = null;
+    SortedMap<String, SortedMap<String, Integer>> usersPerBridgeAndDay =
+        new TreeMap<String, SortedMap<String, Integer>>();
+    while ((line = br.readLine()) != null) {
+      if (line.startsWith("extra-info ")) {
+        fingerprint = line.split(" ")[2]; /* third field */
+      } else if (line.startsWith("bridge-stats-end ")) {
+        date = line.substring("bridge-stats-end ".length(),
+            "bridge-stats-end YYYY-MM-DD".length()); /* date part */
+      } else if (line.startsWith("bridge-ips ")) {
+        if (date.compareTo(INTERVAL_BEGIN) < 0 ||
+            date.compareTo(INTERVAL_END) > 0) {
+          continue; /* outside the analysis interval */
+        }
+        int ipsFromCountry = 0; /* stays 0 if COUNTRY is absent */
+        for (String part : line.split(" ")[1].split(",")) {
+          String country = part.split("=")[0];
+          if (country.equals(COUNTRY)) {
+            ipsFromCountry = Integer.parseInt(part.split("=")[1]);
+            break;
+          }
+        }
+        if (!usersPerBridgeAndDay.containsKey(fingerprint)) {
+          usersPerBridgeAndDay.put(fingerprint,
+              new TreeMap<String, Integer>());
+        }
+        usersPerBridgeAndDay.get(fingerprint).put(date, ipsFromCountry);
+      }
+    }
+    br.close();
+
+    /* Write processed statistics for COUNTRY to bridge-blockings.csv,
+     * including a column for suspected blockings. */
+    SimpleDateFormat dateFormat = new SimpleDateFormat(
+        "yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    BufferedWriter bw = new BufferedWriter(new FileWriter(
+        "bridge-blockings.csv"));
+    bw.write("fingerprint,date,ips,blocked\n");
+    for (Map.Entry<String, SortedMap<String, Integer>> e :
+        usersPerBridgeAndDay.entrySet()) {
+      fingerprint = e.getKey();
+      SortedMap<String, Integer> usersPerDay = e.getValue();
+      if (usersPerDay.size() < MIN_REPORTS) {
+        continue; /* too few reports for this bridge */
+      }
+      long lastDateMillis = 0L; /* 0 until the first report */
+      String lastExceededBlockedThreshold = null;
+      SortedSet<String> datesNotExceedingBlockedThreshold =
+          new TreeSet<String>();
+      for (Map.Entry<String, Integer> e1 : usersPerDay.entrySet()) {
+        date = e1.getKey();
+        long dateMillis = dateFormat.parse(date).getTime();
+        while (lastDateMillis > 0L &&
+            dateMillis > lastDateMillis + 24L * 60L * 60L * 1000L) {
+          lastDateMillis += 24L * 60L * 60L * 1000L;
+          bw.write(fingerprint + "," + dateFormat.format(lastDateMillis)
+              + ",NA,NA\n"); /* day without report */
+        }
+        lastDateMillis = dateMillis;
+        int ips = e1.getValue();
+        String bwLinePart = fingerprint + "," + date + "," + ips;
+        if (ips > BLOCKED_THRESHOLD) {
+          String blocked = "FALSE";
+          if (lastExceededBlockedThreshold != null &&
+              dateFormat.parse(date).getTime() -
+              dateFormat.parse(lastExceededBlockedThreshold).getTime()
+              > BLOCKED_DAYS * 24L * 60L * 60L * 1000L) {
+            blocked = "TRUE";
+          }
+          for (String buffered : datesNotExceedingBlockedThreshold) {
+            bw.write(buffered + "," + blocked + "\n"); /* flush run */
+          }
+          datesNotExceedingBlockedThreshold.clear();
+          bw.write(bwLinePart + ",FALSE\n");
+          lastExceededBlockedThreshold = date;
+        } else {
+          datesNotExceedingBlockedThreshold.add(bwLinePart); /* defer */
+        }
+      }
+      for (String buffered : datesNotExceedingBlockedThreshold) {
+        bw.write(buffered + ",TRUE\n"); /* trailing run: always TRUE */
+      }
+      datesNotExceedingBlockedThreshold.clear();
+    }
+    bw.close();
+  }
+}
+
diff --git a/task-4030/README b/task-4030/README
new file mode 100644
index 0000000..53ffb68
--- /dev/null
+++ b/task-4030/README
@@ -0,0 +1,9 @@
+$ grep -Rl "^bridge-ips [a-z]" bridge-descriptors-* | \
+  xargs -I {} grep -E "^extra-info|^bridge" {} > bridge-stats
+
+$ javac DetectBridgeBlockings.java && java DetectBridgeBlockings
+
+$ R --slave -f bridge-blockings.R
+
+$ pdflatex blocking.tex
+
diff --git a/task-4030/blocking.tex b/task-4030/blocking.tex
new file mode 100644
index 0000000..bb2ad90
--- /dev/null
+++ b/task-4030/blocking.tex
@@ -0,0 +1,237 @@
+\documentclass{article}
+\usepackage[pdftex]{graphicx}
+\usepackage{graphics}
+\usepackage{color}
+\usepackage{url}
+
+\begin{document}
+
+\author{Karsten Loesing\\{\tt karsten at torproject.org}}
+\title{Case study:\\Learning whether a Tor bridge is blocked\\by looking
+at its aggregate usage statistics\\-- Part one --}
+\maketitle
+
+\section{Introduction}
+
+Tor bridges\footnote{\url{https://www.torproject.org/docs/bridges}} are
+relays that are not listed in the main directory.
+Clients which cannot access the Tor network directly can try to learn a
+few bridge addresses and use these bridges to connect to the Tor network.
+Bridges have been introduced to impede censoring the Tor network, but in
+the past we experienced successful blocking of bridges in a few countries.
+
+In this report we investigate whether we can learn that a bridge is
+blocked in a given country only by looking at its reported aggregate
+statistics on usage by country.
+By knowing that a bridge is blocked, we can, for example, avoid giving
+out its address to users from that country.
+
+Learning whether a bridge is blocked is somewhat related to our recent
+efforts to detect censorship of direct access to the Tor
+network.\footnote{\url{https://metrics.torproject.org/papers/detector-2011-09-09.pdf}}
+The main difference is that we want to know which bridges are blocked and
+which are not, whereas we don't care which relays are accessible in the
+case of blocked direct access.
+It's easy to block all relays, but it should be difficult to block all
+bridges.
+
+This report can only be seen as a first step towards researching bridge
+blocking.
+Even if a bridge reports that it had zero users from a country, we're
+lacking the confirmation that the bridge was really blocked.
+There can be other reasons for low user numbers which may be completely
+unrelated.
+The results of this analysis should be considered when actively scanning
+bridge reachability from inside a country, both to decide how frequently a
+bridge should be scanned and to evaluate how reliable an analysis of
+passive usage statistics can be.
+
+\section{Bridge usage statistics}
+
+Bridges report aggregate usage statistics on the number of connecting
+clients.
+Bridges gather these statistics by memorizing unique IP addresses of
+connecting clients over 24 hour periods and resolving IP addresses to
+country codes using an internal GeoIP database.
+Archives of these statistics are available for analysis from the metrics
+website.\footnote{\url{https://metrics.torproject.org/data.html#bridgedesc}}
+Figure~\ref{fig:bridgeextrainfo} shows an example of bridge usage
+statistics.
+This bridge observed 41 to 48 connecting clients from Saudi Arabia
+(all numbers are rounded up to the next multiple of 8), 33 to 40
+connecting clients from the U.S.A., 25 to 32 from Germany, 25 to 32 from
+Iran, and so on.
+These connecting clients were observed in the 24~hours (86,400 seconds)
+before December 27, 2010, 14:56:29 UTC.
+
+\begin{figure}[h]
+\begin{quote}
+\begin{verbatim}
+extra-info Unnamed A5FA7F38B02A415E72FE614C64A1E5A92BA99BBD
+published 2010-12-27 18:55:01
+[...]
+bridge-stats-end 2010-12-27 14:56:29 (86400 s)
+bridge-ips sa=48,us=40,de=32,ir=32,[...]
+\end{verbatim}
+\end{quote}
+\caption{Example of aggregate bridge usage statistics}
+\label{fig:bridgeextrainfo}
+\end{figure}
+
+An obvious limitation of these bridge usage statistics is that we can only
+learn about connecting clients from bridges with at least 24 hours uptime.
+It's still unclear how many bridge users are not included in the
+statistics because of this, which is left for a different analysis.
+
+We further decided to exclude bridges running Tor versions 0.2.2.3-alpha
+or earlier.
+These bridges report statistics similar to those of the later Tor versions
+that we're considering here, but do not enforce a measurement interval of
+exactly 24 hours, which would have slightly complicated the analysis.
+We don't expect the bridge version to have an influence on bridge usage
+or on the likelihood of the bridge to be blocked in a given country.
+
+\section{Case study: China in the first half of 2010}
+
+The major limitation of this analysis is that we don't have the data
+confirming that a bridge was actually blocked.
+We may decide on a case-by-case basis whether a blocking is a plausible
+explanation for the change in observed users from a given country.
+Anything more objective requires additional data, e.g., data obtained from
+active reachability scans.
+
+We decided to investigate bridge usage from China in the first half of
+2010 as a case study.
+Figure~\ref{fig:bridge-users} shows estimated daily bridge users from China
+since July 2009.
+The huge slope in September and October 2009 is very likely a result of
+China blocking direct access to the Tor network.
+It seems plausible that the drops in March and May 2010 result from
+attempts to block access to bridges, too.
+We're going to focus only on the interval from January to June 2010 which
+promises the most interesting results.
+We should be able to detect these blockings in the reported statistics of
+single bridges.
+Obviously, it may be hard or impossible to transfer the findings from this
+case study to other countries or situations.
+
+\begin{figure}
+\includegraphics[width=\textwidth]{bridge-users.png}
+\caption{Estimated daily bridge users from China}
+\label{fig:bridge-users}
+\end{figure}
+
+\paragraph{Definition of bridge blocking}
+
+We have a few options to define when we consider a bridge to be blocked
+from a given country on a given day.
+
+\begin{itemize}
+\item \textbf{Absolute threshold:}
+The absolute number of connecting clients from a country falls below a
+fixed threshold.
+\item \textbf{Relative threshold compared to other countries:}
+The fraction of connecting clients from a country drops below a fixed
+percent value.
+\item \textbf{Estimated interval based on history:}
+The absolute or relative number of connecting clients falls outside an
+estimated interval based on the recent history.
+\end{itemize}
+
+For this case study we decided to stick with the simplest solution, an
+absolute threshold.
+We define a somewhat arbitrary threshold of 32 users to decide whether a
+bridge is potentially blocked.
+A blocked bridge does not necessarily report zero users per day.
+A likely explanation for reporting users from a country that blocks a
+bridge is that our GeoIP is not 100~\% accurate and reports a few users
+which in fact come from other countries.
+
+The reason against using a relative threshold was that it depends on
+development in other countries.
+As we can see in the example of China, bridge usage can depend on the
+ability to directly access the Tor network.
+A sudden increase in country $A$ could significantly lower the relative
+usage in country $B$.
+We should probably consider both absolute and relative thresholds in
+future investigations.
+Maybe we also need to take direct usage numbers into account.
+
+We also didn't build our analysis upon an estimated interval based on the
+recent history, because it's unclear how fast a bridge will be blocked
+after being set up.
+If it only takes the censor a few hours, the bridge may never see much use
+from a country at all.
+An estimate based on the bridge's history may not detect the censorship at
+all, because it may look like a bridge with only few users from that
+country.
+
+We plan to reconsider other options for deciding that a bridge is blocked
+once we have data confirming this.
+
+\paragraph{Visualization of bridge blockings}
+
+Figure~\ref{fig:bridge-blockings} shows a subset of the raw bridge usage
+statistics for clients connecting from China in the first half of 2010.
+Possible blocking events are those when the bridge reports 32 or fewer
+connecting clients per day.
+These events are marked with red dots.
+
+We decided to only include bridges in the figure that report at least
+100~Chinese clients on at least one day in the whole interval.
+Bridges with fewer users than that have a usage pattern that makes it much
+more difficult to detect blockings at all.
+The figure also shows only bridges reporting statistics on at least 30
+days in the measurement interval.
+
+\begin{figure}[t]
+\includegraphics[width=\textwidth]{bridge-blockings.png}
+\caption{Subset of bridge usage statistics for Chinese clients in the
+first half of 2010}
+\label{fig:bridge-blockings}
+\end{figure}
+
+The single bridge usage plots indicate how difficult it is to detect
+blockings only from usage statistics.
+About 10 of the displayed 27 plots have a pattern similar to the expected
+pattern from Figure~\ref{fig:bridge-users}.
+The best examples are probably bridges \verb+C037+ and \verb+D795+.
+Interestingly, bridge \verb+A5FA+ was unaffected by the blocking in March
+2010, but affected by the blocking in May 2010.
+
+\paragraph{Aggregating blocking events}
+
+As the last step of this case study we want to compare observed bridge
+users to the number of blocked bridges as detected by our simple threshold
+approach.
+We would expect most of our bridges to exhibit blockings in March 2010 and
+from May 2010 on.
+Figure~\ref{fig:bridge-users-blockings} plots users and blocked bridges
+over time.
+The two plots indicate that our detection algorithm is at least not
+totally off.
+
+\begin{figure}[t]
+\includegraphics[width=\textwidth]{bridge-users-blockings.png}
+\caption{Estimated users and assumed bridge blockings in China in the
+first half of 2010}
+\label{fig:bridge-users-blockings}
+\end{figure}
+
+\section{Conclusion}
+
+Passively collected bridge usage statistics seem to be a useful tool to
+detect whether a bridge is blocked from a country.
+However, the main conclusion from this analysis is that we're lacking the
+data to conduct it usefully.
+One way to obtain the data we need is to run active scans.
+When conducting such scans, passively collected statistics may help reduce
+the total number and frequency of scans.
+For example, when selecting a bridge to scan, the reciprocal of the last
+reported number of connecting clients could be used as a probability
+weight.
+Once we have better data confirming bridge blocking we shall revisit the
+criteria for deriving the blocking from usage statistics.
+
+\end{document}
+
diff --git a/task-4030/bridge-blockings.R b/task-4030/bridge-blockings.R
new file mode 100644
index 0000000..da40186
--- /dev/null
+++ b/task-4030/bridge-blockings.R
@@ -0,0 +1,60 @@
+library(ggplot2)
+
+u <- read.csv("bridge-users.csv", stringsAsFactors = FALSE)
+u <- u[u$date >= "2009-07-01" & u$date < "2011-08-01", c("date", "cn")]
+ggplot(u, aes(x = as.Date(date), y = cn)) +
+geom_line(size = 0.75) +
+geom_rect(aes(NULL, NULL, xmin = as.Date("2010-01-01"),
+    xmax = as.Date("2010-06-30"), ymin = -Inf, ymax = Inf, fill = TRUE)) +
+scale_fill_manual(name = "", breaks = TRUE,
+    values = alpha("purple", 0.2)) +  # shade Jan-Jun 2010
+scale_x_date(name = "", major = "6 months", minor = "1 month",
+    format = "%b %Y") +
+scale_y_continuous(name = "", limits = c(0, max(u$cn, na.rm = TRUE))) +
+opts(legend.position = "none")
+ggsave("bridge-users.png", width = 8, height = 4, dpi = 150)
+
+b <- read.csv("bridge-blockings.csv", stringsAsFactors = FALSE)
+b <- b[b$date >= '2010-01-01' & b$date <= '2010-06-30', ]
+fingerprints <- unique(b[b$ips >= 100, "fingerprint"])  # >= 100 ips once
+b <- b[b$fingerprint %in% fingerprints, ]
+
+d <- data.frame(date = b$date, blocked = ifelse(b$ips < 40, 1, NA))
+d <- na.omit(d)  # drops reports with ips >= 40 or NA
+d <- aggregate(d$blocked, by = list(date = d$date), sum)  # count per date
+e <- as.Date(setdiff(seq(from = as.Date("2010-01-01"),
+    to = as.Date("2010-06-30"), by = "1 day"), as.Date(d$date)),
+    origin = "1970-01-01")  # dates with no such report
+u <- u[u$date >= '2010-01-01' & u$date <= '2010-06-30', ]
+d <- rbind(data.frame(date = u$date, value = u$cn, variable = "Users"),
+    data.frame(date = d$date, value = d$x, variable = "Blocked Bridges"),
+    data.frame(date = as.character(e), value = 0,
+    variable = "Blocked Bridges"))
+ggplot(d, aes(x = as.Date(date), y = value)) +
+geom_line(size = 0.75) +
+facet_grid(variable ~ ., scales = "free_y") +
+scale_x_date(name = "", format = "%b %Y", major = "1 month",
+    minor = "months") +
+scale_y_continuous(name = "")
+ggsave("bridge-users-blockings.png", width = 8, height = 4, dpi = 150)
+
+b <- data.frame(date = as.Date(b$date), ips = b$ips,
+    fingerprint = substr(b$fingerprint, 1, 4),
+    blocked = ifelse(b$blocked, "red", "black"))
+bb <- b
+bb[!is.na(b$ips) & b$ips >= 36, "ips"] <- NA  # red dots: ips < 36 only
+ggplot(b, aes(x = date, y = ips)) +
+facet_wrap(~ fingerprint, ncol = 4) +
+geom_line(size = 0.75) +
+geom_point(size = 0.75) +
+geom_hline(yintercept = 32, linetype = 2, size = 0.25) +  # dashed line
+geom_point(data = bb, aes(x = date, y = ips), colour = "red", size = 3,
+    alpha = 0.25) +
+scale_x_date(name = "", format = "%b", major = "2 months",
+    minor = "months") +
+scale_y_continuous(name = "", breaks = c(0, 500, 1000)) +
+scale_colour_manual(breaks = c("red", "black"),
+    values = c("red", "black")) +
+opts(legend.position = "none")
+ggsave("bridge-blockings.png", height = 9, width = 8)
+



More information about the tor-commits mailing list