commit 73f278eb1ab44d6691dbbe349f608a1cbb72b42b Author: Karsten Loesing karsten.loesing@gmx.net Date: Thu Sep 15 20:49:40 2011 +0200
Add sources for bridge-blocking case study (#4030). --- task-4030/.gitignore | 7 + task-4030/DetectBridgeBlockings.java | 140 ++++++++++++++++++++ task-4030/README | 9 ++ task-4030/blocking.tex | 237 ++++++++++++++++++++++++++++++++++ task-4030/bridge-blockings.R | 60 +++++++++ 5 files changed, 453 insertions(+), 0 deletions(-)
diff --git a/task-4030/.gitignore b/task-4030/.gitignore new file mode 100644 index 0000000..94e7a87 --- /dev/null +++ b/task-4030/.gitignore @@ -0,0 +1,7 @@ +bridge-stats +*.csv +*.png +*.aux +*.log +*.pdf + diff --git a/task-4030/DetectBridgeBlockings.java b/task-4030/DetectBridgeBlockings.java new file mode 100644 index 0000000..ca9416a --- /dev/null +++ b/task-4030/DetectBridgeBlockings.java @@ -0,0 +1,140 @@ +import java.io.*; +import java.text.*; +import java.util.*; + +/** + * Processes previously extracted bridge usage statistics to detect + * possible bridge blockings in a given country. + * + * Before running this tool, make sure you download (sanitized) bridge + * descriptors from https://metrics.torproject.org/data.html#bridgedesc, + * extract them to a local directory, and run the following command: + * + * $ grep -Rl "^bridge-ips [a-z]" bridge-descriptors-* | \ + * xargs -I {} grep -E "^extra-info|^bridge" {} > bridge-stats + **/ + +public class DetectBridgeBlockings { + public static void main(String[] args) throws Exception { + + /* Run the analysis for the country with this country code. */ + final String COUNTRY = "cn"; + + /* Consider bridges with at most this many users as potentially + * blocked. */ + final int BLOCKED_THRESHOLD = 36; + + /* Consider bridges blocked that report no more than BLOCKED_THRESHOLD + * users for at least this number of days after having reported more + * than BLOCKED_THRESHOLD users at least once before. */ + final int BLOCKED_DAYS = 7; + + /* Only include bridges in the results that have reported at least + * this number of statistics. */ + final int MIN_REPORTS = 30; + + /* Begin of analysis interval (inclusive). */ + final String INTERVAL_BEGIN = "2010-01-01"; + + /* End of analysis interval (inclusive). */ + final String INTERVAL_END = "2010-07-31"; + + /* Check whether we have an input file. 
*/ + File inputFile = new File("bridge-stats"); + if (!inputFile.exists()) { + System.out.println("File " + inputFile + " not found. Please see " + + "the README."); + System.exit(1); + } + + /* Read the relevant bridge statistics parts into memory. */ + BufferedReader br = new BufferedReader(new FileReader(inputFile)); + String line, fingerprint = null, date = null; + SortedMap<String, SortedMap<String, Integer>> usersPerBridgeAndDay = + new TreeMap<String, SortedMap<String, Integer>>(); + while ((line = br.readLine()) != null) { + if (line.startsWith("extra-info ")) { + fingerprint = line.split(" ")[2]; + } else if (line.startsWith("bridge-stats-end ")) { + date = line.substring("bridge-stats-end ".length(), + "bridge-stats-end YYYY-MM-DD".length()); + } else if (line.startsWith("bridge-ips ")) { + if (date.compareTo(INTERVAL_BEGIN) < 0 || + date.compareTo(INTERVAL_END) > 0) { + continue; + } + int ipsFromCountry = 0; + for (String part : line.split(" ")[1].split(",")) { + String country = part.split("=")[0]; + if (country.equals(COUNTRY)) { + ipsFromCountry = Integer.parseInt(part.split("=")[1]); + break; + } + } + if (!usersPerBridgeAndDay.containsKey(fingerprint)) { + usersPerBridgeAndDay.put(fingerprint, + new TreeMap<String, Integer>()); + } + usersPerBridgeAndDay.get(fingerprint).put(date, ipsFromCountry); + } + } + br.close(); + + /* Write processed statistics for COUNTRY to disk including a column + * for suspected blockings. 
*/ + SimpleDateFormat dateFormat = new SimpleDateFormat( + "yyyy-MM-dd"); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + BufferedWriter bw = new BufferedWriter(new FileWriter( + "bridge-blockings.csv")); + bw.write("fingerprint,date,ips,blocked\n"); + for (Map.Entry<String, SortedMap<String, Integer>> e : + usersPerBridgeAndDay.entrySet()) { + fingerprint = e.getKey(); + SortedMap<String, Integer> usersPerDay = e.getValue(); + if (usersPerDay.size() < MIN_REPORTS) { + continue; + } + long lastDateMillis = 0L; + String lastExceededBlockedThreshold = null; + SortedSet<String> datesNotExceedingBlockedThreshold = + new TreeSet<String>(); + for (Map.Entry<String, Integer> e1 : usersPerDay.entrySet()) { + date = e1.getKey(); + long dateMillis = dateFormat.parse(date).getTime(); + while (lastDateMillis > 0L && + dateMillis > lastDateMillis + 24L * 60L * 60L * 1000L) { + lastDateMillis += 24L * 60L * 60L * 1000L; + bw.write(fingerprint + "," + dateFormat.format(lastDateMillis) + + ",NA,NA\n"); + } + lastDateMillis = dateMillis; + int ips = e1.getValue(); + String bwLinePart = fingerprint + "," + date + "," + ips; + if (ips > BLOCKED_THRESHOLD) { + String blocked = "FALSE"; + if (lastExceededBlockedThreshold != null && + dateFormat.parse(date).getTime() - + dateFormat.parse(lastExceededBlockedThreshold).getTime() + > BLOCKED_DAYS * 24L * 60L * 60L * 1000L) { + blocked = "TRUE"; + } + for (String buffered : datesNotExceedingBlockedThreshold) { + bw.write(buffered + "," + blocked + "\n"); + } + datesNotExceedingBlockedThreshold.clear(); + bw.write(bwLinePart + ",FALSE\n"); + lastExceededBlockedThreshold = date; + } else { + datesNotExceedingBlockedThreshold.add(bwLinePart); + } + } + for (String buffered : datesNotExceedingBlockedThreshold) { + bw.write(buffered + ",TRUE\n"); + } + datesNotExceedingBlockedThreshold.clear(); + } + bw.close(); + } +} + diff --git a/task-4030/README b/task-4030/README new file mode 100644 index 0000000..53ffb68 --- /dev/null +++ 
b/task-4030/README @@ -0,0 +1,9 @@ +$ grep -Rl "^bridge-ips [a-z]" bridge-descriptors-* | \ + xargs -I {} grep -E "^extra-info|^bridge" {} > bridge-stats + +$ javac DetectBridgeBlockings.java && java DetectBridgeBlockings + +$ R --slave -f bridge-blockings.R + +$ pdflatex blocking.tex + diff --git a/task-4030/blocking.tex b/task-4030/blocking.tex new file mode 100644 index 0000000..bb2ad90 --- /dev/null +++ b/task-4030/blocking.tex @@ -0,0 +1,237 @@ +\documentclass{article} +\usepackage[pdftex]{graphicx} +\usepackage{graphics} +\usepackage{color} +\usepackage{url} + +\begin{document} + +\author{Karsten Loesing\\{\tt karsten@torproject.org}} +\title{Case study:\\Learning whether a Tor bridge is blocked\\by looking +at its aggregate usage statistics\\-- Part one --} +\maketitle + +\section{Introduction} + +Tor bridges\footnote{\url{https://www.torproject.org/docs/bridges}} are +relays that are not listed in the main directory. +Clients which cannot access the Tor network directly can try to learn a +few bridge addresses and use these bridges to connect to the Tor network. +Bridges have been introduced to impede censoring the Tor network, but in +the past we experienced successful blocking of bridges in a few countries. + +In this report we investigate whether we can learn that a bridge is +blocked in a given country only by looking at its reported aggregate +statistics on usage by country. +By knowing that a bridge is blocked, we can, for example, avoid giving +out its address to users from that country. + +Learning whether a bridge is blocked is somewhat related to our recent +efforts to detect censorship of direct access to the Tor +network.\footnote{\url{https://metrics.torproject.org/papers/detector-2011-09-09.pdf}} +The main difference is that we want to know which bridges are blocked and +which are not, whereas we don't care which relays are accessible in the +case of blocked direct access. 
+It's easy to block all relays, but it should be difficult to block all +bridges. + +This report can only be seen as a first step towards researching bridge +blocking. +Even if a bridge reports that it had zero users from a country, we're +lacking the confirmation that the bridge was really blocked. +There can be other reasons for low user numbers which may be completely +unrelated. +The results of this analysis should be considered when actively scanning +bridge reachability from inside a country, both to decide how frequently a +bridge should be scanned and to evaluate how reliable an analysis of +passive usage statistics can be. + +\section{Bridge usage statistics} + +Bridges report aggregate usage statistics on the number of connecting +clients. +Bridges gather these statistics by memorizing unique IP addresses of +connecting clients over 24-hour periods and resolving IP addresses to +country codes using an internal GeoIP database. +Archives of these statistics are available for analysis from the metrics +website.\footnote{\url{https://metrics.torproject.org/data.html#bridgedesc}} +Figure~\ref{fig:bridgeextrainfo} shows an example of bridge usage +statistics. +This bridge observed 41 to 48 connecting clients from Saudi Arabia +(all numbers are rounded up to the next multiple of 8), 33 to 40 +connecting clients from the U.S.A., 25 to 32 from Germany, 25 to 32 from +Iran, and so on. +These connecting clients were observed in the 24~hours (86,400 seconds) +before December 27, 2010, 14:56:29 UTC. + +\begin{figure}[h] +\begin{quote} +\begin{verbatim} +extra-info Unnamed A5FA7F38B02A415E72FE614C64A1E5A92BA99BBD +published 2010-12-27 18:55:01 +[...] +bridge-stats-end 2010-12-27 14:56:29 (86400 s) +bridge-ips sa=48,us=40,de=32,ir=32,[...] 
+\end{verbatim} +\end{quote} +\caption{Example of aggregate bridge usage statistics} +\label{fig:bridgeextrainfo} +\end{figure} + +An obvious limitation of these bridge usage statistics is that we can only +learn about connecting clients from bridges with at least 24 hours uptime. +It's still unclear how many bridge users are not included in the +statistics because of this, which is left for a different analysis. + +We further decided to exclude bridges running Tor versions 0.2.2.3-alpha +or earlier. +These bridges report statistics similar to the later Tor versions that +we're considering here, but do not enforce a measurement interval of +exactly 24 hours, which would have slightly complicated the analysis. +We don't expect the bridge version to have an influence on bridge usage +or on the likelihood of the bridge to be blocked in a given country. + +\section{Case study: China in the first half of 2010} + +The major limitation of this analysis is that we don't have the data +confirming that a bridge was actually blocked. +We may decide on a case-by-case basis whether a blocking is a plausible +explanation for the change in observed users from a given country. +Anything more objective requires additional data, e.g., data obtained from +active reachability scans. + +We decided to investigate bridge usage from China in the first half of +2010 as a case study. +Figure~\ref{fig:bridge-users} shows estimated daily bridge users from China +since July 2009. +The huge slope in September and October 2009 is very likely a result of +China blocking direct access to the Tor network. +It seems plausible that the drops in March and May 2010 result from +attempts to block access to bridges, too. +We're going to focus only on the interval from January to June 2010 which +promises the most interesting results. +We should be able to detect these blockings in the reported statistics of +single bridges. 
+Obviously, it may be hard or impossible to transfer the findings from this +case study to other countries or situations. + +\begin{figure} +\includegraphics[width=\textwidth]{bridge-users.png} +\caption{Estimated daily bridge users from China} +\label{fig:bridge-users} +\end{figure} + +\paragraph{Definition of bridge blocking} + +We have a few options to define when we consider a bridge to be blocked +from a given country on a given day. + +\begin{itemize} +\item \textbf{Absolute threshold:} +The absolute number of connecting clients from a country falls below a +fixed threshold. +\item \textbf{Relative threshold compared to other countries:} +The fraction of connecting clients from a country drops below a fixed +percent value. +\item \textbf{Estimated interval based on history:} +The absolute or relative number of connecting clients falls outside an +estimated interval based on the recent history. +\end{itemize} + +For this case study we decided to stick with the simplest solution, +an absolute threshold. +We define a somewhat arbitrary threshold of 32 users to decide whether a +bridge is potentially blocked. +A blocked bridge does not necessarily report zero users per day. +A likely explanation for reporting users from a country that blocks a +bridge is that our GeoIP database is not 100~\% accurate and reports a few users +which in fact come from other countries. + +The reason against using a relative threshold was that it depends on +development in other countries. +As we can see in the example of China, bridge usage can depend on the +ability to directly access the Tor network. +A sudden increase in country $A$ could significantly lower the relative +usage in country $B$. +We should probably consider both absolute and relative thresholds in +future investigations. +Maybe we also need to take direct usage numbers into account. 
+ +We also didn't build our analysis upon an estimated interval based on the +recent history, because it's unclear how fast a bridge will be blocked +after being set up. +If it only takes the censor a few hours, the bridge may never see much use +from a country at all. +An estimate based on the bridge's history may not detect the censorship at +all, because it may look like a bridge with only few users from that +country. + +We plan to reconsider other options for deciding that a bridge is blocked +once we have data confirming this. + +\paragraph{Visualization of bridge blockings} + +Figure~\ref{fig:bridge-blockings} shows a subset of the raw bridge usage +statistics for clients connecting from China in the first half of 2010. +Possible blocking events are those when the bridge reports 32 or fewer +connecting clients per day. +These events are marked with red dots. + +We decided to only include bridges in the figure that report at least +100~Chinese clients on at least one day in the whole interval. +Bridges with fewer users than that have a usage pattern that makes it much +more difficult to detect blockings at all. +The figure also shows only bridges reporting statistics on at least 30 +days in the measurement interval. + +\begin{figure}[t] +\includegraphics[width=\textwidth]{bridge-blockings.png} +\caption{Subset of bridge usage statistics for Chinese clients in the +first half of 2010} +\label{fig:bridge-blockings} +\end{figure} + +The single bridge usage plots indicate how difficult it is to detect +blockings only from usage statistics. +About 10 of the displayed 27 plots have a pattern similar to the expected +pattern from Figure~\ref{fig:bridge-users}. +The best examples are probably bridges \verb+C037+ and \verb+D795+. +Interestingly, bridge \verb+A5FA+ was unaffected by the blocking in March +2010, but affected by the blocking in May 2010. 
+ +\paragraph{Aggregating blocking events} + +As the last step of this case study we want to compare observed bridge +users to the number of blocked bridges as detected by our simple threshold +approach. +We would expect most of our bridges to exhibit blockings in March 2010 and +from May 2010 on. +Figure~\ref{fig:bridge-users-blockings} plots users and blocked bridges +over time. +The two plots indicate that our detection algorithm is at least not +totally off. + +\begin{figure}[t] +\includegraphics[width=\textwidth]{bridge-users-blockings.png} +\caption{Estimated users and assumed bridge blockings in China in the +first half of 2010} +\label{fig:bridge-users-blockings} +\end{figure} + +\section{Conclusion} + +Passively collected bridge usage statistics seem to be a useful tool to +detect whether a bridge is blocked from a country. +However, the main conclusion from this analysis is that we're lacking the +data to conduct it usefully. +One way to obtain the data we need is to run active scans. +When conducting such scans, passively collected statistics may help reduce +the total number and frequency of scans. +For example, when selecting a bridge to scan, the reciprocal of the last +reported number of connecting clients could be used as a probability +weight. +Once we have better data confirming bridge blocking we shall revisit the +criteria for deriving the blocking from usage statistics. 
+ +\end{document} + diff --git a/task-4030/bridge-blockings.R b/task-4030/bridge-blockings.R new file mode 100644 index 0000000..da40186 --- /dev/null +++ b/task-4030/bridge-blockings.R @@ -0,0 +1,60 @@ +library(ggplot2) + +u <- read.csv("bridge-users.csv", stringsAsFactors = FALSE) +u <- u[u$date >= "2009-07-01" & u$date < "2011-08-01", c("date", "cn")] +ggplot(u, aes(x = as.Date(date), y = cn)) + +geom_line(size = 0.75) + +geom_rect(aes(NULL, NULL, xmin = as.Date("2010-01-01"), + xmax = as.Date("2010-06-30"), ymin = -Inf, ymax = Inf, fill = TRUE)) + +scale_fill_manual(name = "", breaks = TRUE, + values = alpha("purple", 0.2)) + +scale_x_date(name = "", major = "6 months", minor = "1 month", + format = "%b %Y") + +scale_y_continuous(name = "", limits = c(0, max(u$cn, na.rm = TRUE))) + +opts(legend.position = "none") +ggsave("bridge-users.png", width = 8, height = 4, dpi = 150) + +b <- read.csv("bridge-blockings.csv", stringsAsFactors = FALSE) +b <- b[b$date >= '2010-01-01' & b$date <= '2010-06-30', ] +fingerprints <- unique(b[b$ips >= 100, "fingerprint"]) +b <- b[b$fingerprint %in% fingerprints, ] + +d <- data.frame(date = b$date, blocked = ifelse(b$ips < 40, 1, NA)) +d <- na.omit(d) +d <- aggregate(d$blocked, by = list(date = d$date), sum) +e <- as.Date(setdiff(seq(from = as.Date("2010-01-01"), + to = as.Date("2010-06-30"), by = "1 day"), as.Date(d$date)), + origin = "1970-01-01") +u <- u[u$date >= '2010-01-01' & u$date <= '2010-06-30', ] +d <- rbind(data.frame(date = u$date, value = u$cn, variable = "Users"), + data.frame(date = d$date, value = d$x, variable = "Blocked Bridges"), + data.frame(date = as.character(e), value = 0, + variable = "Blocked Bridges")) +ggplot(d, aes(x = as.Date(date), y = value)) + +geom_line(size = 0.75) + +facet_grid(variable ~ ., scales = "free_y") + +scale_x_date(name = "", format = "%b %Y", major = "1 month", + minor = "months") + +scale_y_continuous(name = "") +ggsave("bridge-users-blockings.png", width = 8, height = 4, dpi = 150) 
+ +b <- data.frame(date = as.Date(b$date), ips = b$ips, + fingerprint = substr(b$fingerprint, 1, 4), + blocked = ifelse(b$blocked, "red", "black")) +bb <- b +bb[!is.na(b$ips) & b$ips >= 36, "ips"] <- NA +ggplot(b, aes(x = date, y = ips)) + +facet_wrap(~ fingerprint, ncol = 4) + +geom_line(size = 0.75) + +geom_point(size = 0.75) + +geom_hline(yintercept = 32, linetype = 2, size = 0.25) + +geom_point(data = bb, aes(x = date, y = ips), colour = "red", size = 3, + alpha = 0.25) + +scale_x_date(name = "", format = "%b", major = "2 months", + minor = "months") + +scale_y_continuous(name = "", breaks = c(0, 500, 1000)) + +scale_colour_manual(breaks = c("red", "black"), + values = c("red", "black")) + +opts(legend.position = "none") +ggsave("bridge-blockings.png", height = 9, width = 8) +
tor-commits@lists.torproject.org