commit a589b95501a926029a5e826d709e3f5e8c90ab34 Author: Karsten Loesing karsten.loesing@gmx.net Date: Tue Jul 19 12:02:47 2016 +0200
Add graph on bridges by country and transport.
Includes major improvements and fixes suggested by iwakeh: - Omit a mid estimate which could have been misleading to users. - Use explicit column aliases rather than relying on PostgreSQL's default column naming scheme. - Fix a major bug in ordering transports by user numbers that would favor transports with a large range between low and high bound over transports with a larger mean of low and high bound. - Improve documentation of low and high bound of users by country and transport.
Implements #19544. --- modules/clients/init-userstats.sql | 170 ++++++++++++++++++++- .../src/org/torproject/metrics/clients/Main.java | 1 + shared/bin/80-run-clients-stats.sh | 2 + shared/bin/99-copy-stats-files.sh | 1 + website/etc/metrics.json | 62 +++++++- website/etc/web.xml | 5 + website/rserve/graphs.R | 43 ++++++ .../metrics/web/research/ResearchStatsServlet.java | 1 + 8 files changed, 281 insertions(+), 4 deletions(-)
diff --git a/modules/clients/init-userstats.sql b/modules/clients/init-userstats.sql index a86cabf..09447f6 100644 --- a/modules/clients/init-userstats.sql +++ b/modules/clients/init-userstats.sql @@ -1,4 +1,4 @@ --- Copyright 2013 The Tor Project +-- Copyright 2013--2016 The Tor Project -- See LICENSE for licensing information
-- Use enum types for dimensions that may only change if we write new code @@ -145,6 +145,38 @@ CREATE TABLE aggregated ( nrh DOUBLE PRECISION NOT NULL DEFAULT 0 );
+-- After aggregating data into the aggregated table, combine responses +-- reported by bridges by country and by transport into low and high +-- estimates of responses by both country and transport at the same time. +-- Like in the aggregate step, only dates with new data in the imported +-- table will be recomputed in this table. +CREATE TABLE combined_country_transport ( + + -- The date of these aggregated and combined observations. + date DATE NOT NULL, + + -- The country and transport columns have the same meaning as in the + -- imported and aggregated tables, though these columns are always set + -- to non-empty strings. There is no node column, because that would + -- always be 'bridge', and there is no version column, because that + -- would always be ''. + country CHARACTER VARYING(2) NOT NULL DEFAULT '', + transport CHARACTER VARYING(20) NOT NULL DEFAULT '', + + -- Lower limit of responses by country and transport, calculated as: + -- max(0, country + transport - total). If the number of responses from + -- a given country and using a given transport exceeds the total number + -- of responses from all countries and transports, there must be + -- responses from that country *and* transport. And if that is not the + -- case, 0 is the lower limit. + low DOUBLE PRECISION NOT NULL DEFAULT 0, + + -- Upper limit of responses by country and transport, calculated as: + -- min(country, transport). There cannot be more responses by country + -- and transport than there are responses by either of the two numbers. + high DOUBLE PRECISION NOT NULL DEFAULT 0 +); + CREATE LANGUAGE plpgsql;
-- Merge new entries from the imported table into the merged table, and @@ -531,6 +563,75 @@ BEGIN END; $$ LANGUAGE plpgsql;
+-- Combine responses reported by bridges by country and by transport into +-- low and high estimates of responses by both country and transport at +-- the same time. This function combines responses for all dates that +-- have entries in the imported table. It first creates a temporary table +-- with new or updated responses, then removes all existing combined +-- response numbers for the dates to be updated, and finally inserts newly +-- combined response numbers for these dates. +CREATE OR REPLACE FUNCTION combine() RETURNS VOID AS $$ +BEGIN + RAISE NOTICE '% Starting combine step.', timeofday(); + + -- Create a new temporary table containing all relevant information + -- needed to update the combined_country_transport table. In this + -- table, we sum up all responses by reporting node. This query is + -- (temporarily) materialized, because we need to combine its entries + -- multiple times in various ways. A (non-materialized) view would have + -- meant to re-compute this query multiple times. + CREATE TEMPORARY TABLE update2 AS + SELECT fingerprint, country, transport, + DATE(stats_start) AS date, SUM(val) AS val + FROM merged + WHERE node = 'bridge' + AND metric = 'responses' + AND version = '' + -- Note: Comment out the following condition to initialize table! + AND DATE(stats_start) IN ( + SELECT DISTINCT DATE(stats_start) FROM imported) + GROUP BY fingerprint, country, transport, date; + + -- Delete all entries from the combined table that we're about to + -- re-compute. + DELETE FROM combined_country_transport + WHERE date IN (SELECT DISTINCT date FROM update2); + + -- Combine each country with each transport that a bridge reported + -- responses for and also consider total responses reported by the + -- bridge. Compute lower and upper bounds for responses by country and + -- transport. These response numbers will later be transformed into + -- user number estimates in the combined view. + INSERT INTO combined_country_transport + SELECT country.date AS date, country.country AS country, + transport.transport AS transport, + SUM(GREATEST(0, transport.val + country.val - total.val)) + AS low, + SUM(LEAST(transport.val, country.val)) AS high + FROM update2 country, + update2 transport, + update2 total + WHERE country.country <> '' + AND transport.transport <> '' + AND total.country = '' + AND total.transport = '' + AND country.date = transport.date + AND country.date = total.date + AND transport.date = total.date + AND country.val > 0 + AND transport.val > 0 + AND total.val > 0 + AND country.fingerprint = transport.fingerprint + AND country.fingerprint = total.fingerprint + AND transport.fingerprint = total.fingerprint + GROUP BY date, country, transport; + + -- We're done combining new data. + RAISE NOTICE '% Finishing combine step.', timeofday(); + RETURN; +END; +$$ LANGUAGE plpgsql; + -- User-friendly view on the aggregated table that implements the -- algorithm proposed in Tor Tech Report 2012-10-001. This view returns -- user number estimates for both relay and bridge staistics, possibly @@ -583,3 +684,70 @@ CREATE OR REPLACE VIEW estimated AS SELECT -- Order results. ORDER BY date DESC, node, version, transport, country;
+-- User-friendly view on the combined table joined with the aggregated +-- table. This view returns lower and upper bounds for user numbers by +-- country and transport. +CREATE OR REPLACE VIEW combined AS SELECT + + -- The date of this user number estimate. + a.date, + + -- The node type, which is always 'bridge', because relays don't report + -- responses by transport. + 'bridge'::TEXT AS node, + + -- The two-letter lower-case country code of this estimate; can be '??' + -- for an estimate of users that could not be resolved to any country. + a.country, + + -- The pluggable transport name of this estimate; can be '<OR>' for an + -- estimate of users that did not use any pluggable transport, '<??>' + -- for unknown pluggable transports. + a.transport, + + -- The IP address version of this estimate, which is always ''. + ''::TEXT as version, + + -- Estimated fraction of nodes reporting directory requests, which is + -- used to extrapolate observed requests to estimated total requests in + -- the network. The closer this fraction is to 1.0, the more precise + -- the estimation. + CAST(a.frac * 100 AS INTEGER) AS frac, + + -- Lower bound of users by country and transport, calculated as: + -- max(0, country + transport - total). If the number of users from a + -- given country and using a given transport exceeds the total number of + -- users from all countries and transports, there must be users from + -- that country *and* transport. And if that is not the case, 0 is the + -- lower limit. + CAST(a.low / (a.frac * 10) AS INTEGER) AS low, + + -- Upper limit of users by country and transport, calculated as: + -- min(country, transport). There cannot be more users by country and + -- transport than there are users by either of the two numbers. + CAST(a.high / (a.frac * 10) AS INTEGER) AS high + + -- Implement the table join and estimation method in a subquery, so that + -- the ugly formula only has to be written once. + FROM ( + SELECT aggregated.date, + (hrh * nh + hh * nrh) / (hh * nn) AS frac, + combined_country_transport.country, + combined_country_transport.transport, + combined_country_transport.low, + combined_country_transport.high + FROM aggregated RIGHT JOIN combined_country_transport + ON aggregated.date = combined_country_transport.date + AND aggregated.node = 'bridge' + WHERE hh * nn > 0.0 + AND aggregated.country = '' + AND aggregated.transport = '' + AND aggregated.version = '') a + + -- Only include estimates with at least 10% of nodes reporting directory + -- request statistics. + WHERE a.frac BETWEEN 0.1 AND 1.0 + + -- Order results. + ORDER BY date DESC; + diff --git a/modules/clients/src/org/torproject/metrics/clients/Main.java b/modules/clients/src/org/torproject/metrics/clients/Main.java index fcd18a3..63a3681 100644 --- a/modules/clients/src/org/torproject/metrics/clients/Main.java +++ b/modules/clients/src/org/torproject/metrics/clients/Main.java @@ -464,6 +464,7 @@ public class Main { bw.write("\.\n"); bw.write("SELECT merge();\n"); bw.write("SELECT aggregate();\n"); + bw.write("SELECT combine();\n"); bw.write("TRUNCATE imported;\n"); bw.write("COMMIT;\n"); bw.close(); diff --git a/shared/bin/80-run-clients-stats.sh b/shared/bin/80-run-clients-stats.sh index bdd55c1..b296c37 100755 --- a/shared/bin/80-run-clients-stats.sh +++ b/shared/bin/80-run-clients-stats.sh @@ -13,6 +13,7 @@ done
echo `date` "Exporting results." psql -c 'COPY (SELECT * FROM estimated) TO STDOUT WITH CSV HEADER;' userstats > userstats.csv +psql -c 'COPY (SELECT * FROM combined) TO STDOUT WITH CSV HEADER;' userstats > userstats-combined.csv
echo `date` "Running censorship detector." R --slave -f userstats-detector.R > /dev/null 2>&1 @@ -22,6 +23,7 @@ echo `date` "Merging censorship detector results." R --slave -f merge-clients.R > /dev/null 2>&1 mkdir -p stats/ cp clients.csv stats/ +cp userstats-combined.csv stats/
echo `date` "Terminating."
diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh index 23d46cf..5ced875 100755 --- a/shared/bin/99-copy-stats-files.sh +++ b/shared/bin/99-copy-stats-files.sh @@ -5,5 +5,6 @@ cp -a modules/connbidirect/stats/connbidirect2.csv shared/stats/ cp -a modules/advbwdist/stats/advbwdist.csv shared/stats/ cp -a modules/hidserv/stats/hidserv.csv shared/stats/ cp -a modules/clients/stats/clients.csv shared/stats/ +cp -a modules/clients/stats/userstats-combined.csv shared/stats/ cp -a modules/disagreement/stats/disagreement.csv shared/stats/
diff --git a/website/etc/metrics.json b/website/etc/metrics.json index 1aac409..ffa2ad2 100644 --- a/website/etc/metrics.json +++ b/website/etc/metrics.json @@ -354,6 +354,7 @@ "userstats-censorship-events", "userstats-bridge-country", "userstats-bridge-table", + "userstats-bridge-combined", "oxford-anonymous-internet" ] }, @@ -387,6 +388,7 @@ "userstats-censorship-events", "userstats-bridge-country", "userstats-bridge-table", + "userstats-bridge-combined", "oxford-anonymous-internet" ] }, @@ -444,7 +446,8 @@ "related": [ "userstats-relay-country", "userstats-relay-table", - "userstats-bridge-table" + "userstats-bridge-table", + "userstats-bridge-combined" ] }, { @@ -477,6 +480,7 @@ "userstats-relay-table", "userstats-bridge-country", "userstats-bridge-transport", + "userstats-bridge-combined", "userstats-bridge-version" ] }, @@ -502,10 +506,35 @@ "related": [ "userstats-bridge-country", "userstats-bridge-table", + "userstats-bridge-combined", "userstats-bridge-version" ] }, { + "id": "userstats-bridge-combined", + "title": "Bridge users by country and transport", + "tags": [ + "Clients", + "Pluggable transports" + ], + "type": "Graph", + "level": "Basic", + "description": "<p>The following graph shows the estimated number of <a href="about.html#client">clients</a> connecting via <a href="about.html#bridge">bridges</a>. These numbers are derived from directory requests counted on bridges. Bridges resolve client IP addresses of incoming directory requests to country codes, and they distinguish connecting clients by transport protocol, which may include <a href="about.html#pluggable-transport">pluggable transports</a>. Even though bridges don't report a combination of clients by country and transport, it's possible to derive and graph lower and upper bounds from existing usage statistics. For further details see these <a href="https://gitweb.torproject.org/metrics-web.git/tree/doc/users-q-and-a.txt%5C%... and answers about user statistics</a>.</p>", + "function": "plot_userstats_bridge_combined", + "parameters": [ + "start", + "end", + "country" + ], + "data": [ + "userstats-combined-data" + ], + "related": [ + "userstats-bridge-country", + "userstats-bridge-transport" + ] + }, + { "id": "userstats-bridge-version", "title": "Bridge users by IP version", "tags": [ @@ -526,7 +555,8 @@ "related": [ "userstats-bridge-country", "userstats-bridge-table", - "userstats-bridge-transport" + "userstats-bridge-transport", + "userstats-bridge-combined" ] }, { @@ -573,11 +603,37 @@ "userstats-bridge-country", "userstats-bridge-table", "userstats-bridge-transport", - "userstats-bridge-version", + "userstats-combined-data", "oxford-anonymous-internet" ] }, { + "id": "userstats-combined-data", + "title": "Estimated number of clients by country and transport", + "tags": [ + "Clients", + "Pluggable transports" + ], + "type": "Data", + "level": "Advanced", + "description": "<p>The following data file contains additional statistics on the number of <a href="about.html#client">clients</a> in the network. This data file is related to the <a href="clients-data.html">clients-data file</a> that contains estimates on the number of clients by country and by transport protocol. This data file enhances these statistics by containing estimates of clients connecting to <a href="about.html#bridge">bridges</a> by a given country and using a given <a href="about.html#pluggable-transport">transport protocol</a>. Even though bridges don't report a combination of clients by country and transport, it's possible to derive lower and upper bounds from existing usage statistics.</p>", + "data_file": "stats/userstats-combined.csv", + "data_column_spec": [ + "<b>date:</b> UTC date (YYYY-MM-DD) for which client numbers are estimated.", + "<b>node:</b> The node type to which clients connect first, which is always <b>"bridge"</b>, because relays don't report responses by transport.", + "<b>country:</b> Two-letter lower-case country code as found in a GeoIP database by resolving clients' IP addresses, or <b>"??"</b> if client IP addresses could not be resolved.", + "<b>transport:</b> Transport name used by clients to connect to the Tor network using bridges. Examples are <b>"obfs2"</b>, <b>"obfs3"</b>, <b>"websocket"</b>, or <b>"<OR>"</b> (original onion routing protocol).", + "<b>version:</b> IP version used by clients to connect to the Tor network using bridges. This column always contains the empty string and is only included for compatibility reasons.", + "<b>frac:</b> Fraction of relays or bridges in percent that the estimate is based on. The higher this value, the more reliable is the estimate. Values above 50 can be considered reliable enough for most purposes, lower values should be handled with more care.", + "<b>low:</b> Lower bound of users by country and transport, calculated as sum over all bridges having reports for the given country and transport, that is, the sum of <b>M(b)</b>, where for each bridge <b>b</b> define <b>M(b) := max(0, C(b) + T(b) - S(b))</b> using the following definitions: <b>C(b)</b> is the number of users from a given country reported by <b>b</b>; <b>T(b)</b> is the number of users using a given transport reported by <b>b</b>; and <b>S(b)</b> is the total numbers of users reported by <b>b</b>. Reasoning: If the sum <b>C(b) + T(b)</b> exceeds the total number of users from all countries and transports <b>S(b)</b>, there must be users from that country and transport. And if that is not the case, <b>0</b> is the lower limit.", + "<b>high:</b> Upper bound of users by country and transport, calculated as sum over all bridges having reports for the given country and transport, that is, the sum of <b>m(b)</b>, where for each bridge <b>b</b> define <b>m(b):=min(C(b), T(b))</b> where we use the definitions from <b>low</b> (above). Reasoning: there cannot be more users by country and transport than there are users by either of the two numbers." + ], + "related": [ + "userstats-bridge-combined", + "clients-data" + ] + }, + { "id": "torperf", "title": "Time to download files over Tor", "tags": [ diff --git a/website/etc/web.xml b/website/etc/web.xml index 809cc41..b1b6511 100644 --- a/website/etc/web.xml +++ b/website/etc/web.xml @@ -38,6 +38,7 @@ <url-pattern>/userstats-relay-country.html</url-pattern> <url-pattern>/userstats-bridge-country.html</url-pattern> <url-pattern>/userstats-bridge-transport.html</url-pattern> + <url-pattern>/userstats-bridge-combined.html</url-pattern> <url-pattern>/userstats-bridge-version.html</url-pattern> <url-pattern>/torperf.html</url-pattern> <url-pattern>/torperf-failures.html</url-pattern> @@ -72,6 +73,7 @@ <url-pattern>/bandwidth-data.html</url-pattern> <url-pattern>/advbwdist-data.html</url-pattern> <url-pattern>/clients-data.html</url-pattern> + <url-pattern>/userstats-combined-data.html</url-pattern> <url-pattern>/torperf-data.html</url-pattern> <url-pattern>/connbidirect-data.html</url-pattern> <url-pattern>/connbidirect2-data.html</url-pattern> @@ -172,6 +174,9 @@ <url-pattern>/userstats-bridge-transport.png</url-pattern> <url-pattern>/userstats-bridge-transport.pdf</url-pattern> <url-pattern>/userstats-bridge-transport.svg</url-pattern> + <url-pattern>/userstats-bridge-combined.png</url-pattern> + <url-pattern>/userstats-bridge-combined.pdf</url-pattern> + <url-pattern>/userstats-bridge-combined.svg</url-pattern> <url-pattern>/userstats-bridge-version.png</url-pattern> <url-pattern>/userstats-bridge-version.pdf</url-pattern> <url-pattern>/userstats-bridge-version.svg</url-pattern> diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R index 9d1db7b..ca25bf7 100644 --- a/website/rserve/graphs.R +++ b/website/rserve/graphs.R @@ -920,6 +920,49 @@ plot_userstats_bridge_version <- function(start, end, version, path) { plot_userstats(start, end, 'bridge', 'version', version, 'off', path) }
+plot_userstats_bridge_combined <- function(start, end, country, path) { + if (country == "all") { + plot_userstats_bridge_country(start, end, country, path) + } else { + top <- 3 + country <- ifelse(country == "all", NA, country) + end <- min(end, as.character(Sys.Date() - 2)) + u <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/", + "stats/userstats-combined.csv", sep = ""), + stringsAsFactors = FALSE) + u <- u[u$date >= start & u$date <= end + & (is.na(country) | u$country == country), ] + a <- aggregate(list(mid = (u$high + u$low) / 2), + by = list(transport = u$transport), FUN = sum) + a <- a[order(a$mid, decreasing = TRUE)[1:top], ] + u <- u[u$transport %in% a$transport, ] + formatter <- function(x, ...) { + format(x, ..., scientific = FALSE, big.mark = '\u2006') } + max_y <- ifelse(length(na.omit(u$high)) == 0, 0, + max(u$high, na.rm = TRUE)) + title <- paste("Bridge users by transport from ", + countryname(country), sep = "") + date_breaks <- date_breaks( + as.numeric(max(as.Date(u$date, "%Y-%m-%d")) - + min(as.Date(u$date, "%Y-%m-%d")))) + ggplot(u, aes(x = as.Date(date), ymin = low, ymax = high, + colour = transport, fill = transport)) + + geom_ribbon(alpha = 0.5, size = 0.5) + + scale_x_date(name = paste("\nThe Tor Project - ", + "https://metrics.torproject.org/", sep = ""), + labels = date_format(date_breaks$format), + breaks = date_breaks$major, + minor_breaks = date_breaks$minor) + + scale_y_continuous(name = "", limits = c(0, max_y), + labels = formatter) + + scale_colour_hue(paste("Top-", top, " transports", sep = "")) + + scale_fill_hue(paste("Top-", top, " transports", sep = "")) + + ggtitle(title) + + theme(legend.position = "top") + ggsave(filename = path, width = 8, height = 5, dpi = 72) + } +} + plot_advbwdist_perc <- function(start, end, p, path) { end <- min(end, as.character(Sys.Date() - 2)) t <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/stats/", diff --git a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java index a4194f2..f1d0ad4 100644 --- a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java +++ b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java @@ -31,6 +31,7 @@ public class ResearchStatsServlet extends HttpServlet { this.availableStatisticsFiles.add("servers"); this.availableStatisticsFiles.add("bandwidth"); this.availableStatisticsFiles.add("clients"); + this.availableStatisticsFiles.add("userstats-combined"); this.availableStatisticsFiles.add("torperf"); this.availableStatisticsFiles.add("connbidirect"); this.availableStatisticsFiles.add("connbidirect2");
tor-commits@lists.torproject.org