tor-commits
[metrics-web/release] Stop hard-coding versions.
by karsten@torproject.org 09 Nov '19
commit c8a3414347c8df7aed3e63be4c704f1dd43aded4
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Jan 7 09:59:18 2019 +0100
Stop hard-coding versions.
---
src/main/R/rserver/graphs.R | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 03b5b93..d3ea90a 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -381,18 +381,18 @@ write_networksize <- function(start_p = NULL, end_p = NULL, path_p) {
}
prepare_versions <- function(start_p, end_p) {
- read.csv(paste(stats_dir, "versions.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(paste(stats_dir, "versions.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ version = col_character(),
+ relays = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
}
plot_versions <- function(start_p, end_p, path_p) {
s <- prepare_versions(start_p, end_p)
- known_versions <- c("Other", "0.1.0", "0.1.1", "0.1.2", "0.2.0",
- "0.2.1", "0.2.2", "0.2.3", "0.2.4", "0.2.5", "0.2.6", "0.2.7",
- "0.2.8", "0.2.9", "0.3.0", "0.3.1", "0.3.2", "0.3.3", "0.3.4",
- "0.3.5")
+ known_versions <- unique(s$version)
getPalette <- colorRampPalette(brewer.pal(12, "Paired"))
colours <- data.frame(breaks = known_versions,
values = rep(brewer.pal(min(12, length(known_versions)), "Paired"),
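
For context, a minimal sketch of the pattern this commit adopts: readr's read_csv() with explicit column types instead of base read.csv(), and version breaks derived from the data rather than hard-coded (file and column names follow the patch; stats_dir is defined elsewhere in graphs.R):
library(readr)

# Declare column types up front instead of relying on type guessing.
versions <- read_csv(paste(stats_dir, "versions.csv", sep = ""),
  col_types = cols(
    date = col_date(format = ""),
    version = col_character(),
    relays = col_double()))

# Derive the known versions from the data itself, so a new Tor release
# series shows up in the graph without a code change.
known_versions <- unique(versions$version)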

[metrics-web/release] Update news.json to version 307 of doc/MetricsTimeline.
by karsten@torproject.org 09 Nov '19
commit f6f0570819a8a2e05c3e22636b21e00629d50b8f
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Jan 7 12:27:51 2019 +0100
Update news.json to version 307 of doc/MetricsTimeline.
---
src/main/resources/web/json/news.json | 202 +++++++++++++++++++++++++++++-----
1 file changed, 173 insertions(+), 29 deletions(-)
diff --git a/src/main/resources/web/json/news.json b/src/main/resources/web/json/news.json
index 9810770..26bf78a 100644
--- a/src/main/resources/web/json/news.json
+++ b/src/main/resources/web/json/news.json
@@ -2489,6 +2489,15 @@
"target" : "https://en.wikipedia.org/wiki/Hurricane_Maria#Puerto_Rico_3"
} ]
}, {
+ "start" : "2017-10-04",
+ "protocols" : [ "fte" ],
+ "short_description" : "Permanent hardware failure of default FTE bridge 128.105.214.161:8080.",
+ "description" : "Permanent hardware failure of default FTE bridge 128.105.214.161:8080.",
+ "links" : [ {
+ "label" : "comment",
+ "target" : "https://bugs.torproject.org/28521#comment:2"
+ } ]
+}, {
"start" : "2017-10-05",
"protocols" : [ "ipv4", "ipv6" ],
"short_description" : "geoip and geoip6 databases updated to \"October 4 2017 Maxmind GeoLite2 Country\"",
@@ -2534,6 +2543,9 @@
"links" : [ {
"label" : "wikipedia",
"target" : "https://en.wikipedia.org/wiki/19th_National_Congress_of_the_Communist_Party…"
+ }, {
+ "label" : "Psiphon users",
+ "target" : "https://media.ccc.de/v/35c3-9964-cat_mouse_evading_the_censors_in_2018#t=20…"
} ]
}, {
"start" : "2017-10-25",
@@ -2754,6 +2766,9 @@
}, {
"label" : "tweet",
"target" : "https://twitter.com/nusenu_/status/948914485045145601"
+ }, {
+ "label" : "Psiphon users",
+ "target" : "https://media.ccc.de/v/35c3-9964-cat_mouse_evading_the_censors_in_2018#t=17…"
} ]
}, {
"start" : "2018-01-01",
@@ -3045,6 +3060,16 @@
"target" : "https://metrics.torproject.org/userstats-bridge-country.html?start=2018-03-…"
} ]
}, {
+ "start" : "2018-04-20",
+ "end" : "2018-04-27",
+ "protocols" : [ "onion" ],
+ "short_description" : "The number of v2 onion services increases from 70k to 120k.",
+ "description" : "The number of v2 onion services increases from 70k to 120k.",
+ "links" : [ {
+ "label" : "ticket",
+ "target" : "https://bugs.torproject.org/26081"
+ } ]
+}, {
"start" : "2018-04-28",
"places" : [ "ru" ],
"short_description" : "Russia unblocks about 3 million IP addresses belonging to Amazon and OVH.",
@@ -3074,6 +3099,9 @@
}, {
"label" : "bridge graph",
"target" : "https://metrics.torproject.org/userstats-bridge-country.html?start=2018-03-…"
+ }, {
+ "label" : "Psiphon users",
+ "target" : "https://media.ccc.de/v/35c3-9964-cat_mouse_evading_the_censors_in_2018#t=19…"
} ]
}, {
"start" : "2018-05-08",
@@ -3169,6 +3197,16 @@
"target" : "https://www.accessnow.org/venezuela-blocks-tor/"
} ]
}, {
+ "start" : "2018-06-24",
+ "end" : "2018-07-06",
+ "protocols" : [ "snowflake" ],
+ "short_description" : "Metrics for Snowflake are missing, for unknown reasons",
+ "description" : "Metrics for Snowflake are missing, for unknown reasons. The gap is nearly contemporaneous with the gap the measurements for all transports/bridges caused by Bifroest–Serge bridge authority switchover, but starts about 2 weeks earlier in Snowflake's case.",
+ "links" : [ {
+ "label" : "ticket",
+ "target" : "https://bugs.torproject.org/26783"
+ } ]
+}, {
"start" : "2018-06-27",
"protocols" : [ "meek" ],
"short_description" : "Release of Tor Browser 8.0a9 with non-working meek.",
@@ -3198,6 +3236,9 @@
}, {
"label" : "AllAfrica article",
"target" : "http://allafrica.com/stories/201807040129.html"
+ }, {
+ "label" : "OONI report",
+ "target" : "https://ooni.io/post/uganda-social-media-tax/"
} ]
}, {
"start" : "2018-07-04",
@@ -3253,6 +3294,25 @@
} ]
}, {
"start" : "2018-07-14",
+ "end" : "2018-07-25",
+ "places" : [ "iq" ],
+ "short_description" : "Protests, Internet shutdowns, and social media blocks in Iraq.",
+ "description" : "Protests, Internet shutdowns, and social media blocks in Iraq.",
+ "links" : [ {
+ "label" : "relay graph",
+ "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2018-06-0…"
+ }, {
+ "label" : "NetBlocks post",
+ "target" : "https://netblocks.org/reports/study-shows-extent-of-iraq-internet-shutdown-…"
+ }, {
+ "label" : "NRT article about end",
+ "target" : "http://www.nrttv.com/EN/News.aspx?id=2810&MapID=1"
+ }, {
+ "label" : "Psiphon users",
+ "target" : "https://media.ccc.de/v/35c3-9964-cat_mouse_evading_the_censors_in_2018#t=17…"
+ } ]
+}, {
+ "start" : "2018-07-14",
"protocols" : [ "bridge" ],
"short_description" : "Release of Tor 0.2.9.16, 0.3.2.11, 0.3.3.9, and 0.3.4.5-rc",
"description" : "Release of Tor 0.2.9.16, 0.3.2.11, 0.3.3.9, and 0.3.4.5-rc. Switches bridge authority from Bifroest to <a href=\"https://metrics.torproject.org/rs.html#details/BA44A889E64B93FAA2B114E02C2A…">Serge</a>. The number of bridges begins counting up from zero as bridges are upgraded. The estimated number of bridge users remained unavailable until 2018-07-21 because of the discontinuity.",
@@ -3302,6 +3362,9 @@
}, {
"label" : "Daily Star article on throttling",
"target" : "https://www.thedailystar.net/country/bangladesh-mobile-internet-speed-broug…"
+ }, {
+ "label" : "NetBlocks report",
+ "target" : "https://netblocks.org/reports/bangladesh-internet-shutdown-student-protests…"
} ]
}, {
"start" : "2018-08-04",
@@ -3316,6 +3379,95 @@
"target" : "https://lists.torproject.org/pipermail/tor-relays/2018-August/015850.html"
} ]
}, {
+ "start" : "2018-09-26",
+ "ongoing" : true,
+ "protocols" : [ "fte" ],
+ "short_description" : "Outage of default FTE bridge 128.105.214.162:8080.",
+ "description" : "Outage of default FTE bridge 128.105.214.162:8080.",
+ "links" : [ {
+ "label" : "comment",
+ "target" : "https://bugs.torproject.org/28521#comment:3"
+ } ]
+}, {
+ "start" : "2018-09-26",
+ "ongoing" : true,
+ "protocols" : [ "fte" ],
+ "short_description" : "Outage of default FTE bridge 128.105.214.163:8080.",
+ "description" : "Outage of default FTE bridge 128.105.214.163:8080.",
+ "links" : [ {
+ "label" : "comment",
+ "target" : "https://bugs.torproject.org/28521#comment:3"
+ } ]
+}, {
+ "start" : "2018-10-25",
+ "end" : "2018-10-28",
+ "protocols" : [ "relay" ],
+ "short_description" : "Gap in observed number of relay users, caused by the estimated fraction of reporting relays exceeding 100%.",
+ "description" : "Gap in observed number of relay users, caused by the estimated fraction of reporting relays exceeding 100%.",
+ "links" : [ {
+ "label" : "metrics-team post",
+ "target" : "https://lists.torproject.org/pipermail/metrics-team/2018-November/000936.ht…"
+ }, {
+ "label" : "ticket",
+ "target" : "https://bugs.torproject.org/28305"
+ }, {
+ "label" : "graph",
+ "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2018-10-0…"
+ }, {
+ "label" : "archived graph",
+ "target" : "https://web.archive.org/web/20181104023227/https://metrics.torproject.org/u…"
+ } ]
+}, {
+ "start" : "2018-11-16",
+ "end" : "2018-11-22",
+ "protocols" : [ "snowflake" ],
+ "short_description" : "A full disk stops the Snowflake bridge and fallback proxies from working.",
+ "description" : "A full disk stops the Snowflake bridge and fallback proxies from working.",
+ "links" : [ {
+ "label" : "ticket",
+ "target" : "https://bugs.torproject.org/28390"
+ } ]
+}, {
+ "start" : "2018-11-19",
+ "end" : "2018-11-27",
+ "short_description" : "Slow-running processes on the metrics host cause an observed drop in the overall relay bandwidth graph.",
+ "description" : "Slow-running processes on the metrics host cause an observed drop in the overall relay bandwidth graph.",
+ "links" : [ {
+ "label" : "mailing list thread",
+ "target" : "https://lists.torproject.org/pipermail/metrics-team/2018-December/000971.ht…"
+ } ]
+}, {
+ "start" : "2018-11-26",
+ "end" : "2018-11-28",
+ "short_description" : "Outage of the onionperf-us instance, caused by a Greenhost east coast VPS migration.",
+ "description" : "Outage of the onionperf-us instance, caused by a Greenhost east coast VPS migration.",
+ "links" : [ {
+ "label" : "mailing list post",
+ "target" : "https://lists.torproject.org/pipermail/metrics-team/2018-November/000967.ht…"
+ } ]
+}, {
+ "start" : "2018-11-26",
+ "end" : "2018-11-28",
+ "protocols" : [ "flashproxy" ],
+ "short_description" : "Outage of the flash proxy badge hosting server, flashproxy.bamsoftware.com, caused by a Greenhost east coast VPS migration.",
+ "description" : "Outage of the flash proxy badge hosting server, flashproxy.bamsoftware.com, caused by a Greenhost east coast VPS migration."
+}, {
+ "start" : "2018-12-20",
+ "ongoing" : true,
+ "places" : [ "sd" ],
+ "short_description" : "Protests and social media blocks in Sudan.",
+ "description" : "Protests and social media blocks in Sudan.",
+ "links" : [ {
+ "label" : "relay graph",
+ "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2018-11-1…"
+ }, {
+ "label" : "Access Now post",
+ "target" : "https://www.accessnow.org/amid-countrywide-protest-sudan-shuts-down-social-…"
+ }, {
+ "label" : "Psiphon users",
+ "target" : "https://media.ccc.de/v/35c3-9964-cat_mouse_evading_the_censors_in_2018#t=16…"
+ } ]
+}, {
"start" : "2016-02-24",
"places" : [ "tm" ],
"protocols" : [ "<OR>" ],
@@ -3420,8 +3572,8 @@
"end" : "2017-03-01",
"places" : [ "ae" ],
"protocols" : [ "<OR>", "relay" ],
- "short_description" : "Huge increase in relay users",
- "description" : "Huge increase in relay users (400k+). An anonymous contributor suggests that it may be a botnet, based on the large number of hosts with an open SMB port in the UAE.",
+ "short_description" : "Huge spike in relay users",
+ "description" : "Huge spike in relay users (400k+). An anonymous contributor suggests that it may be a botnet, based on the large number of hosts with an open SMB port in the UAE.",
"links" : [ {
"label" : "graph",
"target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2017-01-0…"
@@ -3480,14 +3632,17 @@
"unknown" : true
}, {
"start" : "2017-03-01",
- "end" : "2017-07-01",
+ "end" : "2018-11-12",
"places" : [ "ae" ],
"protocols" : [ "<OR>", "relay" ],
- "short_description" : "Another increase in relay users, with a slower rate of growth than the previous one.",
- "description" : "Another increase in relay users, with a slower rate of growth than the previous one.",
+ "short_description" : "Another increase in relay users, slower and more sustained than the previous one.",
+ "description" : "Another increase in relay users, slower and more sustained than the previous one.",
"links" : [ {
"label" : "graph",
- "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2017-01-0…"
+ "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2016-07-0…"
+ }, {
+ "label" : "ticket about end",
+ "target" : "https://bugs.torproject.org/28898#comment:2"
} ],
"unknown" : true
}, {
@@ -3609,18 +3764,6 @@
} ],
"unknown" : true
}, {
- "start" : "2017-07-01",
- "end" : "2017-08-30",
- "places" : [ "ae" ],
- "protocols" : [ "<OR>", "relay" ],
- "short_description" : "Slow increase in relay users.",
- "description" : "Slow increase in relay users.",
- "links" : [ {
- "label" : "graph",
- "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2017-06-0…"
- } ],
- "unknown" : true
-}, {
"start" : "2017-07-15",
"end" : "2017-07-22",
"places" : [ "sc" ],
@@ -3786,17 +3929,6 @@
} ],
"unknown" : true
}, {
- "start" : "2017-09-01",
- "places" : [ "ae" ],
- "protocols" : [ "<OR>", "relay" ],
- "short_description" : "Relay users remain volatile but flatten their rate of growth.",
- "description" : "Relay users remain volatile but flatten their rate of growth.",
- "links" : [ {
- "label" : "graph",
- "target" : "https://metrics.torproject.org/userstats-relay-country.html?start=2017-06-0…"
- } ],
- "unknown" : true
-}, {
"start" : "2017-09-02",
"end" : "2017-09-21",
"protocols" : [ "obfs4" ],
@@ -3956,6 +4088,18 @@
} ],
"unknown" : true
}, {
+ "start" : "2018-05-22",
+ "end" : "2018-06-08",
+ "places" : [ "iq" ],
+ "protocols" : [ "obfs4" ],
+ "short_description" : "Increase in obfs4 users in Iraq, followed by a slow decrease",
+ "description" : "Increase in obfs4 users in Iraq, followed by a slow decrease. No matching change in relay users or other transports.",
+ "links" : [ {
+ "label" : "bridge graph",
+ "target" : "https://metrics.torproject.org/userstats-bridge-country.html?start=2018-04-…"
+ } ],
+ "unknown" : true
+}, {
"start" : "2018-07-28",
"ongoing" : true,
"places" : [ "tr" ],

[metrics-web/release] Remove two unused R files from censorship detector.
by karsten@torproject.org 09 Nov '19
commit c0a18aab9092c57f107732cb2f97f034909e94d9
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Dec 20 14:09:24 2018 +0100
Remove two unused R files from censorship detector.
Still part of #21588.
---
src/main/R/clients/merge-clients.R | 19 -------------------
src/main/R/clients/userstats-detector.R | 18 ------------------
2 files changed, 37 deletions(-)
diff --git a/src/main/R/clients/merge-clients.R b/src/main/R/clients/merge-clients.R
deleted file mode 100644
index cce7e9d..0000000
--- a/src/main/R/clients/merge-clients.R
+++ /dev/null
@@ -1,19 +0,0 @@
-require(reshape)
-r <- read.csv("userstats-ranges.csv", stringsAsFactors = FALSE)
-r <- melt(r, id.vars = c("date", "country"))
-r <- data.frame(date = r$date, node = "relay", country = r$country,
- transport = "", version = "",
- variable = ifelse(r$variable == "maxusers", "upper", "lower"),
- value = floor(r$value))
-u <- read.csv("userstats.csv", stringsAsFactors = FALSE)
-u <- melt(u, id.vars = c("date", "node", "country", "transport",
- "version"))
-u <- data.frame(date = u$date, node = u$node, country = u$country,
- transport = u$transport, version = u$version,
- variable = ifelse(u$variable == "frac", "frac", "clients"),
- value = u$value)
-c <- rbind(r, u)
-c <- cast(c, date + node + country + transport + version ~ variable)
-c <- c[order(as.Date(c$date), c$node, c$country, c$transport, c$version), ]
-write.csv(c, "clients.csv", quote = FALSE, row.names = FALSE, na = "")
-
diff --git a/src/main/R/clients/userstats-detector.R b/src/main/R/clients/userstats-detector.R
deleted file mode 100644
index c3a9041..0000000
--- a/src/main/R/clients/userstats-detector.R
+++ /dev/null
@@ -1,18 +0,0 @@
-library("reshape")
-export_userstats_detector <- function(path) {
- c <- read.csv("userstats.csv", stringsAsFactors = FALSE)
- c <- c[c$country != '' & c$transport == '' & c$version == '' &
- c$node == 'relay', ]
- u <- data.frame(country = c$country, date = c$date, users = c$users,
- stringsAsFactors = FALSE)
- u <- rbind(u, data.frame(country = "zy",
- aggregate(list(users = u$users),
- by = list(date = u$date), sum)))
- u <- data.frame(date = u$date, country = u$country,
- users = floor(u$users))
- u <- cast(u, date ~ country, value = "users")
- names(u)[names(u) == "zy"] <- "all"
- write.csv(u, path, quote = FALSE, row.names = FALSE)
-}
-export_userstats_detector("userstats-detector.csv")
-
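
The deleted files were among the last users of the superseded reshape package here. For reference, a rough dplyr/tidyr equivalent of the cast() pivot in userstats-detector.R could look like this (a sketch, not part of the patch):
library(dplyr)
library(tidyr)

u <- read.csv("userstats.csv", stringsAsFactors = FALSE) %>%
  filter(country != "", transport == "", version == "", node == "relay") %>%
  select(date, country, users)

# Append a per-date total under the country code "all", then pivot
# countries into columns, as cast(u, date ~ country) used to do.
u %>%
  bind_rows(u %>% group_by(date) %>%
    summarize(users = sum(users)) %>% mutate(country = "all")) %>%
  mutate(users = floor(users)) %>%
  spread(country, users) %>%
  write.csv("userstats-detector.csv", quote = FALSE, row.names = FALSE)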

[metrics-web/release] Fix Traffic link on start page.
by karsten@torproject.org 09 Nov '19
commit 87f922d4fd555804d4e80fdefd7968acce5f4433
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Sat Dec 29 09:23:35 2018 +0100
Fix Traffic link on start page.
---
src/main/resources/web/jsps/index.jsp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/main/resources/web/jsps/index.jsp b/src/main/resources/web/jsps/index.jsp
index 3fa49b8..ec93792 100644
--- a/src/main/resources/web/jsps/index.jsp
+++ b/src/main/resources/web/jsps/index.jsp
@@ -42,7 +42,7 @@
</div>
<div class="col-sm-4">
- <a href="bandwidth.html"><i class="fa fa-road fa-fw fa-4x" aria-hidden="true"></i> <h3>Traffic</h3> <p>How much traffic the Tor network can handle and how much traffic there is.</p></a>
+ <a href="bandwidth-flags.html"><i class="fa fa-road fa-fw fa-4x" aria-hidden="true"></i> <h3>Traffic</h3> <p>How much traffic the Tor network can handle and how much traffic there is.</p></a>
</div>
<div class="col-sm-4">

[metrics-web/release] Properly skip previously imported webstats files.
by karsten@torproject.org 09 Nov '19
commit 9bdb6d39fc7b0ac8e7327caeafabfac43a41689f
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Jan 7 11:59:19 2019 +0100
Properly skip previously imported webstats files.
Turns out we never skipped previously imported webstats files due to
two bugs:
1. While building a list of previously imported webstats files we
reassembled their file names as ${server}_${site}_* rather than
${site}_${server}_*, which was the file name format we chose in an
earlier version of the CollecTor module.
2. When checking whether a given webstats file already exists in the
database we compared the full file name to the reassembled file
name from the database with ${server} being truncated to 32
characters.
This commit fixes both bugs.
---
src/main/java/org/torproject/metrics/stats/webstats/Main.java | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/torproject/metrics/stats/webstats/Main.java b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
index a154e64..fb0a903 100644
--- a/src/main/java/org/torproject/metrics/stats/webstats/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/webstats/Main.java
@@ -100,7 +100,7 @@ public class Main {
try (ResultSet rs = st.executeQuery(queryString)) {
while (rs.next()) {
importedLogFileUrls.add(String.format("%s_%s_access.log_%s.xz",
- rs.getString(1), rs.getString(2),
+ rs.getString(2), rs.getString(1),
rs.getDate(3).toLocalDate().format(dateFormat)));
}
}
@@ -111,13 +111,19 @@ public class Main {
static void importLogFiles(Connection connection, SortedSet<String> skipFiles,
File... inDirectories) {
+ DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyyMMdd");
for (Descriptor descriptor : DescriptorSourceFactory
.createDescriptorReader().readDescriptors(inDirectories)) {
if (!(descriptor instanceof WebServerAccessLog)) {
continue;
}
WebServerAccessLog logFile = (WebServerAccessLog) descriptor;
- if (skipFiles.contains(logFile.getDescriptorFile().getName())) {
+ String logFileNameWithTruncatedParts = String.format(
+ "%s_%s_access.log_%s.xz",
+ truncateString(logFile.getVirtualHost(), 128),
+ truncateString(logFile.getPhysicalHost(), 32),
+ logFile.getLogDate().format(dateFormat));
+ if (skipFiles.contains(logFileNameWithTruncatedParts)) {
continue;
}
try {
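
The essence of the fix, sketched in R for brevity (the actual patch is Java): reassemble the expected name in the same ${site}_${server} order the database uses, and truncate both parts exactly as the database columns do before comparing. Names below are hypothetical.
# Hypothetical mirror of the Java truncateString(): keep at most n chars.
truncate_string <- function(s, n) substr(s, 1, n)

expected_name <- function(site, server, log_date) {
  sprintf("%s_%s_access.log_%s.xz",
    truncate_string(site, 128),   # virtual host, as stored in the database
    truncate_string(server, 32),  # physical host, truncated to 32 chars
    format(log_date, "%Y%m%d"))
}

# A file is skipped only if its truncated-parts name was imported before;
# imported_log_file_urls stands in for the set read from the database.
skip <- expected_name("site.example.org", "host1.example.org",
  as.Date("2019-01-07")) %in% imported_log_file_urls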

[metrics-web/release] Split up huge plot_userstats function.
by karsten@torproject.org 09 Nov '19
commit f55e63d986ed9c1054ce19ff0d4a19b1c0bce26d
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Jan 10 09:54:39 2019 +0100
Split up huge plot_userstats function.
The mere size of this function made it hard, if not impossible, to refactor
things to use the more recent R packages dplyr and tidyr. Now there are
four plot_userstats_* functions with accompanying prepare_userstats_*
functions that make the corresponding write_userstats_* functions really small.
---
src/main/R/rserver/graphs.R | 269 +++++++++++++++++++-------------------------
1 file changed, 115 insertions(+), 154 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index d3ea90a..ba8862c 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -751,9 +751,9 @@ write_bandwidth_flags <- function(start_p = NULL, end_p = NULL, path_p) {
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
- events_p, path_p) {
- c <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+prepare_userstats_relay_country <- function(start_p, end_p, country_p,
+ events_p) {
+ read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
node = col_character(),
@@ -763,97 +763,26 @@ plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
lower = col_double(),
upper = col_double(),
clients = col_double(),
- frac = col_skip()),
+ frac = col_double()),
na = character()) %>%
- filter(node == node_p)
- u <- c[c$date >= start_p & c$date <= end_p, c("date", "country", "transport",
- "version", "lower", "upper", "clients")]
- u <- rbind(u, data.frame(date = start_p,
- country = ifelse(variable_p == "country" & value_p != "all", value_p, ""),
- transport = ifelse(variable_p == "transport", value_p, ""),
- version = ifelse(variable_p == "version", value_p, ""),
- lower = 0, upper = 0, clients = 0))
- if (node_p == "relay") {
- if (value_p != "all") {
- u <- u[u$country == value_p, ]
- title <- paste("Directly connecting users from", countryname(value_p))
- } else {
- u <- u[u$country == "", ]
- title <- "Directly connecting users"
- }
- u <- aggregate(list(lower = u$lower, upper = u$upper,
- users = u$clients),
- by = list(date = as.Date(u$date, "%Y-%m-%d"),
- value = u$country),
- FUN = sum)
- } else if (variable_p == "transport") {
- if ("!<OR>" %in% value_p) {
- n <- u[u$transport != "" & u$transport != "<OR>", ]
- n <- aggregate(list(lower = n$lower, upper = n$upper,
- clients = n$clients),
- by = list(date = n$date),
- FUN = sum)
- u <- rbind(u, data.frame(date = n$date,
- country = "", transport = "!<OR>",
- version = "", lower = n$lower,
- upper = n$upper, clients = n$clients))
- }
- if (length(value_p) > 1) {
- u <- u[u$transport %in% value_p, ]
- u <- aggregate(list(lower = u$lower, upper = u$upper,
- users = u$clients),
- by = list(date = as.Date(u$date, "%Y-%m-%d"),
- value = u$transport),
- FUN = sum)
- title <- paste("Bridge users by transport")
- } else {
- u <- u[u$transport == value_p, ]
- u <- aggregate(list(lower = u$lower, upper = u$upper,
- users = u$clients),
- by = list(date = as.Date(u$date, "%Y-%m-%d"),
- value = u$transport),
- FUN = sum)
- title <- paste("Bridge users using",
- ifelse(value_p == "<??>", "unknown pluggable transport(s)",
- ifelse(value_p == "<OR>", "default OR protocol",
- ifelse(value_p == "!<OR>", "any pluggable transport",
- ifelse(value_p == "fte", "FTE",
- ifelse(value_p == "websocket", "Flash proxy/websocket",
- paste("transport", value_p)))))))
- }
- } else if (variable_p == "version") {
- u <- u[u$version == value_p, ]
- title <- paste("Bridge users using IP", value_p, sep = "")
- u <- aggregate(list(lower = u$lower, upper = u$upper,
- users = u$clients),
- by = list(date = as.Date(u$date, "%Y-%m-%d"),
- value = u$version),
- FUN = sum)
- } else {
- if (value_p != "all") {
- u <- u[u$country == value_p, ]
- title <- paste("Bridge users from", countryname(value_p))
- } else {
- u <- u[u$country == "" & u$transport == "" & u$version == "", ]
- title <- "Bridge users"
- }
- u <- aggregate(list(lower = u$lower, upper = u$upper,
- users = u$clients),
- by = list(date = as.Date(u$date, "%Y-%m-%d"),
- value = u$country),
- FUN = sum)
- }
- u <- merge(x = u, all.y = TRUE, y = data.frame(expand.grid(
- date = seq(from = as.Date(start_p, "%Y-%m-%d"),
- to = as.Date(end_p, "%Y-%m-%d"), by = "1 day"),
- value = ifelse(value_p == "all", "", value_p))))
- if (length(value_p) > 1) {
- plot <- ggplot(u, aes(x = date, y = users, colour = value))
- } else {
- plot <- ggplot(u, aes(x = date, y = users))
- }
+ filter(node == "relay") %>%
+ filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
+ filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
+ filter(if (!is.null(country_p))
+ country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
+ filter(transport == "") %>%
+ filter(version == "") %>%
+ select(date, country, clients, lower, upper, frac) %>%
+ rename(users = clients)
+}
+
+plot_userstats_relay_country <- function(start_p, end_p, country_p, events_p,
+ path_p) {
+ u <- prepare_userstats_relay_country(start_p, end_p, country_p, events_p) %>%
+ complete(date = full_seq(date, period = 1))
+ plot <- ggplot(u, aes(x = date, y = users))
if (length(na.omit(u$users)) > 0 & events_p != "off" &
- variable_p == "country" & length(value_p) == 1 && value_p != "all") {
+ country_p != "all") {
upturns <- u[u$users > u$upper, c("date", "users")]
downturns <- u[u$users < u$lower, c("date", "users")]
if (events_p == "on") {
@@ -875,69 +804,20 @@ plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
- ggtitle(title) +
+ ggtitle(paste("Directly connecting users",
+ ifelse(country_p == "all", "",
+ paste(" from", countryname(country_p))), sep = "")) +
labs(caption = copyright_notice)
- if (length(value_p) > 1) {
- plot <- plot +
- scale_colour_hue(name = "", breaks = value_p,
- labels = ifelse(value_p == "<??>", "Unknown PT",
- ifelse(value_p == "<OR>", "Default OR protocol",
- ifelse(value_p == "!<OR>", "Any PT",
- ifelse(value_p == "fte", "FTE",
- ifelse(value_p == "websocket", "Flash proxy/websocket",
- value_p))))))
- }
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-plot_userstats_relay_country <- function(start_p, end_p, country_p, events_p,
- path_p) {
- plot_userstats(start_p, end_p, "relay", "country", country_p, events_p,
- path_p)
-}
-
-plot_userstats_bridge_country <- function(start_p, end_p, country_p, path_p) {
- plot_userstats(start_p, end_p, "bridge", "country", country_p, "off", path_p)
-}
-
-plot_userstats_bridge_transport <- function(start_p, end_p, transport_p,
- path_p) {
- plot_userstats(start_p, end_p, "bridge", "transport", transport_p, "off",
- path_p)
-}
-
-plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
- plot_userstats(start_p, end_p, "bridge", "version", version_p, "off", path_p)
-}
-
write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
country_p = NULL, events_p = NULL, path_p) {
- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
- col_types = cols(
- date = col_date(format = ""),
- node = col_character(),
- country = col_character(),
- transport = col_character(),
- version = col_character(),
- lower = col_double(),
- upper = col_double(),
- clients = col_double(),
- frac = col_double()),
- na = character()) %>%
- filter(node == "relay") %>%
- filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
- filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(if (!is.null(country_p))
- country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
- filter(transport == "") %>%
- filter(version == "") %>%
- select(date, country, clients, lower, upper, frac) %>%
- rename(users = clients) %>%
+ prepare_userstats_relay_country(start_p, end_p, country_p, events_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
- country_p = NULL, path_p) {
+prepare_userstats_bridge_country <- function(start_p, end_p, country_p) {
read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -958,12 +838,32 @@ write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
filter(transport == "") %>%
filter(version == "") %>%
select(date, country, clients, frac) %>%
- rename(users = clients) %>%
+ rename(users = clients)
+}
+
+plot_userstats_bridge_country <- function(start_p, end_p, country_p, path_p) {
+ prepare_userstats_bridge_country(start_p, end_p, country_p) %>%
+ complete(date = full_seq(date, period = 1)) %>%
+ ggplot(aes(x = date, y = users)) +
+ geom_line() +
+ scale_x_date(name = "", breaks = custom_breaks,
+ labels = custom_labels, minor_breaks = custom_minor_breaks) +
+ scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+ ggtitle(paste("Bridge users",
+ ifelse(country_p == "all", "",
+ paste(" from", countryname(country_p))), sep = "")) +
+ labs(caption = copyright_notice)
+ ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
+ country_p = NULL, path_p) {
+ prepare_userstats_bridge_country(start_p, end_p, country_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
- transport_p = NULL, path_p) {
+prepare_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
+ transport_p = NULL) {
u <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -992,15 +892,58 @@ write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
}
u %>%
filter(if (!is.null(transport_p)) transport %in% transport_p else TRUE) %>%
- group_by(date, transport) %>%
select(date, transport, clients, frac) %>%
rename(users = clients) %>%
- arrange(date, transport) %>%
+ arrange(date, transport)
+}
+
+plot_userstats_bridge_transport <- function(start_p, end_p, transport_p,
+ path_p) {
+ if (length(transport_p) > 1) {
+ title <- paste("Bridge users by transport")
+ } else {
+ title <- paste("Bridge users using",
+ ifelse(transport_p == "<??>", "unknown pluggable transport(s)",
+ ifelse(transport_p == "<OR>", "default OR protocol",
+ ifelse(transport_p == "!<OR>", "any pluggable transport",
+ ifelse(transport_p == "fte", "FTE",
+ ifelse(transport_p == "websocket", "Flash proxy/websocket",
+ paste("transport", transport_p)))))))
+ }
+ u <- prepare_userstats_bridge_transport(start_p, end_p, transport_p) %>%
+ complete(date = full_seq(date, period = 1), nesting(transport))
+ if (length(transport_p) > 1) {
+ plot <- ggplot(u, aes(x = date, y = users, colour = transport))
+ } else {
+ plot <- ggplot(u, aes(x = date, y = users))
+ }
+ plot <- plot +
+ geom_line() +
+ scale_x_date(name = "", breaks = custom_breaks,
+ labels = custom_labels, minor_breaks = custom_minor_breaks) +
+ scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+ ggtitle(title) +
+ labs(caption = copyright_notice)
+ if (length(transport_p) > 1) {
+ plot <- plot +
+ scale_colour_hue(name = "", breaks = transport_p,
+ labels = ifelse(transport_p == "<??>", "Unknown PT",
+ ifelse(transport_p == "<OR>", "Default OR protocol",
+ ifelse(transport_p == "!<OR>", "Any PT",
+ ifelse(transport_p == "fte", "FTE",
+ ifelse(transport_p == "websocket", "Flash proxy/websocket",
+ transport_p))))))
+ }
+ ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
+ transport_p = NULL, path_p) {
+ prepare_userstats_bridge_transport(start_p, end_p, transport_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
- version_p = NULL, path_p) {
+prepare_userstats_bridge_version <- function(start_p, end_p, version_p) {
read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -1019,7 +962,25 @@ write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
filter(is.na(transport)) %>%
filter(if (!is.null(version_p)) version == version_p else TRUE) %>%
select(date, version, clients, frac) %>%
- rename(users = clients) %>%
+ rename(users = clients)
+}
+
+plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
+ prepare_userstats_bridge_version(start_p, end_p, version_p) %>%
+ complete(date = full_seq(date, period = 1)) %>%
+ ggplot(aes(x = date, y = users)) +
+ geom_line() +
+ scale_x_date(name = "", breaks = custom_breaks,
+ labels = custom_labels, minor_breaks = custom_minor_breaks) +
+ scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+ ggtitle(paste("Bridge users using IP", version_p, sep = "")) +
+ labs(caption = copyright_notice)
+ ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
+ version_p = NULL, path_p) {
+ prepare_userstats_bridge_version(start_p, end_p, version_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
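
The resulting structure, in brief: each prepare_userstats_* function returns a tidy data frame that both its plot_* and write_* counterparts consume. A generic sketch of that shape (names are illustrative, not from the patch):
library(dplyr)
library(ggplot2)

# prepare_*: read, filter, and tidy the data once.
prepare_example <- function(start_p = NULL, end_p = NULL) {
  read.csv("example.csv", colClasses = c("date" = "Date")) %>%
    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
}

# plot_*: visualize the prepared data (assumes a "users" column).
plot_example <- function(start_p, end_p, path_p) {
  plot <- prepare_example(start_p, end_p) %>%
    ggplot(aes(x = date, y = users)) +
    geom_line()
  ggsave(filename = path_p, plot = plot, width = 8, height = 5, dpi = 150)
}

# write_*: dump the same prepared data as .csv.
write_example <- function(start_p = NULL, end_p = NULL, path_p) {
  prepare_example(start_p, end_p) %>%
    write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}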

[metrics-web/release] Make write_* functions obsolete.
by karsten@torproject.org 09 Nov '19
commit 0d2f1e2afd5f4b9e5c533d256586bb03d7466d5f
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Jan 10 15:39:04 2019 +0100
Make write_* functions obsolete.
In most cases these functions would call their prepare_* equivalents,
possibly tweak the result, and write it to a .csv file. This patch
moves all those tweaks to the prepare_* functions, possibly reverts
them in the plot_* functions, and makes the write_* functions
obsolete.
The result is not only less code: we're also going to find bugs in
written .csv files sooner, because the same code now runs when writing
graph files, and graphs are generated much more often.
---
src/main/R/rserver/graphs.R | 414 +++++++--------------
.../torproject/metrics/web/RObjectGenerator.java | 2 +-
2 files changed, 140 insertions(+), 276 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 27f399d..82a51e7 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -348,10 +348,17 @@ robust_call <- function(wrappee, filename) {
})
}
+# Write the result of the given FUN, typically a prepare_ function, as .csv file
+# to the given path_p.
+write_data <- function(FUN, ..., path_p) {
+ FUN(...) %>%
+ write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
+}
+
# Disable readr's automatic progress bar.
options(readr.show_progress = FALSE)
-prepare_networksize <- function(start_p, end_p) {
+prepare_networksize <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "networksize.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -375,12 +382,7 @@ plot_networksize <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_networksize <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_networksize(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_versions <- function(start_p, end_p) {
+prepare_versions <- function(start_p = NULL, end_p = NULL) {
read_csv(paste(stats_dir, "versions.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -413,42 +415,34 @@ plot_versions <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_versions <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_versions(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_platforms <- function(start_p, end_p) {
+prepare_platforms <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "platforms.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
- filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
+ filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
+ mutate(platform = tolower(platform)) %>%
+ spread(platform, relays)
}
plot_platforms <- function(start_p, end_p, path_p) {
prepare_platforms(start_p, end_p) %>%
+ gather(platform, relays, -date) %>%
ggplot(aes(x = date, y = relays, colour = platform)) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
scale_colour_manual(name = "Platform",
- breaks = c("Linux", "macOS", "BSD", "Windows", "Other"),
- values = c("Linux" = "#56B4E9", "macOS" = "#333333", "BSD" = "#E69F00",
- "Windows" = "#0072B2", "Other" = "#009E73")) +
+ breaks = c("linux", "macos", "bsd", "windows", "other"),
+ labels = c("Linux", "macOS", "BSD", "Windows", "Other"),
+ values = c("linux" = "#56B4E9", "macos" = "#333333", "bsd" = "#E69F00",
+ "windows" = "#0072B2", "other" = "#009E73")) +
ggtitle("Relay platforms") +
labs(caption = copyright_notice)
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_platforms <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_platforms(start_p, end_p) %>%
- mutate(platform = tolower(platform)) %>%
- spread(platform, relays) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_dirbytes <- function(start_p, end_p, path_p) {
+prepare_dirbytes <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -478,12 +472,7 @@ plot_dirbytes <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_dirbytes <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_dirbytes(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_relayflags <- function(start_p, end_p, flag_p) {
+prepare_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL) {
read.csv(paste(stats_dir, "relayflags.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -507,13 +496,8 @@ plot_relayflags <- function(start_p, end_p, flag_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL,
- path_p) {
- prepare_relayflags(start_p, end_p, flag_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
+prepare_torperf <- function(start_p = NULL, end_p = NULL, server_p = NULL,
+ filesize_p = NULL) {
read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
colClasses = c("date" = "Date", "source" = "character")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -528,7 +512,7 @@ prepare_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
}
plot_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
- prepare_torperf(start_p, end_p, server_p, filesize_p, path_p) %>%
+ prepare_torperf(start_p, end_p, server_p, filesize_p) %>%
filter(source != "") %>%
complete(date = full_seq(date, period = 1), nesting(source)) %>%
ggplot(aes(x = date, y = md, ymin = q1, ymax = q3, fill = source)) +
@@ -549,13 +533,8 @@ plot_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_torperf <- function(start_p = NULL, end_p = NULL, server_p = NULL,
- filesize_p = NULL, path_p) {
- prepare_torperf(start_p, end_p, server_p, filesize_p, path_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_torperf_failures <- function(start_p, end_p, server_p, filesize_p) {
+prepare_torperf_failures <- function(start_p = NULL, end_p = NULL,
+ server_p = NULL, filesize_p = NULL) {
read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -593,24 +572,13 @@ plot_torperf_failures <- function(start_p, end_p, server_p, filesize_p,
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_torperf_failures <- function(start_p = NULL, end_p = NULL,
- server_p = NULL, filesize_p = NULL, path_p) {
- prepare_torperf_failures(start_p, end_p, server_p, filesize_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_onionperf_buildtimes <- function(start_p, end_p) {
+prepare_onionperf_buildtimes <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "buildtimes.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
}
-write_onionperf_buildtimes <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_onionperf_buildtimes(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
plot_onionperf_buildtimes <- function(start_p, end_p, path_p) {
prepare_onionperf_buildtimes(start_p, end_p) %>%
filter(source != "") %>%
@@ -634,20 +602,15 @@ plot_onionperf_buildtimes <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-prepare_onionperf_latencies <- function(start_p, end_p, server_p) {
- read.csv(paste(stats_dir, "latencies.csv", sep = ""),
+prepare_onionperf_latencies <- function(start_p = NULL, end_p = NULL,
+ server_p = NULL) {
+ read.csv(paste(stats_dir, "latencies.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(server_p)) server == server_p else TRUE)
}
-write_onionperf_latencies <- function(start_p = NULL, end_p = NULL,
- server_p = NULL, path_p) {
- prepare_onionperf_latencies(start_p, end_p, server_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
plot_onionperf_latencies <- function(start_p, end_p, server_p, path_p) {
prepare_onionperf_latencies(start_p, end_p, server_p) %>%
filter(source != "") %>%
@@ -667,21 +630,22 @@ plot_onionperf_latencies <- function(start_p, end_p, server_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-prepare_connbidirect <- function(start_p, end_p) {
+prepare_connbidirect <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""),
colClasses = c("date" = "Date", "direction" = "factor")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
mutate(quantile = paste("X", quantile, sep = ""),
fraction = fraction / 100) %>%
- spread(quantile, fraction)
+ spread(quantile, fraction) %>%
+ rename(q1 = X0.25, md = X0.5, q3 = X0.75)
}
plot_connbidirect <- function(start_p, end_p, path_p) {
prepare_connbidirect(start_p, end_p) %>%
- ggplot(aes(x = date, y = X0.5, colour = direction)) +
+ ggplot(aes(x = date, y = md, colour = direction)) +
geom_line(size = 0.75) +
- geom_ribbon(aes(x = date, ymin = X0.25, ymax = X0.75,
+ geom_ribbon(aes(x = date, ymin = q1, ymax = q3,
fill = direction), alpha = 0.5, show.legend = FALSE) +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -700,13 +664,7 @@ plot_connbidirect <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_connbidirect <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_connbidirect(start_p, end_p) %>%
- rename(q1 = X0.25, md = X0.5, q3 = X0.75) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_bandwidth_flags <- function(start_p, end_p) {
+prepare_bandwidth_flags <- function(start_p = NULL, end_p = NULL) {
advbw <- read.csv(paste(stats_dir, "advbw.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
transmute(date, have_guard_flag = isguard, have_exit_flag = isexit,
@@ -719,11 +677,13 @@ prepare_bandwidth_flags <- function(start_p, end_p) {
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(have_exit_flag != "") %>%
- filter(have_guard_flag != "")
+ filter(have_guard_flag != "") %>%
+ spread(variable, value)
}
plot_bandwidth_flags <- function(start_p, end_p, path_p) {
prepare_bandwidth_flags(start_p, end_p) %>%
+ gather(variable, value, c(advbw, bwhist)) %>%
unite(flags, have_guard_flag, have_exit_flag) %>%
mutate(flags = factor(flags, levels = c("f_t", "t_t", "t_f", "f_f"),
labels = c("Exit only", "Guard and Exit", "Guard only",
@@ -745,14 +705,8 @@ plot_bandwidth_flags <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_bandwidth_flags <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_bandwidth_flags(start_p, end_p) %>%
- spread(variable, value) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_userstats_relay_country <- function(start_p, end_p, country_p,
- events_p) {
+prepare_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
+ country_p = NULL, events_p = NULL) {
read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -811,13 +765,8 @@ plot_userstats_relay_country <- function(start_p, end_p, country_p, events_p,
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
- country_p = NULL, events_p = NULL, path_p) {
- prepare_userstats_relay_country(start_p, end_p, country_p, events_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_userstats_bridge_country <- function(start_p, end_p, country_p) {
+prepare_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
+ country_p = NULL) {
read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -856,12 +805,6 @@ plot_userstats_bridge_country <- function(start_p, end_p, country_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
- country_p = NULL, path_p) {
- prepare_userstats_bridge_country(start_p, end_p, country_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
prepare_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
transport_p = NULL) {
u <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
@@ -937,13 +880,8 @@ plot_userstats_bridge_transport <- function(start_p, end_p, transport_p,
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
- transport_p = NULL, path_p) {
- prepare_userstats_bridge_transport(start_p, end_p, transport_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_userstats_bridge_version <- function(start_p, end_p, version_p) {
+prepare_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
+ version_p = NULL) {
read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
col_types = cols(
date = col_date(format = ""),
@@ -978,27 +916,28 @@ plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
- version_p = NULL, path_p) {
- prepare_userstats_bridge_version(start_p, end_p, version_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_userstats_bridge_combined <- function(start_p, end_p, country_p) {
- read_csv(file = paste(stats_dir, "userstats-combined.csv", sep = ""),
- col_types = cols(
- date = col_date(format = ""),
- node = col_skip(),
- country = col_character(),
- transport = col_character(),
- version = col_skip(),
- frac = col_double(),
- low = col_double(),
- high = col_double()),
- na = character()) %>%
- filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
- filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(if (!is.null(country_p)) country == country_p else TRUE)
+prepare_userstats_bridge_combined <- function(start_p = NULL, end_p = NULL,
+ country_p = NULL) {
+ if (!is.null(country_p) && country_p == "all") {
+ prepare_userstats_bridge_country(start_p, end_p, country_p)
+ } else {
+ read_csv(file = paste(stats_dir, "userstats-combined.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ node = col_skip(),
+ country = col_character(),
+ transport = col_character(),
+ version = col_skip(),
+ frac = col_double(),
+ low = col_double(),
+ high = col_double()),
+ na = character()) %>%
+ filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
+ filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
+ filter(if (!is.null(country_p)) country == country_p else TRUE) %>%
+ select(date, country, transport, low, high, frac) %>%
+ arrange(date, country, transport)
+ }
}
plot_userstats_bridge_combined <- function(start_p, end_p, country_p, path_p) {
@@ -1028,19 +967,7 @@ plot_userstats_bridge_combined <- function(start_p, end_p, country_p, path_p) {
}
}
-write_userstats_bridge_combined <- function(start_p = NULL, end_p = NULL,
- country_p = NULL, path_p) {
- if (!is.null(country_p) && country_p == "all") {
- write_userstats_bridge_country(start_p, end_p, country_p, path_p)
- } else {
- prepare_userstats_bridge_combined(start_p, end_p, country_p) %>%
- select(date, country, transport, low, high, frac) %>%
- arrange(date, country, transport) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
- }
-}
-
-prepare_advbwdist_perc <- function(start_p, end_p, p_p) {
+prepare_advbwdist_perc <- function(start_p = NULL, end_p = NULL, p_p = NULL) {
read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -1048,15 +975,18 @@ prepare_advbwdist_perc <- function(start_p, end_p, p_p) {
filter(if (!is.null(p_p)) percentile %in% as.numeric(p_p) else
percentile != "") %>%
transmute(date, percentile = as.factor(percentile),
- variable = ifelse(is.na(isexit), "all", "exits"),
- advbw = advbw * 8 / 1e9)
+ variable = ifelse(isexit == "t", "exits", "all"),
+ advbw = advbw * 8 / 1e9) %>%
+ spread(variable, advbw) %>%
+ rename(p = percentile)
}
plot_advbwdist_perc <- function(start_p, end_p, p_p, path_p) {
prepare_advbwdist_perc(start_p, end_p, p_p) %>%
+ gather(variable, advbw, -c(date, p)) %>%
mutate(variable = ifelse(variable == "all", "All relays",
"Exits only")) %>%
- ggplot(aes(x = date, y = advbw, colour = percentile)) +
+ ggplot(aes(x = date, y = advbw, colour = p)) +
facet_grid(variable ~ .) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
@@ -1069,15 +999,7 @@ plot_advbwdist_perc <- function(start_p, end_p, p_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_advbwdist_perc <- function(start_p = NULL, end_p = NULL, p_p = NULL,
- path_p) {
- prepare_advbwdist_perc(start_p, end_p, p_p) %>%
- spread(variable, advbw) %>%
- rename(p = percentile) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_advbwdist_relay <- function(start_p, end_p, n_p) {
+prepare_advbwdist_relay <- function(start_p = NULL, end_p = NULL, n_p = NULL) {
read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -1086,14 +1008,17 @@ prepare_advbwdist_relay <- function(start_p, end_p, n_p) {
relay != "") %>%
transmute(date, relay = as.factor(relay),
variable = ifelse(isexit != "t", "all", "exits"),
- advbw = advbw * 8 / 1e9)
+ advbw = advbw * 8 / 1e9) %>%
+ spread(variable, advbw) %>%
+ rename(n = relay)
}
plot_advbwdist_relay <- function(start_p, end_p, n_p, path_p) {
prepare_advbwdist_relay(start_p, end_p, n_p) %>%
+ gather(variable, advbw, -c(date, n)) %>%
mutate(variable = ifelse(variable == "all", "All relays",
"Exits only")) %>%
- ggplot(aes(x = date, y = advbw, colour = relay)) +
+ ggplot(aes(x = date, y = advbw, colour = n)) +
facet_grid(variable ~ .) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
@@ -1106,15 +1031,7 @@ plot_advbwdist_relay <- function(start_p, end_p, n_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_advbwdist_relay <- function(start_p = NULL, end_p = NULL, n_p = NULL,
- path_p) {
- prepare_advbwdist_relay(start_p, end_p, n_p) %>%
- spread(variable, advbw) %>%
- rename(n = relay) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_hidserv_dir_onions_seen <- function(start_p, end_p) {
+prepare_hidserv_dir_onions_seen <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -1135,13 +1052,7 @@ plot_hidserv_dir_onions_seen <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_hidserv_dir_onions_seen <- function(start_p = NULL, end_p = NULL,
- path_p) {
- prepare_hidserv_dir_onions_seen(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_hidserv_rend_relayed_cells <- function(start_p, end_p) {
+prepare_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -1164,13 +1075,7 @@ plot_hidserv_rend_relayed_cells <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL,
- path_p) {
- prepare_hidserv_rend_relayed_cells(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_webstats_tb <- function(start_p, end_p) {
+prepare_webstats_tb <- function(start_p = NULL, end_p = NULL) {
read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
col_types = cols(
log_date = col_date(format = ""),
@@ -1184,17 +1089,22 @@ prepare_webstats_tb <- function(start_p, end_p) {
filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
filter(request_type %in% c("tbid", "tbsd", "tbup", "tbur")) %>%
group_by(log_date, request_type) %>%
- summarize(count = sum(count))
+ summarize(count = sum(count)) %>%
+ spread(request_type, count) %>%
+ rename(date = log_date, initial_downloads = tbid,
+ signature_downloads = tbsd, update_pings = tbup,
+ update_requests = tbur)
}
plot_webstats_tb <- function(start_p, end_p, path_p) {
- d <- prepare_webstats_tb(start_p, end_p)
- levels(d$request_type) <- list(
- "Initial downloads" = "tbid",
- "Signature downloads" = "tbsd",
- "Update pings" = "tbup",
- "Update requests" = "tbur")
- ggplot(d, aes(x = log_date, y = count)) +
+ prepare_webstats_tb(start_p, end_p) %>%
+ gather(request_type, count, -date) %>%
+ mutate(request_type = factor(request_type,
+ levels = c("initial_downloads", "signature_downloads", "update_pings",
+ "update_requests"),
+ labels = c("Initial downloads", "Signature downloads", "Update pings",
+ "Update requests"))) %>%
+ ggplot(aes(x = date, y = count)) +
geom_point() +
geom_line() +
facet_grid(request_type ~ ., scales = "free_y") +
@@ -1208,16 +1118,7 @@ plot_webstats_tb <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
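The inline factor(levels = , labels = ) call replaces the previous levels(d$request_type) <- list(...) assignment: it relabels the gathered character column and pins the level order, which is what facet_grid() uses for panel order. In base R terms:

x <- c("tbup", "tbid", "tbup")
factor(x, levels = c("tbid", "tbup"),
  labels = c("Initial downloads", "Update pings"))
# [1] Update pings      Initial downloads Update pings
# Levels: Initial downloads Update pings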
-write_webstats_tb <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_webstats_tb(start_p, end_p) %>%
- rename(date = log_date) %>%
- spread(request_type, count) %>%
- rename(initial_downloads = tbid, signature_downloads = tbsd,
- update_pings = tbup, update_requests = tbur) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_webstats_tb_platform <- function(start_p, end_p) {
+prepare_webstats_tb_platform <- function(start_p = NULL, end_p = NULL) {
read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
col_types = cols(
log_date = col_date(format = ""),
@@ -1231,15 +1132,18 @@ prepare_webstats_tb_platform <- function(start_p, end_p) {
filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
filter(request_type %in% c("tbid", "tbup")) %>%
group_by(log_date, platform, request_type) %>%
- summarize(count = sum(count))
+ summarize(count = sum(count)) %>%
+ spread(request_type, count, fill = 0) %>%
+ rename(date = log_date, initial_downloads = tbid, update_pings = tbup)
}
plot_webstats_tb_platform <- function(start_p, end_p, path_p) {
- d <- prepare_webstats_tb_platform(start_p, end_p)
- levels(d$request_type) <- list(
- "Initial downloads" = "tbid",
- "Update pings" = "tbup")
- ggplot(d, aes(x = log_date, y = count, colour = platform)) +
+ prepare_webstats_tb_platform(start_p, end_p) %>%
+ gather(request_type, count, -c(date, platform)) %>%
+ mutate(request_type = factor(request_type,
+ levels = c("initial_downloads", "update_pings"),
+ labels = c("Initial downloads", "Update pings"))) %>%
+ ggplot(aes(x = date, y = count, colour = platform)) +
geom_point() +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
@@ -1257,15 +1161,7 @@ plot_webstats_tb_platform <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_webstats_tb_platform <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_webstats_tb_platform(start_p, end_p) %>%
- rename(date = log_date) %>%
- spread(request_type, count, fill = 0) %>%
- rename(initial_downloads = tbid, update_pings = tbup) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_webstats_tb_locale <- function(start_p, end_p) {
+prepare_webstats_tb_locale <- function(start_p = NULL, end_p = NULL) {
read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
col_types = cols(
log_date = col_date(format = ""),
@@ -1320,12 +1216,7 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_webstats_tb_locale(start_p, end_p) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_webstats_tm <- function(start_p, end_p) {
+prepare_webstats_tm <- function(start_p = NULL, end_p = NULL) {
read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
col_types = cols(
log_date = col_date(format = ""),
@@ -1339,15 +1230,19 @@ prepare_webstats_tm <- function(start_p, end_p) {
filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
filter(request_type %in% c("tmid", "tmup")) %>%
group_by(log_date, request_type) %>%
- summarize(count = sum(count))
+ summarize(count = sum(count)) %>%
+ mutate(request_type = factor(request_type, levels = c("tmid", "tmup"))) %>%
+ spread(request_type, count, drop = FALSE) %>%
+ rename(date = log_date, initial_downloads = tmid, update_pings = tmup)
}
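The factor()/drop = FALSE step above is what keeps both columns present even when the selected date range contains only one request type; without it, spread() would silently omit the missing column. A small sketch with made-up data:

library(dplyr)
library(tidyr)

d <- tibble(log_date = as.Date("2019-01-01"),
  request_type = "tmid", count = 42)

# Without declared levels, spread() only creates columns for values that
# actually occur, so there would be no "tmup" column here.
d %>% spread(request_type, count)

# Declaring both levels and spreading with drop = FALSE keeps the "tmup"
# column (filled with NA) even though no such rows exist.
d %>%
  mutate(request_type = factor(request_type, levels = c("tmid", "tmup"))) %>%
  spread(request_type, count, drop = FALSE)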
plot_webstats_tm <- function(start_p, end_p, path_p) {
- d <- prepare_webstats_tm(start_p, end_p)
- levels(d$request_type) <- list(
- "Initial downloads" = "tmid",
- "Update pings" = "tmup")
- ggplot(d, aes(x = log_date, y = count)) +
+ prepare_webstats_tm(start_p, end_p) %>%
+ gather(request_type, count, -date) %>%
+ mutate(request_type = factor(request_type,
+ levels = c("initial_downloads", "update_pings"),
+ labels = c("Initial downloads", "Update pings"))) %>%
+ ggplot(aes(x = date, y = count)) +
geom_point() +
geom_line() +
facet_grid(request_type ~ ., scales = "free_y") +
@@ -1361,16 +1256,7 @@ plot_webstats_tm <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_webstats_tm <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_webstats_tm(start_p, end_p) %>%
- rename(date = log_date) %>%
- mutate(request_type = factor(request_type, levels = c("tmid", "tmup"))) %>%
- spread(request_type, count, drop = FALSE) %>%
- rename(initial_downloads = tmid, update_pings = tmup) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_relays_ipv6 <- function(start_p, end_p) {
+prepare_relays_ipv6 <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
colClasses = c("valid_after_date" = "Date")) %>%
filter(if (!is.null(start_p))
@@ -1385,12 +1271,15 @@ prepare_relays_ipv6 <- function(start_p, end_p) {
exiting = sum(server_count_sum_avg[exiting_ipv6_relay == "t"])) %>%
complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
gather(total, announced, reachable, exiting, key = "category",
- value = "count")
+ value = "count") %>%
+ rename(date = valid_after_date) %>%
+ spread(category, count)
}
plot_relays_ipv6 <- function(start_p, end_p, path_p) {
prepare_relays_ipv6(start_p, end_p) %>%
- ggplot(aes(x = valid_after_date, y = count, colour = category)) +
+ gather(category, count, -date) %>%
+ ggplot(aes(x = date, y = count, colour = category)) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1405,14 +1294,7 @@ plot_relays_ipv6 <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_relays_ipv6 <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_relays_ipv6(start_p, end_p) %>%
- rename(date = valid_after_date) %>%
- spread(category, count) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_bridges_ipv6 <- function(start_p, end_p) {
+prepare_bridges_ipv6 <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
colClasses = c("valid_after_date" = "Date")) %>%
filter(if (!is.null(start_p))
@@ -1424,12 +1306,13 @@ prepare_bridges_ipv6 <- function(start_p, end_p) {
summarize(total = sum(server_count_sum_avg),
announced = sum(server_count_sum_avg[announced_ipv6 == "t"])) %>%
complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
- gather(total, announced, key = "category", value = "count")
+ rename(date = valid_after_date)
}
plot_bridges_ipv6 <- function(start_p, end_p, path_p) {
prepare_bridges_ipv6(start_p, end_p) %>%
- ggplot(aes(x = valid_after_date, y = count, colour = category)) +
+ gather(category, count, -date) %>%
+ ggplot(aes(x = date, y = count, colour = category)) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1443,14 +1326,7 @@ plot_bridges_ipv6 <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_bridges_ipv6 <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_bridges_ipv6(start_p, end_p) %>%
- rename(date = valid_after_date) %>%
- spread(category, count) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_advbw_ipv6 <- function(start_p, end_p) {
+prepare_advbw_ipv6 <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
colClasses = c("valid_after_date" = "Date")) %>%
filter(if (!is.null(start_p))
@@ -1458,6 +1334,8 @@ prepare_advbw_ipv6 <- function(start_p, end_p) {
filter(if (!is.null(end_p))
valid_after_date <= as.Date(end_p) else TRUE) %>%
filter(server == "relay") %>%
+ mutate(advertised_bandwidth_bytes_sum_avg =
+ advertised_bandwidth_bytes_sum_avg * 8 / 1e9) %>%
group_by(valid_after_date) %>%
summarize(total = sum(advertised_bandwidth_bytes_sum_avg),
total_guard = sum(advertised_bandwidth_bytes_sum_avg[guard_relay != "f"]),
@@ -1469,14 +1347,13 @@ prepare_advbw_ipv6 <- function(start_p, end_p) {
exiting = sum(advertised_bandwidth_bytes_sum_avg[
exiting_ipv6_relay != "f"])) %>%
complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
- gather(total, total_guard, total_exit, reachable_guard, reachable_exit,
- exiting, key = "category", value = "advbw") %>%
- mutate(advbw = advbw * 8 / 1e9)
+ rename(date = valid_after_date)
}
plot_advbw_ipv6 <- function(start_p, end_p, path_p) {
prepare_advbw_ipv6(start_p, end_p) %>%
- ggplot(aes(x = valid_after_date, y = advbw, colour = category)) +
+ gather(category, advbw, -date) %>%
+ ggplot(aes(x = date, y = advbw, colour = category)) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1494,14 +1371,7 @@ plot_advbw_ipv6 <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_advbw_ipv6 <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_advbw_ipv6(start_p, end_p) %>%
- rename(date = valid_after_date) %>%
- spread(category, advbw) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
-
-prepare_totalcw <- function(start_p, end_p) {
+prepare_totalcw <- function(start_p = NULL, end_p = NULL) {
read.csv(paste(stats_dir, "totalcw.csv", sep = ""),
colClasses = c("valid_after_date" = "Date", "nickname" = "character")) %>%
filter(if (!is.null(start_p))
@@ -1509,7 +1379,9 @@ prepare_totalcw <- function(start_p, end_p) {
filter(if (!is.null(end_p))
valid_after_date <= as.Date(end_p) else TRUE) %>%
group_by(valid_after_date, nickname) %>%
- summarize(measured_sum_avg = sum(measured_sum_avg))
+ summarize(measured_sum_avg = sum(measured_sum_avg)) %>%
+ rename(date = valid_after_date, totalcw = measured_sum_avg) %>%
+ arrange(date, nickname)
}
plot_totalcw <- function(start_p, end_p, path_p) {
@@ -1517,10 +1389,8 @@ plot_totalcw <- function(start_p, end_p, path_p) {
mutate(nickname = ifelse(nickname == "", "consensus", nickname)) %>%
mutate(nickname = factor(nickname,
levels = c("consensus", unique(nickname[nickname != "consensus"])))) %>%
- complete(valid_after_date = full_seq(valid_after_date, period = 1),
- nesting(nickname)) %>%
- ggplot(aes(x = valid_after_date, y = measured_sum_avg,
- colour = nickname)) +
+ complete(date = full_seq(date, period = 1), nesting(nickname)) %>%
+ ggplot(aes(x = date, y = totalcw, colour = nickname)) +
geom_line(na.rm = TRUE) +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1531,10 +1401,4 @@ plot_totalcw <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-write_totalcw <- function(start_p = NULL, end_p = NULL, path_p) {
- prepare_totalcw(start_p, end_p) %>%
- rename(date = valid_after_date, totalcw = measured_sum_avg) %>%
- arrange(date, nickname) %>%
- write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
-}
diff --git a/src/main/java/org/torproject/metrics/web/RObjectGenerator.java b/src/main/java/org/torproject/metrics/web/RObjectGenerator.java
index a529830..6a142e8 100644
--- a/src/main/java/org/torproject/metrics/web/RObjectGenerator.java
+++ b/src/main/java/org/torproject/metrics/web/RObjectGenerator.java
@@ -122,7 +122,7 @@ public class RObjectGenerator implements ServletContextListener {
StringBuilder queryBuilder = new StringBuilder();
queryBuilder.append("robust_call(as.call(list(");
if ("csv".equalsIgnoreCase(fileType)) {
- queryBuilder.append("write_");
+ queryBuilder.append("write_data, prepare_");
/* When we checked parameters above we also put in defaults for missing
* parameters. This is okay for graphs, but we want to support CSV files
* with empty parameters, so we use the parameters as we got them here. */
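This is the counterpart of dropping the per-graph write_* functions from graphs.R: the generated R call now composes a single generic writer with the matching prepare_* function. The writer itself is not part of this hunk; a sketch of what it plausibly looks like, given that every removed write_* body ended in the same write.csv() call:

# Assumed shape of the generic writer (not shown in this patch): call the
# given prepare_* function with everything except path_p, then serialize.
write_data <- function(prepare_function, ..., path_p) {
  write.csv(prepare_function(...), path_p, quote = FALSE,
    row.names = FALSE, na = "")
}

# The servlet-generated call then amounts to, e.g.:
# write_data(prepare_totalcw, start_p = NULL, end_p = NULL, path_p = <path>)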
[metrics-web/release] Remove Torperf/OnionPerf plots with all sources.
by karsten@torproject.org 09 Nov '19
commit c39472548511175a6eaa0d67de62d3b5fa59dbe3
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Wed Dec 5 11:56:19 2018 +0100
Remove Torperf/OnionPerf plots with all sources.
OnionPerf results look to be comparable over time, but there are
systematic deltas between the results from different vantage points.
The "all" plots show rises and falls that don't actually exist: when
a particular vantage point goes offline, the average of the remaining
ones moves noticeably.
In this commit we remove the source parameter from these graphs and
always include all sources separately in the graph, rather than a
combination of all measurements together.
Implements #28603.
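The effect is easy to reproduce: average two sources that differ by a constant offset and let one of them drop out for a few days; the combined series jumps even though neither source changed. With invented numbers:

# Two vantage points with a systematic delta but no change over time.
op_nl <- rep(2.0, 10)   # median download time in seconds
op_us <- rep(4.0, 10)
op_us[4:6] <- NA        # vantage point offline for three days

# The "all" series averages whatever sources reported on each day.
rowMeans(cbind(op_nl, op_us), na.rm = TRUE)
# Days 4 to 6 drop from 3.0 to 2.0: an apparent speed-up that never happened.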
---
src/main/R/rserver/graphs.R | 178 +++++++++------------
.../metrics/web/GraphParameterChecker.java | 24 ---
.../org/torproject/metrics/web/GraphServlet.java | 8 -
src/main/resources/web/json/metrics.json | 7 +-
src/main/resources/web/jsps/graph.jsp | 9 --
5 files changed, 76 insertions(+), 150 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index e541c30..1f7309b 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -592,70 +592,49 @@ write_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL,
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-plot_torperf <- function(start_p, end_p, source_p, server_p, filesize_p,
- path_p) {
- filesize_val <- ifelse(filesize_p == "50kb", 50 * 1024,
- ifelse(filesize_p == "1mb", 1024 * 1024, 5 * 1024 * 1024))
- t <- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
- colClasses = c("date" = "Date", "source" = "character"))
- known_sources <- c("all", unique(t[t$source != "", "source"]))
- colours <- data.frame(source = known_sources,
- colour = brewer.pal(length(known_sources), "Paired"),
- stringsAsFactors = FALSE)
- colour <- colours[colours$source == source_p, "colour"]
- filesizes <- data.frame(filesizes = c("5mb", "1mb", "50kb"),
- label = c("5 MiB", "1 MiB", "50 KiB"), stringsAsFactors = FALSE)
- filesize_str <- filesizes[filesizes$filesize == filesize_p, "label"]
- t[t$date >= as.Date(start_p) & t$date <= as.Date(end_p) &
- t$filesize == filesize_val &
- t$source == ifelse(source_p == "all", "", source_p) &
- t$server == server_p, ] %>%
- transmute(date, q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>%
- complete(date = full_seq(date, period = 1)) %>%
- ggplot(aes(x = date, y = md, fill = "line")) +
- geom_line(colour = colour, size = 0.75) +
- geom_ribbon(aes(x = date, ymin = q1, ymax = q3, fill = "ribbon")) +
+prepare_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
+ read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
+ colClasses = c("date" = "Date", "source" = "character")) %>%
+ filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
+ filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
+ filter(if (!is.null(server_p)) server == server_p else TRUE) %>%
+ filter(if (!is.null(filesize_p))
+ filesize == ifelse(filesize_p == "50kb", 50 * 1024,
+ ifelse(filesize_p == "1mb", 1024 * 1024, 5 * 1024 * 1024)) else
+ TRUE) %>%
+ transmute(date, filesize, source, server, q1 = q1 / 1e3, md = md / 1e3,
+ q3 = q3 / 1e3)
+}
+
+plot_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
+ prepare_torperf(start_p, end_p, server_p, filesize_p, path_p) %>%
+ filter(source != "") %>%
+ complete(date = full_seq(date, period = 1), nesting(source)) %>%
+ ggplot(aes(x = date, y = md, ymin = q1, ymax = q3, fill = source)) +
+ geom_ribbon(alpha = 0.5) +
+ geom_line(aes(colour = source), size = 0.75) +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
scale_y_continuous(name = "", labels = unit_format(unit = "s"),
limits = c(0, NA)) +
- scale_fill_manual(name = paste("Measured times on",
- ifelse(source_p == "all", "all sources", source_p), "per day"),
- breaks = c("line", "ribbon"),
- labels = c("Median", "1st to 3rd quartile"),
- values = paste(colour, c("", "66"), sep = "")) +
- ggtitle(paste("Time to complete", filesize_str,
+ scale_fill_hue(name = "Source") +
+ scale_colour_hue(name = "Source") +
+ ggtitle(paste("Time to complete",
+ ifelse(filesize_p == "50kb", "50 KiB",
+ ifelse(filesize_p == "1mb", "1 MiB", "5 MiB")),
"request to", server_p, "server")) +
labs(caption = copyright_notice) +
theme(legend.position = "top")
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
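A note on the complete(date = full_seq(date, period = 1), nesting(source)) step: geom_line() connects consecutive points, so without explicit NA rows for days on which a source did not report, the plot would draw a line straight across the gap. With toy data:

library(dplyr)
library(tidyr)

d <- tibble(date = as.Date(c("2018-12-01", "2018-12-04")),
  source = "op-nl", md = c(2.1, 2.3))

# Fills in Dec 2 and 3 with md = NA for each source, so geom_line()
# leaves a visible gap instead of drawing a line across missing days.
d %>% complete(date = full_seq(date, period = 1), nesting(source))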
-# Ideally, this function would share code with plot_torperf by using a
-# common prepare_torperf function. This just turned out to be a bit
-# harder than for other functions, because plot_torperf uses different
-# colours based on which sources exist, unrelated to which source is
-# plotted. Left as future work.
-write_torperf <- function(start_p = NULL, end_p = NULL, source_p = NULL,
- server_p = NULL, filesize_p = NULL, path_p) {
- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
- filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
- filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(if (!is.null(source_p))
- source == ifelse(source_p == "all", "", source_p) else TRUE) %>%
- filter(if (!is.null(server_p)) server == server_p else TRUE) %>%
- filter(if (!is.null(filesize_p))
- filesize == ifelse(filesize_p == "50kb", 50 * 1024,
- ifelse(filesize_p == "1mb", 1024 * 1024, 5 * 1024 * 1024)) else
- TRUE) %>%
- transmute(date, filesize, source, server, q1 = q1 / 1e3, md = md / 1e3,
- q3 = q3 / 1e3) %>%
+write_torperf <- function(start_p = NULL, end_p = NULL, server_p = NULL,
+ filesize_p = NULL, path_p) {
+ prepare_torperf(start_p, end_p, server_p, filesize_p, path_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-prepare_torperf_failures <- function(start_p, end_p, source_p, server_p,
- filesize_p) {
+prepare_torperf_failures <- function(start_p, end_p, server_p, filesize_p) {
read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
@@ -664,31 +643,29 @@ prepare_torperf_failures <- function(start_p, end_p, source_p, server_p,
filesize == ifelse(filesize_p == "50kb", 50 * 1024,
ifelse(filesize_p == "1mb", 1024 * 1024, 5 * 1024 * 1024)) else
TRUE) %>%
- filter(if (!is.null(source_p))
- source == ifelse(source_p == "all", "", source_p) else TRUE) %>%
filter(if (!is.null(server_p)) server == server_p else TRUE) %>%
filter(requests > 0) %>%
transmute(date, filesize, source, server, timeouts = timeouts / requests,
failures = failures / requests)
}
-plot_torperf_failures <- function(start_p, end_p, source_p, server_p,
- filesize_p, path_p) {
- filesizes <- data.frame(filesizes = c("5mb", "1mb", "50kb"),
- label = c("5 MiB", "1 MiB", "50 KiB"), stringsAsFactors = FALSE)
- filesize_str <- filesizes[filesizes$filesize == filesize_p, "label"]
- prepare_torperf_failures(start_p, end_p, source_p, server_p, filesize_p) %>%
+plot_torperf_failures <- function(start_p, end_p, server_p, filesize_p,
+ path_p) {
+ prepare_torperf_failures(start_p, end_p, server_p, filesize_p) %>%
+ filter(source != "") %>%
gather(variable, value, -c(date, filesize, source, server)) %>%
- ggplot(aes(x = date, y = value, colour = variable)) +
- geom_point(size = 2) +
+ mutate(variable = factor(variable, levels = c("timeouts", "failures"),
+ labels = c("Timeouts", "Failures"))) %>%
+ ggplot(aes(x = date, y = value, colour = source)) +
+ geom_point(size = 2, alpha = 0.5) +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
scale_y_continuous(name = "", labels = percent, limits = c(0, NA)) +
- scale_colour_hue(name = paste("Problems encountered on",
- ifelse(source_p == "all", "all sources", source_p)),
- h.start = 45, breaks = c("timeouts", "failures"),
- labels = c("Timeouts", "Failures")) +
- ggtitle(paste("Timeouts and failures of", filesize_str,
+ scale_colour_hue(name = "Source") +
+ facet_grid(variable ~ .) +
+ ggtitle(paste("Timeouts and failures of",
+ ifelse(filesize_p == "50kb", "50 KiB",
+ ifelse(filesize_p == "1mb", "1 MiB", "5 MiB")),
"requests to", server_p, "server")) +
labs(caption = copyright_notice) +
theme(legend.position = "top")
@@ -696,81 +673,74 @@ plot_torperf_failures <- function(start_p, end_p, source_p, server_p,
}
write_torperf_failures <- function(start_p = NULL, end_p = NULL,
- source_p = NULL, server_p = NULL, filesize_p = NULL, path_p) {
- prepare_torperf_failures(start_p, end_p, source_p, server_p, filesize_p) %>%
+ server_p = NULL, filesize_p = NULL, path_p) {
+ prepare_torperf_failures(start_p, end_p, server_p, filesize_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-prepare_onionperf_buildtimes <- function(start_p, end_p, source_p) {
+prepare_onionperf_buildtimes <- function(start_p, end_p) {
read.csv(paste(stats_dir, "buildtimes.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
- filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(if (!is.null(source_p))
- source == ifelse(source_p == "all", "", source_p) else TRUE)
+ filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
}
-write_onionperf_buildtimes <- function(start_p = NULL, end_p = NULL,
- source_p = NULL, path_p) {
- prepare_onionperf_buildtimes(start_p, end_p, source_p) %>%
+write_onionperf_buildtimes <- function(start_p = NULL, end_p = NULL, path_p) {
+ prepare_onionperf_buildtimes(start_p, end_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-plot_onionperf_buildtimes <- function(start_p, end_p, source_p, path_p) {
- prepare_onionperf_buildtimes(start_p, end_p, source_p) %>%
+plot_onionperf_buildtimes <- function(start_p, end_p, path_p) {
+ prepare_onionperf_buildtimes(start_p, end_p) %>%
+ filter(source != "") %>%
mutate(date = as.Date(date),
position = factor(position, levels = seq(1, 3, 1),
labels = c("1st hop", "2nd hop", "3rd hop"))) %>%
- ggplot(aes(x = date, y = md, colour = position, fill = position)) +
- geom_line(size = 0.75) +
- geom_ribbon(aes(x = as.Date(date), ymin = q1, ymax = q3, alpha = 0.5),
- show.legend = FALSE) +
+ complete(date = full_seq(date, period = 1), nesting(source, position)) %>%
+ ggplot(aes(x = date, y = md, ymin = q1, ymax = q3, fill = source)) +
+ geom_ribbon(alpha = 0.5) +
+ geom_line(aes(colour = source), size = 0.75) +
+ facet_grid(position ~ .) +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
scale_y_continuous(name = "", labels = unit_format(unit = "ms"),
limits = c(0, NA)) +
- scale_colour_hue(name = "Medians and interquartile ranges") +
- scale_fill_hue(name = "Medians and interquartile ranges") +
- ggtitle(ifelse(source_p == "all", "Circuit build times on all sources",
- paste("Circuit build times on", source_p))) +
+ scale_fill_hue(name = "Source") +
+ scale_colour_hue(name = "Source") +
+ ggtitle("Circuit build times") +
labs(caption = copyright_notice) +
theme(legend.position = "top")
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-prepare_onionperf_latencies <- function(start_p, end_p, source_p) {
+prepare_onionperf_latencies <- function(start_p, end_p, server_p) {
read.csv(paste(stats_dir, "latencies.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(if (!is.null(source_p))
- source == ifelse(source_p == "all", "", source_p) else TRUE)
+ filter(if (!is.null(server_p)) server == server_p else TRUE)
}
write_onionperf_latencies <- function(start_p = NULL, end_p = NULL,
- source_p = NULL, path_p) {
- prepare_onionperf_latencies(start_p, end_p, source_p) %>%
+ server_p = NULL, path_p) {
+ prepare_onionperf_latencies(start_p, end_p, server_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-plot_onionperf_latencies <- function(start_p, end_p, source_p, path_p) {
- prepare_onionperf_latencies(start_p, end_p, source_p) %>%
- mutate(date = as.Date(date),
- server = factor(server, levels = c("public", "onion"),
- labels = c("public server", "onion server"))) %>%
- ggplot(aes(x = date, y = md, colour = server, fill = server)) +
- geom_line(size = 0.75) +
- geom_ribbon(aes(x = as.Date(date), ymin = q1, ymax = q3, alpha = 0.5),
- show.legend = FALSE) +
+plot_onionperf_latencies <- function(start_p, end_p, server_p, path_p) {
+ prepare_onionperf_latencies(start_p, end_p, server_p) %>%
+ filter(source != "") %>%
+ complete(date = full_seq(date, period = 1), nesting(source)) %>%
+ ggplot(aes(x = date, y = md, ymin = q1, ymax = q3, fill = source)) +
+ geom_ribbon(alpha = 0.5) +
+ geom_line(aes(colour = source), size = 0.75) +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
scale_y_continuous(name = "", labels = unit_format(unit = "ms"),
limits = c(0, NA)) +
- scale_colour_hue(name = "Medians and interquartile ranges") +
- scale_fill_hue(name = "Medians and interquartile ranges") +
- ggtitle(ifelse(source_p == "all",
- "Circuit round-trip latencies on all sources",
- paste("Circuit round-trip latencies on", source_p))) +
+ scale_fill_hue(name = "Source") +
+ scale_colour_hue(name = "Source") +
+ ggtitle(paste("Circuit round-trip latencies to", server_p, "server")) +
labs(caption = copyright_notice) +
theme(legend.position = "top")
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
diff --git a/src/main/java/org/torproject/metrics/web/GraphParameterChecker.java b/src/main/java/org/torproject/metrics/web/GraphParameterChecker.java
index 2168ab5..ac642e9 100644
--- a/src/main/java/org/torproject/metrics/web/GraphParameterChecker.java
+++ b/src/main/java/org/torproject/metrics/web/GraphParameterChecker.java
@@ -61,8 +61,6 @@ public class GraphParameterChecker {
}
this.knownParameterValues.put("country", sb.toString());
this.knownParameterValues.put("events", "on,off,points");
- this.knownParameterValues.put("source", "all,siv,moria,torperf,op-hk,"
- + "op-nl,op-us");
this.knownParameterValues.put("server", "public,onion");
this.knownParameterValues.put("filesize", "50kb,1mb,5mb");
this.knownParameterValues.put("transport", "obfs2,obfs3,obfs4,"
@@ -199,28 +197,6 @@ public class GraphParameterChecker {
recognizedGraphParameters.put("events", eventsParameter);
}
- /* Parse torperf data source if supported by the graph type. Only a
- * single source can be passed. If no source is passed, use "torperf"
- * as default. */
- if (supportedGraphParameters.contains("source")) {
- String[] sourceParameter = (String[]) requestParameters.get(
- "source");
- List<String> knownSources = Arrays.asList(
- this.knownParameterValues.get("source").split(","));
- if (sourceParameter != null) {
- if (sourceParameter.length != 1) {
- return null;
- }
- if (sourceParameter[0].length() == 0
- || !knownSources.contains(sourceParameter[0])) {
- return null;
- }
- } else {
- sourceParameter = new String[] { "all" };
- }
- recognizedGraphParameters.put("source", sourceParameter);
- }
-
/* Parse onionperf server if supported by the graph type. Only a single
* server can be passed. If no server is passed, use "public" as default. */
if (supportedGraphParameters.contains("server")) {
diff --git a/src/main/java/org/torproject/metrics/web/GraphServlet.java b/src/main/java/org/torproject/metrics/web/GraphServlet.java
index 2f35320..17d9309 100644
--- a/src/main/java/org/torproject/metrics/web/GraphServlet.java
+++ b/src/main/java/org/torproject/metrics/web/GraphServlet.java
@@ -103,14 +103,6 @@ public class GraphServlet extends MetricServlet {
this.defaultParameters.put("version", new String[][] {
{ "v4", " selected", "IPv4" },
{ "v6", "", "IPv6" } });
- this.defaultParameters.put("source", new String[][] {
- { "all", " checked" },
- { "torperf", "" },
- { "moria", "" },
- { "siv", "" },
- { "op-hk", "" },
- { "op-nl", "" },
- { "op-us", "" }});
this.defaultParameters.put("server", new String[][] {
{ "public", " checked" },
{ "onion", "" }});
diff --git a/src/main/resources/web/json/metrics.json b/src/main/resources/web/json/metrics.json
index 9cb50ad..b351814 100644
--- a/src/main/resources/web/json/metrics.json
+++ b/src/main/resources/web/json/metrics.json
@@ -290,7 +290,6 @@
"parameters": [
"start",
"end",
- "source",
"server",
"filesize"
]
@@ -304,7 +303,6 @@
"parameters": [
"start",
"end",
- "source",
"server",
"filesize"
]
@@ -317,8 +315,7 @@
"function": "onionperf_buildtimes",
"parameters": [
"start",
- "end",
- "source"
+ "end"
]
},
{
@@ -330,7 +327,7 @@
"parameters": [
"start",
"end",
- "source"
+ "server"
]
},
{
diff --git a/src/main/resources/web/jsps/graph.jsp b/src/main/resources/web/jsps/graph.jsp
index c30481f..e710d2c 100644
--- a/src/main/resources/web/jsps/graph.jsp
+++ b/src/main/resources/web/jsps/graph.jsp
@@ -122,15 +122,6 @@
</select>
</p>
</c:if>
- <c:if test="${fn:length(source) > 0}">
- <p><b>Source:</b>
- <c:forEach var="row" items="${source}">
- <label class="radio-label">
- <input type="radio" name="source" value="${row[0]}"${row[1]}> ${row[0]}
- </label>
- </c:forEach>
- </p>
- </c:if>
<c:if test="${fn:length(server) > 0}">
<p><b>Server:</b>
<c:forEach var="row" items="${server}">
[metrics-web/release] Document changes to OnionPerf graphs.
by karsten@torproject.org 09 Nov '19
commit ad1221cb980aa5bf3bf075338d2588d803e652c2
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Dec 20 10:03:31 2018 +0100
Document changes to OnionPerf graphs.
Still related to #28603.
---
src/main/resources/web/jsps/stats.jsp | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/src/main/resources/web/jsps/stats.jsp b/src/main/resources/web/jsps/stats.jsp
index e5f9c6a..2ae6726 100644
--- a/src/main/resources/web/jsps/stats.jsp
+++ b/src/main/resources/web/jsps/stats.jsp
@@ -48,7 +48,7 @@ https://metrics.torproject.org/identifier.csv
<li><b>August 15, 2018:</b> Made the first batch of changes to per-graph CSV files.</li>
<li><b>September 15, 2018:</b> Removed all pre-aggregated CSV files.</li>
<li><b>October 28, 2018:</b> Added and/or removed columns to <a href="#webstats-tb-platform">Tor Browser downloads and updates by platform</a> and <a href="#webstats-tb-locale">Tor Browser downloads and updates by locale</a> graphs.</li>
-<li><b>December 20, 2018 (scheduled):</b> Remove source parameters and output rows with aggregates over all sources from <a href="#torperf">Time to download files over Tor</a>, <a href="#torperf-failures">Timeouts and failures of downloading files over Tor</a>, <a href="#onionperf-buildtimes">Circuit build times</a>, <a href="#onionperf-latencies">Circuit round-trip latencies</a> graphs.</li>
+<li><b>December 20, 2018:</b> Removed source parameters and output rows with aggregates over all sources from <a href="#torperf">Time to download files over Tor</a>, <a href="#torperf-failures">Timeouts and failures of downloading files over Tor</a>, <a href="#onionperf-buildtimes">Circuit build times</a>, <a href="#onionperf-latencies">Circuit round-trip latencies</a> graphs.</li>
<li><b>December 20, 2018 (scheduled):</b> Remove two graphs <a href="#bandwidth">Total relay bandwidth</a> and <a href="#bwhist-flags">Consumed bandwidth by Exit/Guard flag combination</a>, and update the data format of the <a href="#bandwidth-flags">Advertised and consumed bandwidth by relay flag</a> graph to cover all data previously contained in the first two graphs.</li>
</ul>
@@ -536,7 +536,6 @@ Performance <a href="#performance" name="performance" class="anchor">#</a></h2>
<ul>
<li><b>start:</b> First UTC date (YYYY-MM-DD) to include in the file.</li>
<li><b>end:</b> Last UTC date (YYYY-MM-DD) to include in the file.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements, or <b>"all"</b> for measurements performed by any service. <span class="red">This parameter is going to be removed after December 20, 2018.</span></li>
<li><b>server:</b> Either <b>"public"</b> for requests to a server on the public internet, or <b>"onion"</b> for requests to a version 2 onion server.</li>
<li><b>filesize:</b> Size of the downloaded file in bytes, with pre-defined possible values: <b>"50kb"</b>, <b>"1mb"</b>, or <b>"5mb"</b>.</li>
</ul>
@@ -546,7 +545,7 @@ Performance <a href="#performance" name="performance" class="anchor">#</a></h2>
<ul>
<li><b>date:</b> UTC date (YYYY-MM-DD) when download performance was measured.</li>
<li><b>filesize:</b> Size of the downloaded file in bytes.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements. If this column contains the empty string, all measurements are included, regardless of which service performed them. <span class="red">Output rows with aggregates over all sources are going to be removed after December 20, 2018.</span></li>
+<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements.</li>
<li><b>server:</b> Either <b>"public"</b> if the request was made to a server on the public internet, or <b>"onion"</b> if the request was made to a version 2 onion server.</li>
<li><b>q1:</b> First quartile of time in milliseconds until receiving the last byte.</li>
<li><b>md:</b> Median of time in milliseconds until receiving the last byte.</li>
@@ -563,7 +562,6 @@ Performance <a href="#performance" name="performance" class="anchor">#</a></h2>
<ul>
<li><b>start:</b> First UTC date (YYYY-MM-DD) to include in the file.</li>
<li><b>end:</b> Last UTC date (YYYY-MM-DD) to include in the file.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements, or <b>"all"</b> for measurements performed by any service. <span class="red">This parameter is going to be removed after December 20, 2018.</span></li>
<li><b>server:</b> Either <b>"public"</b> for requests to a server on the public internet, or <b>"onion"</b> for requests to a version 2 onion server.</li>
<li><b>filesize:</b> Size of the downloaded file in bytes, with pre-defined possible values: <b>"50kb"</b>, <b>"1mb"</b>, or <b>"5mb"</b>.</li>
</ul>
@@ -573,7 +571,7 @@ Performance <a href="#performance" name="performance" class="anchor">#</a></h2>
<ul>
<li><b>date:</b> UTC date (YYYY-MM-DD) when download performance was measured.</li>
<li><b>filesize:</b> Size of the downloaded file in bytes.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements. If this column contains the empty string, all measurements are included, regardless of which service performed them. <span class="red">Output rows with aggregates over all sources are going to be removed after December 20, 2018.</span></li>
+<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements.</li>
<li><b>server:</b> Either <b>"public"</b> if the request was made to a server on the public internet, or <b>"onion"</b> if the request was made to a version 2 onion server.</li>
<li><b>timeouts:</b> Fraction of requests that timed out when attempting to download the static file over Tor.</li>
<li><b>failures:</b> Fraction of requests that failed when attempting to download the static file over Tor.</li>
@@ -589,14 +587,13 @@ Performance <a href="#performance" name="performance" class="anchor">#</a></h2>
<ul>
<li><b>start:</b> First UTC date (YYYY-MM-DD) to include in the file.</li>
<li><b>end:</b> Last UTC date (YYYY-MM-DD) to include in the file.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements, or <b>"all"</b> for measurements performed by any service. <span class="red">This parameter is going to be removed after December 20, 2018.</span></li>
</ul>
<h4>Columns</h4>
<ul>
<li><b>date:</b> UTC date (YYYY-MM-DD) when download performance was measured.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements. If this column contains the empty string, all measurements are included, regardless of which service performed them. <span class="red">Output rows with aggregates over all sources are going to be removed after December 20, 2018.</span></li>
+<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements.</li>
<li><b>position:</b> Position in the circuit, from first to third hop.</li>
<li><b>q1:</b> First quartile of time in milliseconds until successfully extending the circuit to the given position.</li>
<li><b>md:</b> Median of time in milliseconds until successfully extending the circuit to the given position.</li>
@@ -613,14 +610,13 @@ Performance <a href="#performance" name="performance" class="anchor">#</a></h2>
<ul>
<li><b>start:</b> First UTC date (YYYY-MM-DD) to include in the file.</li>
<li><b>end:</b> Last UTC date (YYYY-MM-DD) to include in the file.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements, or <b>"all"</b> for measurements performed by any service. <span class="red">This parameter is going to be removed after December 20, 2018.</span></li>
</ul>
<h4>Columns</h4>
<ul>
<li><b>date:</b> UTC date (YYYY-MM-DD) when download performance was measured.</li>
-<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements. If this column contains the empty string, all measurements are included, regardless of which service performed them. <span class="red">Output rows with aggregates over all sources are going to be removed after December 20, 2018.</span></li>
+<li><b>source:</b> Name of the OnionPerf or Torperf service performing measurements.</li>
<li><b>server:</b> Either <b>"public"</b> if the request was made to a server on the public internet, or <b>"onion"</b> if the request was made to a version 2 onion server.</li>
<li><b>q1:</b> First quartile of time in milliseconds between sending the HTTP request and receiving the HTTP response header.</li>
<li><b>md:</b> Median of time in milliseconds between sending the HTTP request and receiving the HTTP response header.</li>
[metrics-web/release] Rewrite censorship detector in Java.
by karsten@torproject.org 09 Nov '19
commit a367168a782e864bdacb610857b1dc5d58fd192d
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Sun Dec 9 12:02:42 2018 +0100
Rewrite censorship detector in Java.
This allows us to remove the last remaining Python parts from the daily
updater.
Implements #21588.
---
build.xml | 26 --
.../torproject/metrics/stats/clients/Detector.java | 433 +++++++++++++++++++++
.../org/torproject/metrics/stats/clients/Main.java | 5 +
src/main/python/clients/country_info.py | 255 ------------
src/main/python/clients/detector.py | 242 ------------
5 files changed, 438 insertions(+), 523 deletions(-)
diff --git a/build.xml b/build.xml
index 6736e19..93eda7b 100644
--- a/build.xml
+++ b/build.xml
@@ -23,7 +23,6 @@
<property name="tardepends" value="war" />
<property name="Rsources" value="${basedir}/src/main/R" />
- <property name="pysources" value="${basedir}/src/main/python" />
<property name="specdir" value="${basedir}/generated/spec" />
@@ -360,32 +359,7 @@
<target name="clients" >
<property name="module.name" value="clients" />
- <property name="localmoddir" value="${modulebase}/${module.name}" />
-
- <property name="statsdir"
- value="${localmoddir}/stats" />
- <mkdir dir="${statsdir}" />
-
<antcall target="run-java" />
-
- <antcall target="run-R" >
- <param name="module.Rscript" value="userstats-detector.R" />
- </antcall>
-
- <exec executable="python"
- dir="${localmoddir}"
- failonerror="true" >
- <arg value="${pysources}/${module.name}/detector.py" />
- <arg value="userstats-detector.csv" />
- <arg value="userstats-ranges.csv" />
- </exec>
-
- <antcall target="run-R" >
- <param name="module.Rscript" value="merge-clients.R" />
- </antcall>
-
- <copy file="${localmoddir}/clients.csv" todir="${statsdir}" />
- <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" />
</target>
<target name="servers" >
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Detector.java b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
new file mode 100644
index 0000000..1a523c2
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
@@ -0,0 +1,433 @@
+/* Copyright 2011 George Danezis <gdane(a)microsoft.com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of <Owner Organization> nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (Clear BSD license:
+ * http://labs.metacarta.com/license-explanation.html#license)
+ *
+ * Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.clients;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.PoissonDistribution;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+/** Censorship detector that reads a .csv file of the number of Tor clients and
+ * finds anomalies that might be indicative of censorship. */
+public class Detector {
+
+ /** Input file. */
+ private static final Path INPUT_PATH = Paths.get("stats", "userstats.csv");
+
+ /** Output file. */
+ private static final Path OUTPUT_PATH = Paths.get("stats", "clients.csv");
+
+ /** Number of largest locations to be included in the detection algorithm. */
+ private static final int NUM_LARGEST_LOCATIONS = 50;
+
+ /** Time interval in days to model connection rates. */
+ private static final int INTERV = 7;
+
+ /** Compound key under which client estimates are stored in both input and
+ * output files. */
+ private static class ClientsKey implements Comparable<ClientsKey> {
+
+ /** Date when clients connected to the Tor network. */
+ private LocalDate date;
+
+ /** Whether clients connected via relays (true) or bridges (false). */
+ private boolean nodeIsRelay;
+
+ /** Two-letter lower-case country code of the country from which clients
+ * connected, "??" if the country could not be resolved, or left empty for
+ * all countries together. */
+ private String country;
+
+ /** Name of the transport used by clients to connect using bridges, or left
+ * empty for all transports together. */
+ private String transport = "";
+
+ /** IP version used by clients to connect using bridges, or left empty for
+ * all IP versions together. */
+ private String version = "";
+
+ ClientsKey(LocalDate date, boolean nodeIsRelay, String country) {
+ this.date = date;
+ this.nodeIsRelay = nodeIsRelay;
+ this.country = country;
+ }
+
+ ClientsKey(LocalDate date, boolean nodeIsRelay, String country,
+ String transport, String version) {
+ this(date, nodeIsRelay, country);
+ this.transport = transport;
+ this.version = version;
+ }
+
+ @Override
+ public int compareTo(ClientsKey other) {
+ if (!this.date.equals(other.date)) {
+ return this.date.compareTo(other.date);
+ } else if (!this.nodeIsRelay && other.nodeIsRelay) {
+ return -1;
+ } else if (this.nodeIsRelay && !other.nodeIsRelay) {
+ return 1;
+ } else if (!this.country.equals(other.country)) {
+ return this.country.compareTo(other.country);
+ } else if (!this.transport.equals(other.transport)) {
+ return this.transport.compareTo(other.transport);
+ } else if (!this.version.equals(other.version)) {
+ return this.version.compareTo(other.version);
+ } else {
+ return 0;
+ }
+ }
+
+ @Override
+ public boolean equals(Object otherObject) {
+ if (!(otherObject instanceof ClientsKey)) {
+ return false;
+ } else {
+ ClientsKey other = (ClientsKey) otherObject;
+ return this.date.equals(other.date)
+ && this.nodeIsRelay == other.nodeIsRelay
+ && this.country.equals(other.country)
+ && this.transport.equals(other.transport)
+ && this.version.equals(other.version);
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return 3 * this.date.hashCode() + (this.nodeIsRelay ? 5 : 0)
+ + 7 * this.country.hashCode() + 11 * this.transport.hashCode()
+ + 13 * this.version.hashCode();
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%s,%s,%s,%s,%s",
+ this.date.toString(), this.nodeIsRelay ? "relay" : "bridge",
+ this.country, this.transport, this.version);
+ }
+ }
+
+ /** Value class that stores everything we already knew about a specific
+ * subset of clients from the input file. */
+ private static class ClientsEstimates {
+
+ /** Estimated number of clients. */
+ private int clients;
+
+ /** Fraction of relays or bridges in percent that the estimate is based on,
+ * between 0 and 100. */
+ private int frac;
+
+ ClientsEstimates(int clients, int frac) {
+ this.clients = clients;
+ this.frac = frac;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%d,%d", this.clients, this.frac);
+ }
+ }
+
+ /** Value class that stores everything we're computing here about a specific
+ * subset of clients from the input file. */
+ private static class ClientsRanges {
+
+ /** Lower number of expected clients under the assumption that there has
+ * been no censorship event, as computed here. */
+ private int lower;
+
+ /** Upper number of expected clients under the assumption that there has
+ * been no release of censorship, as computed here. */
+ private int upper;
+
+ ClientsRanges(int lower, int upper) {
+ this.lower = lower;
+ this.upper = upper;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%d,%d", this.lower, this.upper);
+ }
+ }
+
+ /** Run censorship detection. */
+ public void detect() throws IOException {
+ SortedMap<ClientsKey, ClientsEstimates> estimates = readInputFile();
+ Set<String> largestLocations = findLargestLocations(estimates);
+ Map<LocalDate, List<Double>> ratios = computeRatiosOfLargestLocations(
+ estimates, largestLocations);
+ Map<LocalDate, List<Double>> ratiosWithoutOutliers = removeOutliers(ratios);
+ SortedMap<ClientsKey, ClientsRanges> ranges = computeRanges(estimates,
+ ratiosWithoutOutliers);
+ writeOutputFile(estimates, ranges);
+ }
+
+ /** Read and return the parsed input file containing comma-separated estimates
+ * of client numbers. */
+ private static SortedMap<ClientsKey, ClientsEstimates> readInputFile()
+ throws IOException {
+ SortedMap<ClientsKey, ClientsEstimates> estimates = new TreeMap<>();
+ File inputFile = INPUT_PATH.toFile();
+ if (!inputFile.exists()) {
+ throw new IOException(String.format("Input file %s does not exist.",
+ inputFile));
+ }
+ try (LineNumberReader lnr = new LineNumberReader(
+ new FileReader(inputFile))) {
+ String line = lnr.readLine();
+ if (!"date,node,country,transport,version,frac,users".equals(line)) {
+ throw new IOException(String.format("Unable to read input file %s with "
+ + "unrecognized header line '%s'. Not running detector.", inputFile,
+ line));
+ }
+ while ((line = lnr.readLine()) != null) {
+ ClientsKey key = null;
+ ClientsEstimates value = null;
+ boolean invalidLine = false;
+ String[] lineParts = line.split(",");
+ if (lineParts.length == 7) {
+ try {
+ LocalDate date = LocalDate.parse(lineParts[0]);
+ boolean nodeIsRelay = false;
+ if ("relay".equals(lineParts[1])) {
+ nodeIsRelay = true;
+ } else if (!"bridge".equals(lineParts[1])) {
+ invalidLine = true;
+ }
+ String country = lineParts[2].replaceAll("\"", "");
+ String transport = lineParts[3].replaceAll("\"", "");
+ String version = lineParts[4].replaceAll("\"", "");
+ key = new ClientsKey(date, nodeIsRelay, country, transport,
+ version);
+ } catch (DateTimeParseException e) {
+ invalidLine = true;
+ }
+ try {
+ int frac = Integer.parseInt(lineParts[5]);
+ int clients = Integer.parseInt(lineParts[6]);
+ value = new ClientsEstimates(clients, frac);
+ } catch (NumberFormatException e) {
+ invalidLine = true;
+ }
+ } else {
+ invalidLine = true;
+ }
+ if (invalidLine) {
+ throw new IOException(String.format(
+ "Invalid line %d '%s' in input file %s.", lnr.getLineNumber(),
+ line, inputFile));
+ } else {
+ estimates.put(key, value);
+ }
+ }
+ }
+ return estimates;
+ }
+
+ /** Return the NUM_LARGEST_LOCATIONS countries (except for "??") with the
+ * largest number of estimated clients on the last known date in the input
+ * data set.
+ *
+ * <p>Note that this implies that lower/upper values are going to change,
+ * depending on which countries had most clients on the last known date in the
+ * input data set.</p> */
+ private static Set<String> findLargestLocations(
+ SortedMap<ClientsKey, ClientsEstimates> clients) throws IOException {
+ LocalDate lastKnownDate = clients.keySet().stream()
+ .filter(c -> c.nodeIsRelay)
+ .map(c -> c.date)
+ .max(LocalDate::compareTo)
+ .orElseThrow(() -> new IOException("Unable to find maximum date. Was "
+ + "the input file empty or otherwise corrupt?"));
+ return clients.entrySet().stream()
+ .filter(c -> lastKnownDate.equals(c.getKey().date))
+ .filter(c -> c.getKey().nodeIsRelay)
+ .filter(c -> !"".equals(c.getKey().country))
+ .filter(c -> !"??".equals(c.getKey().country))
+ .sorted((c1, c2) -> Integer.compare(c2.getValue().clients,
+ c1.getValue().clients))
+ .map(c -> c.getKey().country)
+ .limit(NUM_LARGEST_LOCATIONS)
+ .collect(Collectors.toSet());
+ }
+
+ /** Compute the ratio of the client number estimate for a given date and
+ * country as compared to 1 week before, for all dates, for relay users, and
+ * for the largest locations. */
+ private static Map<LocalDate, List<Double>> computeRatiosOfLargestLocations(
+ SortedMap<ClientsKey, ClientsEstimates> estimates,
+ Set<String> largestLocations) {
+ Map<LocalDate, List<Double>> ratios = new HashMap<>();
+ for (Map.Entry<ClientsKey, ClientsEstimates> numerator
+ : estimates.entrySet()) {
+ if (!numerator.getKey().nodeIsRelay
+ || !largestLocations.contains(numerator.getKey().country)) {
+ continue;
+ }
+ ClientsEstimates denominator = estimates.get(new ClientsKey(
+ numerator.getKey().date.minusDays(INTERV), true,
+ numerator.getKey().country));
+ if (null == denominator || denominator.clients == 0) {
+ continue;
+ }
+ if (!ratios.containsKey(numerator.getKey().date)) {
+ ratios.put(numerator.getKey().date, new ArrayList<>());
+ }
+ ratios.get(numerator.getKey().date).add(
+ ((double) numerator.getValue().clients)
+ / (double) denominator.clients);
+ }
+ return ratios;
+ }
+
+ /** Exclude outliers from the given ratios by date that fall outside four
+ * inter-quartile ranges of the median and make sure that at least 8 ratio
+ * values remain. */
+ private static SortedMap<LocalDate, List<Double>> removeOutliers(
+ Map<LocalDate, List<Double>> ratios) {
+ SortedMap<LocalDate, List<Double>> ratiosWithoutOutliers = new TreeMap<>();
+ for (Map.Entry<LocalDate, List<Double>> e : ratios.entrySet()) {
+ double[] values = e.getValue().stream().mapToDouble(Double::doubleValue)
+ .toArray();
+ Percentile percentile = new Percentile()
+ .withEstimationType(Percentile.EstimationType.R_7);
+ percentile.setData(values);
+ double median = percentile.evaluate(50.0);
+ double firstQuarter = percentile.evaluate(25.0);
+ double thirdQuarter = percentile.evaluate(75.0);
+ double interQuartileRange = thirdQuarter - firstQuarter;
+ List<Double> valuesWithoutOutliers = new ArrayList<>();
+ for (double value : values) {
+ if (value > median - 4 * interQuartileRange
+ && value < median + 4 * interQuartileRange) {
+ valuesWithoutOutliers.add(value);
+ }
+ }
+ if (valuesWithoutOutliers.size() < 8) {
+ continue;
+ }
+ LocalDate date = e.getKey();
+ ratiosWithoutOutliers.put(date, valuesWithoutOutliers);
+ }
+ return ratiosWithoutOutliers;
+ }
+
+ /** Compute ranges as the expected minimum and maximum number of users. */
+ private static SortedMap<ClientsKey, ClientsRanges> computeRanges(
+ SortedMap<ClientsKey, ClientsEstimates> estimates,
+ Map<LocalDate, List<Double>> ratiosWithoutOutliers) {
+ SortedMap<ClientsKey, ClientsRanges> ranges = new TreeMap<>();
+ for (Map.Entry<ClientsKey, ClientsEstimates> estimatesEntry
+ : estimates.entrySet()) {
+ LocalDate date = estimatesEntry.getKey().date;
+ if (!estimatesEntry.getKey().nodeIsRelay
+ || "".equals(estimatesEntry.getKey().country)
+ || "??".equals(estimatesEntry.getKey().country)
+ || !ratiosWithoutOutliers.containsKey(date)) {
+ continue;
+ }
+ ClientsEstimates referenceEstimate = estimates.get(
+ new ClientsKey(date.minusDays(INTERV),
+ true, estimatesEntry.getKey().country));
+ if (null == referenceEstimate || referenceEstimate.clients == 0) {
+ continue;
+ }
+ double[] values = ratiosWithoutOutliers.get(date).stream()
+ .mapToDouble(Double::doubleValue).toArray();
+ double mean = new Mean().evaluate(values);
+ double std = new StandardDeviation(false).evaluate(values);
+ NormalDistribution normalDistribution = new NormalDistribution(mean, std);
+ PoissonDistribution poissonDistribution
+ = new PoissonDistribution(referenceEstimate.clients);
+ int lower = Math.max(0,
+ (int) (normalDistribution.inverseCumulativeProbability(0.0001)
+ * poissonDistribution.inverseCumulativeProbability(0.0001)));
+ int upper =
+ (int) (normalDistribution.inverseCumulativeProbability(0.9999)
+ * poissonDistribution.inverseCumulativeProbability(0.9999));
+ ranges.put(estimatesEntry.getKey(), new ClientsRanges(lower, upper));
+ }
+ return ranges;
+ }
+
+ /** Write client number estimates together with lower and upper bounds as
+ * comma-separated values to the output file. */
+ private static void writeOutputFile(
+ SortedMap<ClientsKey, ClientsEstimates> estimates,
+ SortedMap<ClientsKey, ClientsRanges> ranges) throws IOException {
+ try (BufferedWriter bw = new BufferedWriter(
+ new FileWriter(OUTPUT_PATH.toFile()))) {
+ bw.write(
+ "date,node,country,transport,version,lower,upper,clients,frac\n");
+ for (Map.Entry<ClientsKey, ClientsEstimates> e : estimates.entrySet()) {
+ String rangesString = ",";
+ if (ranges.containsKey(e.getKey())) {
+ rangesString = ranges.get(e.getKey()).toString();
+ }
+ bw.write(String.format("%s,%s,%s%n", e.getKey().toString(),
+ rangesString, e.getValue().toString()));
+ }
+ }
+ }
+}
+
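For readers who want to see the range computation in isolation, here is a
minimal, self-contained sketch built on the same Apache Commons Math classes
as the Detector above. The input ratios and the reference count of 10,000
clients are made-up example values, and RangeSketch is a hypothetical class
name, not part of this module.

import java.util.Arrays;

import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.apache.commons.math3.stat.descriptive.moment.Mean;
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class RangeSketch {
  public static void main(String[] args) {
    /* Made-up sample: ratios of clients on day d to clients on day
     * d - INTERV, observed across several countries, plus one obvious
     * outlier. */
    double[] ratios = {0.97, 1.01, 1.03, 0.97, 0.99, 1.02, 1.04, 0.98,
        1.00, 9.50};

    /* Step 1: discard ratios outside four inter-quartile ranges of the
     * median, as in removeOutliers() above. Here only 9.50 is dropped,
     * leaving 9 values, which satisfies the >= 8 requirement. */
    Percentile percentile = new Percentile()
        .withEstimationType(Percentile.EstimationType.R_7);
    percentile.setData(ratios);
    double median = percentile.evaluate(50.0);
    double iqr = percentile.evaluate(75.0) - percentile.evaluate(25.0);
    double[] filtered = Arrays.stream(ratios)
        .filter(r -> r > median - 4 * iqr && r < median + 4 * iqr)
        .toArray();

    /* Step 2: fit a normal distribution to the surviving ratios, as in
     * computeRanges() above. */
    double mean = new Mean().evaluate(filtered);
    double std = new StandardDeviation(false).evaluate(filtered);
    NormalDistribution normal = new NormalDistribution(mean, std);

    /* Step 3: combine with a Poisson model around the reference estimate
     * from INTERV days earlier (assumed here to be 10,000 clients) to get
     * the expected minimum and maximum number of clients. */
    PoissonDistribution poisson = new PoissonDistribution(10000);
    int lower = Math.max(0,
        (int) (normal.inverseCumulativeProbability(0.0001)
            * poisson.inverseCumulativeProbability(0.0001)));
    int upper = (int) (normal.inverseCumulativeProbability(0.9999)
        * poisson.inverseCumulativeProbability(0.9999));

    System.out.printf("expected range: [%d, %d]%n", lower, upper);
  }
}

An observed client number outside [lower, upper] for that country and date is
what lets downstream consumers flag a possible censorship-related anomaly,
which is the purpose stated in the header of the original detector.py below.
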
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Main.java b/src/main/java/org/torproject/metrics/stats/clients/Main.java
index 48d8d8d..0f1087b 100644
--- a/src/main/java/org/torproject/metrics/stats/clients/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/clients/Main.java
@@ -59,6 +59,11 @@ public class Main {
log.info("Disconnecting from database.");
database.close();
+
+ log.info("Running detector.");
+ new Detector().detect();
+
+ log.info("Terminating clients module.");
}
private static final long ONE_HOUR_MILLIS = 60L * 60L * 1000L;
diff --git a/src/main/python/clients/country_info.py b/src/main/python/clients/country_info.py
deleted file mode 100644
index 1a505d0..0000000
--- a/src/main/python/clients/country_info.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# -*- coding: utf-8 -*-
-
-countries = {
- "ad" : "Andorra",
- "ae" : "the United Arab Emirates",
- "af" : "Afghanistan",
- "ag" : "Antigua and Barbuda",
- "ai" : "Anguilla",
- "al" : "Albania",
- "am" : "Armenia",
- "an" : "the Netherlands Antilles",
- "ao" : "Angola",
- "aq" : "Antarctica",
- "ar" : "Argentina",
- "as" : "American Samoa",
- "at" : "Austria",
- "au" : "Australia",
- "aw" : "Aruba",
- "ax" : "the Aland Islands",
- "az" : "Azerbaijan",
- "ba" : "Bosnia and Herzegovina",
- "bb" : "Barbados",
- "bd" : "Bangladesh",
- "be" : "Belgium",
- "bf" : "Burkina Faso",
- "bg" : "Bulgaria",
- "bh" : "Bahrain",
- "bi" : "Burundi",
- "bj" : "Benin",
- "bl" : "Saint Bartelemey",
- "bm" : "Bermuda",
- "bn" : "Brunei",
- "bo" : "Bolivia",
- "bq" : "Bonaire, Sint Eustatius and Saba",
- "br" : "Brazil",
- "bs" : "the Bahamas",
- "bt" : "Bhutan",
- "bv" : "the Bouvet Island",
- "bw" : "Botswana",
- "by" : "Belarus",
- "bz" : "Belize",
- "ca" : "Canada",
- "cc" : "the Cocos (Keeling) Islands",
- "cd" : "the Democratic Republic of the Congo",
- "cf" : "Central African Republic",
- "cg" : "Congo",
- "ch" : "Switzerland",
- "ci" : u"Côte d'Ivoire",
- "ck" : "the Cook Islands",
- "cl" : "Chile",
- "cm" : "Cameroon",
- "cn" : "China",
- "co" : "Colombia",
- "cr" : "Costa Rica",
- "cu" : "Cuba",
- "cv" : "Cape Verde",
- "cw" : u"Curaçao",
- "cx" : "the Christmas Island",
- "cy" : "Cyprus",
- "cz" : "the Czech Republic",
- "de" : "Germany",
- "dj" : "Djibouti",
- "dk" : "Denmark",
- "dm" : "Dominica",
- "do" : "the Dominican Republic",
- "dz" : "Algeria",
- "ec" : "Ecuador",
- "ee" : "Estonia",
- "eg" : "Egypt",
- "eh" : "the Western Sahara",
- "er" : "Eritrea",
- "es" : "Spain",
- "et" : "Ethiopia",
- "fi" : "Finland",
- "fj" : "Fiji",
- "fk" : "the Falkland Islands (Malvinas)",
- "fm" : "the Federated States of Micronesia",
- "fo" : "the Faroe Islands",
- "fr" : "France",
- "ga" : "Gabon",
- "gb" : "the United Kingdom",
- "gd" : "Grenada",
- "ge" : "Georgia",
- "gf" : "French Guiana",
- "gg" : "Guernsey",
- "gh" : "Ghana",
- "gi" : "Gibraltar",
- "gl" : "Greenland",
- "gm" : "Gambia",
- "gn" : "Guinea",
- "gp" : "Guadeloupe",
- "gq" : "Equatorial Guinea",
- "gr" : "Greece",
- "gs" : "South Georgia and the South Sandwich Islands",
- "gt" : "Guatemala",
- "gu" : "Guam",
- "gw" : "Guinea-Bissau",
- "gy" : "Guyana",
- "hk" : "Hong Kong",
- "hm" : "Heard Island and McDonald Islands",
- "hn" : "Honduras",
- "hr" : "Croatia",
- "ht" : "Haiti",
- "hu" : "Hungary",
- "id" : "Indonesia",
- "ie" : "Ireland",
- "il" : "Israel",
- "im" : "the Isle of Man",
- "in" : "India",
- "io" : "the British Indian Ocean Territory",
- "iq" : "Iraq",
- "ir" : "Iran",
- "is" : "Iceland",
- "it" : "Italy",
- "je" : "Jersey",
- "jm" : "Jamaica",
- "jo" : "Jordan",
- "jp" : "Japan",
- "ke" : "Kenya",
- "kg" : "Kyrgyzstan",
- "kh" : "Cambodia",
- "ki" : "Kiribati",
- "km" : "Comoros",
- "kn" : "Saint Kitts and Nevis",
- "kp" : "North Korea",
- "kr" : "the Republic of Korea",
- "kw" : "Kuwait",
- "ky" : "the Cayman Islands",
- "kz" : "Kazakhstan",
- "la" : "Laos",
- "lb" : "Lebanon",
- "lc" : "Saint Lucia",
- "li" : "Liechtenstein",
- "lk" : "Sri Lanka",
- "lr" : "Liberia",
- "ls" : "Lesotho",
- "lt" : "Lithuania",
- "lu" : "Luxembourg",
- "lv" : "Latvia",
- "ly" : "Libya",
- "ma" : "Morocco",
- "mc" : "Monaco",
- "md" : "the Republic of Moldova",
- "me" : "Montenegro",
- "mf" : "Saint Martin",
- "mg" : "Madagascar",
- "mh" : "the Marshall Islands",
- "mk" : "Macedonia",
- "ml" : "Mali",
- "mm" : "Burma",
- "mn" : "Mongolia",
- "mo" : "Macau",
- "mp" : "the Northern Mariana Islands",
- "mq" : "Martinique",
- "mr" : "Mauritania",
- "ms" : "Montserrat",
- "mt" : "Malta",
- "mu" : "Mauritius",
- "mv" : "the Maldives",
- "mw" : "Malawi",
- "mx" : "Mexico",
- "my" : "Malaysia",
- "mz" : "Mozambique",
- "na" : "Namibia",
- "nc" : "New Caledonia",
- "ne" : "Niger",
- "nf" : "Norfolk Island",
- "ng" : "Nigeria",
- "ni" : "Nicaragua",
- "nl" : "the Netherlands",
- "no" : "Norway",
- "np" : "Nepal",
- "nr" : "Nauru",
- "nu" : "Niue",
- "nz" : "New Zealand",
- "om" : "Oman",
- "pa" : "Panama",
- "pe" : "Peru",
- "pf" : "French Polynesia",
- "pg" : "Papua New Guinea",
- "ph" : "the Philippines",
- "pk" : "Pakistan",
- "pl" : "Poland",
- "pm" : "Saint Pierre and Miquelon",
- "pn" : "the Pitcairn Islands",
- "pr" : "Puerto Rico",
- "ps" : "the Palestinian Territory",
- "pt" : "Portugal",
- "pw" : "Palau",
- "py" : "Paraguay",
- "qa" : "Qatar",
- "re" : "Reunion",
- "ro" : "Romania",
- "rs" : "Serbia",
- "ru" : "Russia",
- "rw" : "Rwanda",
- "sa" : "Saudi Arabia",
- "sb" : "the Solomon Islands",
- "sc" : "the Seychelles",
- "sd" : "Sudan",
- "se" : "Sweden",
- "sg" : "Singapore",
- "sh" : "Saint Helena",
- "si" : "Slovenia",
- "sj" : "Svalbard and Jan Mayen",
- "sk" : "Slovakia",
- "sl" : "Sierra Leone",
- "sm" : "San Marino",
- "sn" : "Senegal",
- "so" : "Somalia",
- "sr" : "Suriname",
- "ss" : "South Sudan",
- "st" : u"São Tomé and Príncipe",
- "sv" : "El Salvador",
- "sx" : "Sint Maarten",
- "sy" : "the Syrian Arab Republic",
- "sz" : "Swaziland",
- "tc" : "Turks and Caicos Islands",
- "td" : "Chad",
- "tf" : "the French Southern Territories",
- "tg" : "Togo",
- "th" : "Thailand",
- "tj" : "Tajikistan",
- "tk" : "Tokelau",
- "tl" : "East Timor",
- "tm" : "Turkmenistan",
- "tn" : "Tunisia",
- "to" : "Tonga",
- "tr" : "Turkey",
- "tt" : "Trinidad and Tobago",
- "tv" : "Tuvalu",
- "tw" : "Taiwan",
- "tz" : "the United Republic of Tanzania",
- "ua" : "Ukraine",
- "ug" : "Uganda",
- "um" : "the United States Minor Outlying Islands",
- "us" : "the United States",
- "uy" : "Uruguay",
- "uz" : "Uzbekistan",
- "va" : "Vatican City",
- "vc" : "Saint Vincent and the Grenadines",
- "ve" : "Venezuela",
- "vg" : "the British Virgin Islands",
- "vi" : "the United States Virgin Islands",
- "vn" : "Vietnam",
- "vu" : "Vanuatu",
- "wf" : "Wallis and Futuna",
- "ws" : "Samoa",
- "xk" : "Kosovo",
- "ye" : "Yemen",
- "yt" : "Mayotte",
- "za" : "South Africa",
- "zm" : "Zambia",
- "zw" : "Zimbabwe"
- }
diff --git a/src/main/python/clients/detector.py b/src/main/python/clients/detector.py
deleted file mode 100644
index b0a98af..0000000
--- a/src/main/python/clients/detector.py
+++ /dev/null
@@ -1,242 +0,0 @@
-## Copyright (c) 2011 George Danezis <gdane(a)microsoft.com>
-##
-## All rights reserved.
-##
-## Redistribution and use in source and binary forms, with or without
-## modification, are permitted (subject to the limitations in the
-## disclaimer below) provided that the following conditions are met:
-##
-## * Redistributions of source code must retain the above copyright
-## notice, this list of conditions and the following disclaimer.
-##
-## * Redistributions in binary form must reproduce the above copyright
-## notice, this list of conditions and the following disclaimer in the
-## documentation and/or other materials provided with the
-## distribution.
-##
-## * Neither the name of <Owner Organization> nor the names of its
-## contributors may be used to endorse or promote products derived
-## from this software without specific prior written permission.
-##
-## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
-## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-##
-## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
-
-## This script reads a .csv file of the number of Tor users and finds
-## anomalies that might be indicative of censorship.
-
-# Dep: numpy
-import numpy
-from numpy import mean, std, percentile
-
-# Dep: scipy
-import scipy.stats
-from scipy.stats.distributions import norm
-from scipy.stats.distributions import poisson
-
-# Std lib
-from datetime import date
-from datetime import timedelta
-import os.path
-
-# Country code -> Country names
-import country_info
-
-# write utf8 to file
-import codecs
-
-def get_country_name_from_cc(country_code):
- if (country_code.lower() in country_info.countries):
- return country_info.countries[country_code.lower()]
- return country_code # if we didn't find the cc in our map
-
-"""
-Represents a .csv file containing information on the number of
-connecting Tor users per country.
-
-'store': Dictionary with (<country code>, <counter>) as key, and the number of users as value.
- <country code> can also be "date"...
-'all_dates': List of the data intervals (with default timedelta: 1 day).
-'country_codes': List of all relevant country codes.
-'MAX_INDEX': Length of store, number of country codes etc.
-'date_min': The oldest date found in the .csv.
-'date_max': The latest date found in the .csv.
-"""
-class torstatstore:
- def __init__(self, file_name):
- f = file(file_name)
- country_codes = f.readline()
- country_codes = country_codes.strip().split(",")
-
- store = {}
- MAX_INDEX = 0
- for i, line in enumerate(f):
- MAX_INDEX += 1
- line_parsed = line.strip().split(",")
- for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)):
- processed_val = None
- if ccode == "date":
- try:
- year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
- processed_val = date(year, month, day)
- except Exception, e:
- print "Parsing error (ignoring line %s):" % j
- print "%s" % val,e
- break
-
- elif val != "NA":
- processed_val = int(val)
- store[(ccode, i)] = processed_val
-
- # min and max
- date_min = store[("date", 0)]
- date_max = store[("date", i)]
-
- all_dates = []
- d = date_min
- dt = timedelta(days=1)
- while d <= date_max:
- all_dates += [d]
- d = d + dt
-
- # Save for later
- self.store = store
- self.all_dates = all_dates
- self.country_codes = country_codes
- self.MAX_INDEX = MAX_INDEX
- self.date_min = date_min
- self.date_max = date_max
-
- """Return a list representing a time series of 'ccode' with respect
- to the number of connected users.
- """
- def get_country_series(self, ccode):
- assert ccode in self.country_codes
- series = {}
- for d in self.all_dates:
- series[d] = None
- for i in range(self.MAX_INDEX):
- series[self.store[("date", i)]] = self.store[(ccode, i)]
- sx = []
- for d in self.all_dates:
- sx += [series[d]]
- return sx
-
- """Return an ordered list containing tuples of the form (<number of
- users>, <country code>). The list is ordered with respect to the
- number of users for each country.
- """
- def get_largest(self, number):
- exclude = set(["all", "??", "date"])
- l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
- l.sort()
- l.reverse()
- return l[:number]
-
- """Return a dictionary, with <country code> as key, and the time
- series of the country code as the value.
- """
- def get_largest_locations(self, number):
- l = self.get_largest(number)
- res = {}
- for _, ccode in l[:number]:
- res[ccode] = self.get_country_series(ccode)
- return res
-
-"""Return a list containing lists (?) where each such list contains
-the difference in users for a time delta of 'days'
-"""
-def n_day_rel(series, days):
- rel = []
- for i, v in enumerate(series):
- if series[i] is None:
- rel += [None]
- continue
-
- if i - days < 0 or series[i-days] is None or series[i-days] == 0:
- rel += [None]
- else:
- rel += [ float(series[i]) / series[i-days]]
- return rel
-
-# Main model: computes the expected min / max range of number of users
-def make_tendencies_minmax(l, INTERVAL = 1):
- lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
- c = lminus1[lminus1.keys()[0]]
- dists = []
- minx = []
- maxx = []
- for i in range(len(c)):
- vals = [lminus1[ccode][i] for ccode in lminus1.keys() if lminus1[ccode][i] != None]
- if len(vals) < 8:
- dists += [None]
- minx += [None]
- maxx += [None]
- else:
- vals.sort()
- median = percentile(vals, 50)
- q1 = percentile(vals, 25)
- q2 = percentile(vals, 75)
- qd = q2 - q1
- vals = [v for v in vals if median - qd*4 < v and v < median + qd*4]
- if len(vals) < 8:
- dists += [None]
- minx += [None]
- maxx += [None]
- continue
- mu = mean(vals)
- signma = std(vals)
- dists += [(mu, signma)]
- maxx += [norm.ppf(0.9999, mu, signma)]
- minx += [norm.ppf(1 - 0.9999, mu, signma)]
- ## print minx[-1], maxx[-1]
- return minx, maxx
-
-"""Write a CSV report on the minimum/maximum users of each country per date."""
-def write_all(tss, minc, maxc, RANGES_FILE, INTERVAL=7):
- ranges_file = file(RANGES_FILE, "w")
- ranges_file.write("date,country,minusers,maxusers\n")
- exclude = set(["all", "??", "date"])
- for c in tss.country_codes:
- if c in exclude:
- continue
- series = tss.get_country_series(c)
- for i, v in enumerate(series):
- if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None:
- minv = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
- maxv = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
- if not minv < maxv:
- print minv, maxv, series[i-INTERVAL], minc[i], maxc[i]
- assert minv < maxv
- if minv < 0.0:
- minv = 0.0
- ranges_file.write("%s,%s,%s,%s\n" % (tss.all_dates[i], c, minv, maxv))
- ranges_file.close()
-
-# INTERV is the time interval to model connection rates;
-# consider maximum DAYS days back.
-def detect(CSV_FILE = "userstats-detector.csv",
- RANGES_FILE = "userstats-ranges.csv",
- INTERV = 7, DAYS = 6 * 31):
- tss = torstatstore(CSV_FILE)
- l = tss.get_largest_locations(50)
- minx, maxx = make_tendencies_minmax(l, INTERV)
- write_all(tss, minx, maxx, RANGES_FILE, INTERV)
-
-def main():
- detect()
-
-if __name__ == "__main__":
- main()
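
The Detector class added above is a port of this script: n_day_rel()
corresponds to computeRatios(), make_tendencies_minmax() to removeOutliers()
together with the per-date normal fit, and write_all() to computeRanges() and
writeOutputFile(). Informally, both compute the expected range for country c
on date d as

  lower(c, d) = max(0, InvNorm(0.0001; mu, sigma) * InvPois(0.0001; n))
  upper(c, d) = InvNorm(0.9999; mu, sigma) * InvPois(0.9999; n)

where mu and sigma are the mean and standard deviation of that day's ratios
after outlier removal, n is the same country's client estimate INTERV days
earlier (7 days by default in the Python version), and InvNorm and InvPois
denote the inverse cumulative distribution functions (quantile functions) of
the normal and Poisson distributions.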