commit a91f2dc9f51c2bd0e7c20e13a84c66dcb60ccd3d Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Wed Oct 26 15:23:25 2016 +0200
Split up clients.csv for faster graphs.
Most client graphs (except for clients by country and transport) use the same clients.csv file as input. That file has grown to 26M by now, and it seems wasteful to read numbers for clients connecting to bridges when graphing clients connecting to relays, and vice versa. Split up clients.csv into clients-relay.csv and clients-bridge.csv, and take out the node column.
Performance gain is 1.3 seconds for updating graphs on directly connecting clients and 2.0 seconds for graphs showing clients connecting via bridges. --- modules/clients/split-clients.R | 6 ++++++ shared/bin/80-run-clients-stats.sh | 5 ++++- shared/bin/99-copy-stats-files.sh | 2 +- website/rserve/graphs.R | 26 ++++++++++++-------------- 4 files changed, 23 insertions(+), 16 deletions(-)
diff --git a/modules/clients/split-clients.R b/modules/clients/split-clients.R new file mode 100644 index 0000000..5f3cb74 --- /dev/null +++ b/modules/clients/split-clients.R @@ -0,0 +1,6 @@ +u <- read.csv("clients.csv", stringsAsFactors = FALSE) +write.csv(u[u$node == 'relay', names(u) != "node"], 'clients-relay.csv', + quote = FALSE, row.names = FALSE, na = '') +write.csv(u[u$node == 'bridge', names(u) != "node"], 'clients-bridge.csv', + quote = FALSE, row.names = FALSE, na = '') + diff --git a/shared/bin/80-run-clients-stats.sh b/shared/bin/80-run-clients-stats.sh index b296c37..a3efbe3 100755 --- a/shared/bin/80-run-clients-stats.sh +++ b/shared/bin/80-run-clients-stats.sh @@ -21,8 +21,11 @@ python detector.py
echo `date` "Merging censorship detector results." R --slave -f merge-clients.R > /dev/null 2>&1 + +echo `date` "Splitting results file." +R --slave -f split-clients.R > /dev/null 2>&1 mkdir -p stats/ -cp clients.csv stats/ +cp clients*.csv stats/ cp userstats-combined.csv stats/
echo `date` "Terminating." diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh index 504216a..6daf22b 100755 --- a/shared/bin/99-copy-stats-files.sh +++ b/shared/bin/99-copy-stats-files.sh @@ -4,6 +4,6 @@ cp -a modules/legacy/stats/*.csv shared/stats/ cp -a modules/connbidirect/stats/connbidirect2.csv shared/stats/ cp -a modules/advbwdist/stats/advbwdist.csv shared/stats/ cp -a modules/hidserv/stats/hidserv.csv shared/stats/ -cp -a modules/clients/stats/clients.csv shared/stats/ +cp -a modules/clients/stats/clients*.csv shared/stats/ cp -a modules/clients/stats/userstats-combined.csv shared/stats/
diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R index e3ccb06..6f7e119 100644 --- a/website/rserve/graphs.R +++ b/website/rserve/graphs.R @@ -766,21 +766,21 @@ plot_userstats <- function(start, end, node, variable, value, events, path) { end <- min(end, as.character(Sys.Date() - 2)) c <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/stats/", - "clients.csv", sep = ""), stringsAsFactors = FALSE) + "clients-", node, ".csv", sep = ""), + stringsAsFactors = FALSE) u <- c[c$date >= start & c$date <= end, ] - u <- rbind(u, data.frame(date = start, node = node, + u <- rbind(u, data.frame(date = start, country = ifelse(variable == 'country' & value != 'all', value, ''), transport = ifelse(variable == 'transport', value, ''), version = ifelse(variable == 'version', value, ''), lower = 0, upper = 0, clients = 0, frac = 0)) if (node == 'relay') { if (value != 'all') { - u <- u[u$country == value & u$node == 'relay', ] + u <- u[u$country == value, ] title <- paste("Directly connecting users from ", countryname(value), "\n", sep = "") } else { - u <- u[u$country == '' & u$transport == '' & u$version == '' & - u$node == 'relay', ] + u <- u[u$country == '', ] title <- "Directly connecting users\n" } u <- aggregate(list(lower = u$lower, upper = u$upper, @@ -790,20 +790,19 @@ plot_userstats <- function(start, end, node, variable, value, events, FUN = sum) } else if (variable == 'transport') { if ('!<OR>' %in% value) { - n <- u[u$transport != '' & u$transport != '<OR>' & - u$node == 'bridge', ] + n <- u[u$transport != '' & u$transport != '<OR>', ] n <- aggregate(list(lower = n$lower, upper = n$upper, clients = n$clients), by = list(date = n$date), FUN = sum) - u <- rbind(u, data.frame(date = n$date, node = 'bridge', + u <- rbind(u, data.frame(date = n$date, country = '', transport = '!<OR>', version = '', lower = n$lower, upper = n$upper, clients = n$clients, frac = NA)) } if (length(value) > 1) { - u <- u[u$transport %in% value & u$node == 'bridge', ] + 
u <- u[u$transport %in% value, ] u <- aggregate(list(lower = u$lower, upper = u$upper, users = u$clients), by = list(date = as.Date(u$date, "%Y-%m-%d"), @@ -811,7 +810,7 @@ plot_userstats <- function(start, end, node, variable, value, events, FUN = sum) title <- paste("Bridge users by transport\n") } else { - u <- u[u$transport == value & u$node == 'bridge', ] + u <- u[u$transport == value, ] u <- aggregate(list(lower = u$lower, upper = u$upper, users = u$clients), by = list(date = as.Date(u$date, "%Y-%m-%d"), @@ -826,7 +825,7 @@ plot_userstats <- function(start, end, node, variable, value, events, paste('transport', value)))))), "\n", sep = "") } } else if (variable == 'version') { - u <- u[u$version == value & u$node == 'bridge', ] + u <- u[u$version == value, ] title <- paste("Bridge users using IP", value, "\n", sep = "") u <- aggregate(list(lower = u$lower, upper = u$upper, users = u$clients), @@ -835,12 +834,11 @@ plot_userstats <- function(start, end, node, variable, value, events, FUN = sum) } else { if (value != 'all') { - u <- u[u$country == value & u$node == 'bridge', ] + u <- u[u$country == value, ] title <- paste("Bridge users from ", countryname(value), "\n", sep = "") } else { - u <- u[u$country == '' & u$transport == '' & u$version == '' & - u$node == 'bridge', ] + u <- u[u$country == '' & u$transport == '' & u$version == '', ] title <- "Bridge users\n" } u <- aggregate(list(lower = u$lower, upper = u$upper,