commit 1f90b72368bfd2a15ee60efb7fbbcc62b3c03cdc Author: Karsten Loesing karsten.loesing@gmx.net Date: Thu Oct 27 11:26:45 2016 +0200
Make user graphs even faster.
In a91f2dc we split up clients.csv to make most client graphs faster (except for clients by country and transport). It turns out we can do even better by using R's save() and load() functions. Performance gain compared to pre-a91f2dc:
- userstats-relay-country: 3.0 seconds
- userstats-bridge-country: 3.5 seconds
- userstats-bridge-transport: 3.5 seconds
- userstats-bridge-version: 3.5 seconds
- userstats-bridge-combined: 2.7 seconds --- modules/clients/split-clients.R | 16 +++++++++++----- shared/bin/80-run-clients-stats.sh | 8 ++++---- shared/bin/99-copy-stats-files.sh | 3 +++ website/rserve/graphs.R | 17 ++++++++--------- 4 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/modules/clients/split-clients.R b/modules/clients/split-clients.R index 5f3cb74..50b03d4 100644 --- a/modules/clients/split-clients.R +++ b/modules/clients/split-clients.R @@ -1,6 +1,12 @@ -u <- read.csv("clients.csv", stringsAsFactors = FALSE) -write.csv(u[u$node == 'relay', names(u) != "node"], 'clients-relay.csv', - quote = FALSE, row.names = FALSE, na = '') -write.csv(u[u$node == 'bridge', names(u) != "node"], 'clients-bridge.csv', - quote = FALSE, row.names = FALSE, na = '') +dir.create("RData", showWarnings = FALSE) + +c <- read.csv("clients.csv", stringsAsFactors = FALSE) +data <- c[c$node == 'relay', !(names(c) %in% c("node", "frac"))] +save(data, file = "RData/clients-relay.RData") +data <- c[c$node == 'bridge', !(names(c) %in% c("node", "frac"))] +save(data, file = "RData/clients-bridge.RData") + +u <- read.csv("userstats-combined.csv", stringsAsFactors = FALSE) +data <- u[, !(names(u) %in% c("node", "version", "frac"))] +save(data, file = "RData/userstats-bridge-combined.RData")
diff --git a/shared/bin/80-run-clients-stats.sh b/shared/bin/80-run-clients-stats.sh index a3efbe3..fe93e44 100755 --- a/shared/bin/80-run-clients-stats.sh +++ b/shared/bin/80-run-clients-stats.sh @@ -21,13 +21,13 @@ python detector.py
echo `date` "Merging censorship detector results." R --slave -f merge-clients.R > /dev/null 2>&1 - -echo `date` "Splitting results file." -R --slave -f split-clients.R > /dev/null 2>&1 mkdir -p stats/ -cp clients*.csv stats/ +cp clients.csv stats/ cp userstats-combined.csv stats/
+echo `date` "Saving results as .RData files." +R --slave -f split-clients.R > /dev/null 2>&1 + echo `date` "Terminating."
cd ../../ diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh index 6daf22b..d236630 100755 --- a/shared/bin/99-copy-stats-files.sh +++ b/shared/bin/99-copy-stats-files.sh @@ -7,3 +7,6 @@ cp -a modules/hidserv/stats/hidserv.csv shared/stats/ cp -a modules/clients/stats/clients*.csv shared/stats/ cp -a modules/clients/stats/userstats-combined.csv shared/stats/
+mkdir -p shared/RData +cp -a modules/clients/RData/*.RData shared/RData/ + diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R index 6f7e119..9950a15 100644 --- a/website/rserve/graphs.R +++ b/website/rserve/graphs.R @@ -765,15 +765,15 @@ plot_bandwidth_flags <- function(start, end, path) { plot_userstats <- function(start, end, node, variable, value, events, path) { end <- min(end, as.character(Sys.Date() - 2)) - c <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/stats/", - "clients-", node, ".csv", sep = ""), - stringsAsFactors = FALSE) + load(paste("/srv/metrics.torproject.org/metrics/shared/RData/clients-", + node, ".RData", sep = "")) + c <- data u <- c[c$date >= start & c$date <= end, ] u <- rbind(u, data.frame(date = start, country = ifelse(variable == 'country' & value != 'all', value, ''), transport = ifelse(variable == 'transport', value, ''), version = ifelse(variable == 'version', value, ''), - lower = 0, upper = 0, clients = 0, frac = 0)) + lower = 0, upper = 0, clients = 0)) if (node == 'relay') { if (value != 'all') { u <- u[u$country == value, ] @@ -798,8 +798,7 @@ plot_userstats <- function(start, end, node, variable, value, events, u <- rbind(u, data.frame(date = n$date, country = '', transport = '!<OR>', version = '', lower = n$lower, - upper = n$upper, clients = n$clients, - frac = NA)) + upper = n$upper, clients = n$clients)) } if (length(value) > 1) { u <- u[u$transport %in% value, ] @@ -926,9 +925,9 @@ plot_userstats_bridge_combined <- function(start, end, country, path) { top <- 3 country <- ifelse(country == "all", NA, country) end <- min(end, as.character(Sys.Date() - 2)) - u <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/", - "stats/userstats-combined.csv", sep = ""), - stringsAsFactors = FALSE) + load(paste("/srv/metrics.torproject.org/metrics/shared/RData/", + "userstats-bridge-combined.RData", sep = "")) + u <- data u <- u[u$date >= start & u$date <= end & (is.na(country) | u$country == country), 
] a <- aggregate(list(mid = (u$high + u$low) / 2),