commit 2c44721c9ab903183558b92d7a4e17674fcb79be Author: Karsten Loesing karsten.loesing@gmx.net Date: Mon Dec 17 21:03:16 2018 +0100
Use readr to speed up drawing graphs.
Over two years ago, in commit 1f90b72 from October 2016, we made our user graphs faster by no longer reading the large .csv file on demand. Instead we read it once as part of the daily update, saved it to disk as .RData file using R's save() function, and loaded it back to memory using R's load() function when drawing a graph.
This approach worked okay. It just had two disadvantages:
1. We had to write a small amount of R code for each graph type, which is why we only did it for graphs with large .csv files. 2. Running these small R scripts as part of the daily update made it harder to move away from Ant towards a Java-only execution model.
The new approach implemented in this commit uses read_csv() from the readr package which reads CSV files several times faster than read.csv().
Requires installing the readr package from CRAN, which is available on Debian in stretch-backports and later as r-cran-readr.
Implements #28799. --- build.xml | 14 --- src/main/R/clients/split-clients.R | 12 --- src/main/R/rserver/graphs.R | 169 +++++++++++++++++++++++++++++-------- src/main/R/rserver/rserve-init.R | 1 + src/main/R/webstats/write-RData.R | 16 ---- 5 files changed, 136 insertions(+), 76 deletions(-)
diff --git a/build.xml b/build.xml index 89c8b31..250417e 100644 --- a/build.xml +++ b/build.xml @@ -362,8 +362,6 @@ <property name="module.name" value="clients" /> <property name="localmoddir" value="${modulebase}/${module.name}" />
- <property name="rdatadir" value="${localmoddir}/RData" /> - <mkdir dir="${rdatadir}" /> <property name="statsdir" value="${localmoddir}/stats" /> <mkdir dir="${statsdir}" /> @@ -410,10 +408,6 @@
<copy file="${localmoddir}/clients.csv" todir="${statsdir}" /> <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" /> - - <antcall target="run-R" > - <param name="module.Rscript" value="split-clients.R" /> - </antcall> </target>
<target name="servers" > @@ -426,13 +420,7 @@
<target name="webstats" > <property name="module.name" value="webstats" /> - <property name="rdatadir" value="${modulebase}/${module.name}/RData" /> - <mkdir dir="${rdatadir}" /> - <antcall target="run-java" /> - <antcall target="run-R" > - <param name="module.Rscript" value="write-RData.R" /> - </antcall> </target>
<target name="totalcw" > @@ -482,8 +470,6 @@ <fileset dir="${modulebase}/totalcw/stats" includes="totalcw.csv" /> </copy> <copy todir="${rdatadir}" > - <fileset dir="${modulebase}/clients/RData" includes="*.RData" /> - <fileset dir="${modulebase}/webstats/RData" includes="*.RData" /> <fileset dir="${resources}/web/images/" includes="no-data-available.*" /> </copy> </target> diff --git a/src/main/R/clients/split-clients.R b/src/main/R/clients/split-clients.R deleted file mode 100644 index 9f80902..0000000 --- a/src/main/R/clients/split-clients.R +++ /dev/null @@ -1,12 +0,0 @@ -dir.create("RData", showWarnings = FALSE) - -c <- read.csv("clients.csv", stringsAsFactors = FALSE) -data <- c[c$node == 'relay', !(names(c) %in% c("node"))] -save(data, file = "RData/clients-relay.RData") -data <- c[c$node == 'bridge', !(names(c) %in% c("node"))] -save(data, file = "RData/clients-bridge.RData") - -u <- read.csv("userstats-combined.csv", stringsAsFactors = FALSE) -data <- u[, !(names(u) %in% c("node", "version"))] -save(data, file = "RData/userstats-bridge-combined.RData") - diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R index 7501a95..e541c30 100644 --- a/src/main/R/rserver/graphs.R +++ b/src/main/R/rserver/graphs.R @@ -348,6 +348,9 @@ robust_call <- function(wrappee, filename) { }) }
+# Disable readr's automatic progress bar. +options(readr.show_progress = FALSE) + prepare_networksize <- function(start_p, end_p) { read.csv(paste(stats_dir, "networksize.csv", sep = ""), colClasses = c("date" = "Date")) %>% @@ -863,8 +866,19 @@ write_bandwidth_flags <- function(start_p = NULL, end_p = NULL, path_p) {
plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p, events_p, path_p) { - load(paste(rdata_dir, "clients-", node_p, ".RData", sep = "")) - c <- data + c <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""), + col_types = cols( + date = col_date(format = ""), + node = col_character(), + country = col_character(), + transport = col_character(), + version = col_character(), + lower = col_double(), + upper = col_double(), + clients = col_double(), + frac = col_skip()), + na = character()) %>% + filter(node == node_p) u <- c[c$date >= start_p & c$date <= end_p, c("date", "country", "transport", "version", "lower", "upper", "clients")] u <- rbind(u, data.frame(date = start_p, @@ -1011,14 +1025,24 @@ plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
write_userstats_relay_country <- function(start_p = NULL, end_p = NULL, country_p = NULL, events_p = NULL, path_p) { - load(paste(rdata_dir, "clients-relay.RData", sep = "")) - u <- data %>% + read_csv(file = paste(stats_dir, "clients.csv", sep = ""), + col_types = cols( + date = col_date(format = ""), + node = col_character(), + country = col_character(), + transport = col_character(), + version = col_character(), + lower = col_double(), + upper = col_double(), + clients = col_double(), + frac = col_double())) %>% + filter(node == "relay") %>% filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>% filter(if (!is.null(country_p)) country == ifelse(country_p == "all", "", country_p) else TRUE) %>% - filter(transport == "") %>% - filter(version == "") %>% + filter(is.na(transport)) %>% + filter(is.na(version)) %>% select(date, country, clients, lower, upper, frac) %>% rename(users = clients) %>% write.csv(path_p, quote = FALSE, row.names = FALSE, na = "") @@ -1026,14 +1050,24 @@ write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL, country_p = NULL, path_p) { - load(paste(rdata_dir, "clients-bridge.RData", sep = "")) - data %>% + read_csv(file = paste(stats_dir, "clients.csv", sep = ""), + col_types = cols( + date = col_date(format = ""), + node = col_character(), + country = col_character(), + transport = col_character(), + version = col_character(), + lower = col_double(), + upper = col_double(), + clients = col_double(), + frac = col_double())) %>% + filter(node == "bridge") %>% filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>% filter(if (!is.null(country_p)) country == ifelse(country_p == "all", "", country_p) else TRUE) %>% - filter(transport == "") %>% - filter(version == "") %>% + filter(is.na(transport)) %>% + filter(is.na(version)) %>% select(date, country, clients, frac) %>% rename(users = clients) %>% write.csv(path_p, quote = FALSE, row.names = FALSE, na = "") @@ -1041,13 +1075,23 @@ write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL, transport_p = NULL, path_p) { - load(paste(rdata_dir, "clients-bridge.RData", sep = "")) - u <- data %>% + u <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""), + col_types = cols( + date = col_date(format = ""), + node = col_character(), + country = col_character(), + transport = col_character(), + version = col_character(), + lower = col_double(), + upper = col_double(), + clients = col_double(), + frac = col_double())) %>% + filter(node == "bridge") %>% filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>% - filter(country == "") %>% - filter(version == "") %>% - filter(transport != "") %>% + filter(is.na(country)) %>% + filter(is.na(version)) %>% + filter(!is.na(transport)) %>% select(date, transport, clients, frac) if (is.null(transport_p) || "!<OR>" %in% transport_p) { n <- u %>% @@ -1068,12 +1112,22 @@ write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL, version_p = NULL, path_p) { - load(paste(rdata_dir, "clients-bridge.RData", sep = "")) - data %>% + read_csv(file = paste(stats_dir, "clients.csv", sep = ""), + col_types = cols( + date = col_date(format = ""), + node = col_character(), + country = col_character(), + transport = col_character(), + version = col_character(), + lower = col_double(), + upper = col_double(), + clients = col_double(), + frac = col_double())) %>% + filter(node == "bridge") %>% filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>% - filter(country == "") %>% - filter(transport == "") %>% + filter(is.na(country)) %>% + filter(is.na(transport)) %>% filter(if (!is.null(version_p)) version == version_p else TRUE) %>% select(date, version, clients, frac) %>% rename(users = clients) %>% @@ -1081,8 +1135,16 @@ write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL, }
prepare_userstats_bridge_combined <- function(start_p, end_p, country_p) { - load(paste(rdata_dir, "userstats-bridge-combined.RData", sep = "")) - data %>% + read_csv(file = paste(stats_dir, "userstats-combined.csv", sep = ""), + col_types = cols( + date = col_date(format = ""), + node = col_skip(), + country = col_character(), + transport = col_character(), + version = col_skip(), + frac = col_double(), + low = col_double(), + high = col_double())) %>% filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>% filter(if (!is.null(country_p)) country == country_p else TRUE) @@ -1135,7 +1197,7 @@ prepare_advbwdist_perc <- function(start_p, end_p, p_p) { filter(if (!is.null(p_p)) percentile %in% as.numeric(p_p) else percentile != "") %>% transmute(date, percentile = as.factor(percentile), - variable = ifelse(isexit != "t", "all", "exits"), + variable = ifelse(is.na(isexit), "all", "exits"), advbw = advbw * 8 / 1e9) }
@@ -1258,11 +1320,20 @@ write_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL, }
prepare_webstats_tb <- function(start_p, end_p) { - load(paste(rdata_dir, "webstats-tb.RData", sep = "")) - data %>% + read_csv(file = paste(stats_dir, "webstats.csv", sep = ""), + col_types = cols( + log_date = col_date(format = ""), + request_type = col_factor(), + platform = col_skip(), + channel = col_skip(), + locale = col_skip(), + incremental = col_skip(), + count = col_double())) %>% filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>% - mutate(request_type = factor(request_type)) + filter(request_type %in% c("tbid", "tbsd", "tbup", "tbur")) %>% + group_by(log_date, request_type) %>% + summarize(count = sum(count)) }
plot_webstats_tb <- function(start_p, end_p, path_p) { @@ -1296,8 +1367,15 @@ write_webstats_tb <- function(start_p = NULL, end_p = NULL, path_p) { }
prepare_webstats_tb_platform <- function(start_p, end_p) { - read.csv(paste(stats_dir, "webstats.csv", sep = ""), - colClasses = c("log_date" = "Date")) %>% + read_csv(file = paste(stats_dir, "webstats.csv", sep = ""), + col_types = cols( + log_date = col_date(format = ""), + request_type = col_factor(), + platform = col_factor(), + channel = col_skip(), + locale = col_skip(), + incremental = col_skip(), + count = col_double())) %>% filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>% filter(request_type %in% c("tbid", "tbup")) %>% @@ -1337,8 +1415,15 @@ write_webstats_tb_platform <- function(start_p = NULL, end_p = NULL, path_p) { }
plot_webstats_tb_locale <- function(start_p, end_p, path_p) { - d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""), - colClasses = c("log_date" = "Date", "locale" = "character")) + d <- read_csv(file = paste(stats_dir, "webstats.csv", sep = ""), + col_types = cols( + log_date = col_date(format = ""), + request_type = col_factor(), + platform = col_skip(), + channel = col_skip(), + locale = col_factor(), + incremental = col_skip(), + count = col_double())) d <- d[d$log_date >= start_p & d$log_date <= end_p & d$request_type %in% c("tbid", "tbup"), ] levels(d$request_type) <- list( @@ -1375,8 +1460,15 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) { # plot_webstats_tb_locale needs the preliminary data frame e for its # breaks and labels. Left as future work. write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) { - read.csv(paste(stats_dir, "webstats.csv", sep = ""), - colClasses = c("log_date" = "Date", "locale" = "character")) %>% + read_csv(file = paste(stats_dir, "webstats.csv", sep = ""), + col_types = cols( + log_date = col_date(format = ""), + request_type = col_factor(), + platform = col_skip(), + channel = col_skip(), + locale = col_factor(), + incremental = col_skip(), + count = col_double())) %>% filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>% filter(request_type %in% c("tbid", "tbup")) %>% @@ -1390,11 +1482,20 @@ write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) { }
prepare_webstats_tm <- function(start_p, end_p) { - load(paste(rdata_dir, "webstats-tm.RData", sep = "")) - data %>% + read_csv(file = paste(stats_dir, "webstats.csv", sep = ""), + col_types = cols( + log_date = col_date(format = ""), + request_type = col_factor(), + platform = col_skip(), + channel = col_skip(), + locale = col_skip(), + incremental = col_skip(), + count = col_double())) %>% filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>% filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>% - mutate(request_type = factor(request_type)) + filter(request_type %in% c("tmid", "tmup")) %>% + group_by(log_date, request_type) %>% + summarize(count = sum(count)) }
plot_webstats_tm <- function(start_p, end_p, path_p) { diff --git a/src/main/R/rserver/rserve-init.R b/src/main/R/rserver/rserve-init.R index b9a1d3b..f160698 100644 --- a/src/main/R/rserver/rserve-init.R +++ b/src/main/R/rserver/rserve-init.R @@ -5,6 +5,7 @@ library("RColorBrewer") library("scales") library(dplyr) library(tidyr) +library(readr)
source('graphs.R') source('tables.R') diff --git a/src/main/R/webstats/write-RData.R b/src/main/R/webstats/write-RData.R deleted file mode 100644 index 96cc840..0000000 --- a/src/main/R/webstats/write-RData.R +++ /dev/null @@ -1,16 +0,0 @@ -dir.create("RData", showWarnings = FALSE) - -d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE) -d <- d[d$request_type %in% c('tbid', 'tbsd', 'tbup', 'tbur'), ] -data <- aggregate(list(count = d$count), - by = list(log_date = as.Date(d$log_date), request_type = d$request_type), - FUN = sum) -save(data, file = "RData/webstats-tb.RData") - -d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE) -d <- d[d$request_type %in% c('tmid', 'tmup'), ] -data <- aggregate(list(count = d$count), - by = list(log_date = as.Date(d$log_date), request_type = d$request_type), - FUN = sum) -save(data, file = "RData/webstats-tm.RData") -