[tor-commits] [metrics-web/master] Make user graphs even faster.

karsten at torproject.org karsten at torproject.org
Thu Oct 27 09:36:45 UTC 2016


commit 1f90b72368bfd2a15ee60efb7fbbcc62b3c03cdc
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Oct 27 11:26:45 2016 +0200

    Make user graphs even faster.
    
    In a91f2dc we split up clients.csv to make most client graphs faster
    (except for clients by country and transport).  It turns out we can do
    even better by storing the data with R's save() and reading it back
    with load() instead of parsing CSV files.  Performance gain per graph
    as compared to pre-a91f2dc:
    
     - userstats-relay-country: 3.0 seconds
    
     - userstats-bridge-country: 3.5 seconds
    
     - userstats-bridge-transport: 3.5 seconds
    
     - userstats-bridge-version: 3.5 seconds
    
     - userstats-bridge-combined: 2.7 seconds
---
 modules/clients/split-clients.R    | 16 +++++++++++-----
 shared/bin/80-run-clients-stats.sh |  8 ++++----
 shared/bin/99-copy-stats-files.sh  |  3 +++
 website/rserve/graphs.R            | 17 ++++++++---------
 4 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/modules/clients/split-clients.R b/modules/clients/split-clients.R
index 5f3cb74..50b03d4 100644
--- a/modules/clients/split-clients.R
+++ b/modules/clients/split-clients.R
@@ -1,6 +1,12 @@
-u <- read.csv("clients.csv", stringsAsFactors = FALSE)
-write.csv(u[u$node == 'relay', names(u) != "node"], 'clients-relay.csv',
-  quote = FALSE, row.names = FALSE, na = '')
-write.csv(u[u$node == 'bridge', names(u) != "node"], 'clients-bridge.csv',
-  quote = FALSE, row.names = FALSE, na = '')
+dir.create("RData", showWarnings = FALSE)
+
+c <- read.csv("clients.csv", stringsAsFactors = FALSE)
+data <- c[c$node == 'relay', !(names(c) %in% c("node", "frac"))]
+save(data, file = "RData/clients-relay.RData")
+data <- c[c$node == 'bridge', !(names(c) %in% c("node", "frac"))]
+save(data, file = "RData/clients-bridge.RData")
+
+u <- read.csv("userstats-combined.csv", stringsAsFactors = FALSE)
+data <- u[, !(names(u) %in% c("node", "version", "frac"))]
+save(data, file = "RData/userstats-bridge-combined.RData")
 
diff --git a/shared/bin/80-run-clients-stats.sh b/shared/bin/80-run-clients-stats.sh
index a3efbe3..fe93e44 100755
--- a/shared/bin/80-run-clients-stats.sh
+++ b/shared/bin/80-run-clients-stats.sh
@@ -21,13 +21,13 @@ python detector.py
 
 echo `date` "Merging censorship detector results."
 R --slave -f merge-clients.R > /dev/null 2>&1
-
-echo `date` "Splitting results file."
-R --slave -f split-clients.R > /dev/null 2>&1
 mkdir -p stats/
-cp clients*.csv stats/
+cp clients.csv stats/
 cp userstats-combined.csv stats/
 
+echo `date` "Saving results as .RData files."
+R --slave -f split-clients.R > /dev/null 2>&1
+
 echo `date` "Terminating."
 
 cd ../../
diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh
index 6daf22b..d236630 100755
--- a/shared/bin/99-copy-stats-files.sh
+++ b/shared/bin/99-copy-stats-files.sh
@@ -7,3 +7,6 @@ cp -a modules/hidserv/stats/hidserv.csv shared/stats/
 cp -a modules/clients/stats/clients*.csv shared/stats/
 cp -a modules/clients/stats/userstats-combined.csv shared/stats/
 
+mkdir -p shared/RData
+cp -a modules/clients/RData/*.RData shared/RData/
+
diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R
index 6f7e119..9950a15 100644
--- a/website/rserve/graphs.R
+++ b/website/rserve/graphs.R
@@ -765,15 +765,15 @@ plot_bandwidth_flags <- function(start, end, path) {
 plot_userstats <- function(start, end, node, variable, value, events,
                            path) {
   end <- min(end, as.character(Sys.Date() - 2))
-  c <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/stats/",
-                "clients-", node, ".csv", sep = ""),
-                stringsAsFactors = FALSE)
+  load(paste("/srv/metrics.torproject.org/metrics/shared/RData/clients-",
+             node, ".RData", sep = ""))
+  c <- data
   u <- c[c$date >= start & c$date <= end, ]
   u <- rbind(u, data.frame(date = start,
       country = ifelse(variable == 'country' & value != 'all', value, ''),
       transport = ifelse(variable == 'transport', value, ''),
       version = ifelse(variable == 'version', value, ''),
-      lower = 0, upper = 0, clients = 0, frac = 0))
+      lower = 0, upper = 0, clients = 0))
   if (node == 'relay') {
     if (value != 'all') {
       u <- u[u$country == value, ]
@@ -798,8 +798,7 @@ plot_userstats <- function(start, end, node, variable, value, events,
       u <- rbind(u, data.frame(date = n$date,
                                country = '', transport = '!<OR>',
                                version = '', lower = n$lower,
-                               upper = n$upper, clients = n$clients,
-                               frac = NA))
+                               upper = n$upper, clients = n$clients))
     }
     if (length(value) > 1) {
       u <- u[u$transport %in% value, ]
@@ -926,9 +925,9 @@ plot_userstats_bridge_combined <- function(start, end, country, path) {
     top <- 3
     country <- ifelse(country == "all", NA, country)
     end <- min(end, as.character(Sys.Date() - 2))
-    u <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/",
-                        "stats/userstats-combined.csv", sep = ""),
-                  stringsAsFactors = FALSE)
+    load(paste("/srv/metrics.torproject.org/metrics/shared/RData/",
+               "userstats-bridge-combined.RData", sep = ""))
+    u <- data
     u <- u[u$date >= start & u$date <= end
            & (is.na(country) | u$country == country), ]
     a <- aggregate(list(mid = (u$high + u$low) / 2),



More information about the tor-commits mailing list