[tor-commits] [metrics-web/master] Split up clients.csv for faster graphs.

karsten at torproject.org karsten at torproject.org
Wed Oct 26 13:36:10 UTC 2016


commit a91f2dc9f51c2bd0e7c20e13a84c66dcb60ccd3d
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Wed Oct 26 15:23:25 2016 +0200

    Split up clients.csv for faster graphs.
    
    Most client graphs (except for clients by country and transport) use
    the same clients.csv file as input.  That file has grown to 26M by
    now, and it seems wasteful to read numbers for clients connecting to
    bridges when graphing clients connecting to relays, and vice versa.
    Split up clients.csv into clients-relay.csv and clients-bridge.csv
    and take out the node column.
    
    Performance gain is 1.3 seconds for updating graphs on directly
    connecting clients and 2.0 seconds for graphs showing clients
    connecting via bridges.
---
 modules/clients/split-clients.R    |  6 ++++++
 shared/bin/80-run-clients-stats.sh |  5 ++++-
 shared/bin/99-copy-stats-files.sh  |  2 +-
 website/rserve/graphs.R            | 26 ++++++++++++--------------
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/modules/clients/split-clients.R b/modules/clients/split-clients.R
new file mode 100644
index 0000000..5f3cb74
--- /dev/null
+++ b/modules/clients/split-clients.R
@@ -0,0 +1,6 @@
+u <- read.csv("clients.csv", stringsAsFactors = FALSE)
+write.csv(u[u$node == 'relay', names(u) != "node"], 'clients-relay.csv',
+  quote = FALSE, row.names = FALSE, na = '')
+write.csv(u[u$node == 'bridge', names(u) != "node"], 'clients-bridge.csv',
+  quote = FALSE, row.names = FALSE, na = '')
+
diff --git a/shared/bin/80-run-clients-stats.sh b/shared/bin/80-run-clients-stats.sh
index b296c37..a3efbe3 100755
--- a/shared/bin/80-run-clients-stats.sh
+++ b/shared/bin/80-run-clients-stats.sh
@@ -21,8 +21,11 @@ python detector.py
 
 echo `date` "Merging censorship detector results."
 R --slave -f merge-clients.R > /dev/null 2>&1
+
+echo `date` "Splitting results file."
+R --slave -f split-clients.R > /dev/null 2>&1
 mkdir -p stats/
-cp clients.csv stats/
+cp clients*.csv stats/
 cp userstats-combined.csv stats/
 
 echo `date` "Terminating."
diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh
index 504216a..6daf22b 100755
--- a/shared/bin/99-copy-stats-files.sh
+++ b/shared/bin/99-copy-stats-files.sh
@@ -4,6 +4,6 @@ cp -a modules/legacy/stats/*.csv shared/stats/
 cp -a modules/connbidirect/stats/connbidirect2.csv shared/stats/
 cp -a modules/advbwdist/stats/advbwdist.csv shared/stats/
 cp -a modules/hidserv/stats/hidserv.csv shared/stats/
-cp -a modules/clients/stats/clients.csv shared/stats/
+cp -a modules/clients/stats/clients*.csv shared/stats/
 cp -a modules/clients/stats/userstats-combined.csv shared/stats/
 
diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R
index e3ccb06..6f7e119 100644
--- a/website/rserve/graphs.R
+++ b/website/rserve/graphs.R
@@ -766,21 +766,21 @@ plot_userstats <- function(start, end, node, variable, value, events,
                            path) {
   end <- min(end, as.character(Sys.Date() - 2))
   c <- read.csv(paste("/srv/metrics.torproject.org/metrics/shared/stats/",
-                "clients.csv", sep = ""), stringsAsFactors = FALSE)
+                "clients-", node, ".csv", sep = ""),
+                stringsAsFactors = FALSE)
   u <- c[c$date >= start & c$date <= end, ]
-  u <- rbind(u, data.frame(date = start, node = node,
+  u <- rbind(u, data.frame(date = start,
       country = ifelse(variable == 'country' & value != 'all', value, ''),
       transport = ifelse(variable == 'transport', value, ''),
       version = ifelse(variable == 'version', value, ''),
       lower = 0, upper = 0, clients = 0, frac = 0))
   if (node == 'relay') {
     if (value != 'all') {
-      u <- u[u$country == value & u$node == 'relay', ]
+      u <- u[u$country == value, ]
       title <- paste("Directly connecting users from ",
                      countryname(value), "\n", sep = "")
     } else {
-      u <- u[u$country == '' & u$transport == '' & u$version == '' &
-             u$node == 'relay', ]
+      u <- u[u$country == '', ]
       title <- "Directly connecting users\n"
     }
     u <- aggregate(list(lower = u$lower, upper = u$upper,
@@ -790,20 +790,19 @@ plot_userstats <- function(start, end, node, variable, value, events,
                    FUN = sum)
   } else if (variable == 'transport') {
     if ('!<OR>' %in% value) {
-      n <- u[u$transport != '' & u$transport != '<OR>' &
-             u$node == 'bridge', ]
+      n <- u[u$transport != '' & u$transport != '<OR>', ]
       n <- aggregate(list(lower = n$lower, upper = n$upper,
                           clients = n$clients),
                      by = list(date = n$date),
                      FUN = sum)
-      u <- rbind(u, data.frame(date = n$date, node = 'bridge',
+      u <- rbind(u, data.frame(date = n$date,
                                country = '', transport = '!<OR>',
                                version = '', lower = n$lower,
                                upper = n$upper, clients = n$clients,
                                frac = NA))
     }
     if (length(value) > 1) {
-      u <- u[u$transport %in% value & u$node == 'bridge', ]
+      u <- u[u$transport %in% value, ]
       u <- aggregate(list(lower = u$lower, upper = u$upper,
                           users = u$clients),
                      by = list(date = as.Date(u$date, "%Y-%m-%d"),
@@ -811,7 +810,7 @@ plot_userstats <- function(start, end, node, variable, value, events,
                      FUN = sum)
       title <- paste("Bridge users by transport\n")
     } else {
-      u <- u[u$transport == value & u$node == 'bridge', ]
+      u <- u[u$transport == value, ]
       u <- aggregate(list(lower = u$lower, upper = u$upper,
                           users = u$clients),
                      by = list(date = as.Date(u$date, "%Y-%m-%d"),
@@ -826,7 +825,7 @@ plot_userstats <- function(start, end, node, variable, value, events,
                paste('transport', value)))))), "\n", sep = "")
     }
   } else if (variable == 'version') {
-    u <- u[u$version == value & u$node == 'bridge', ]
+    u <- u[u$version == value, ]
     title <- paste("Bridge users using IP", value, "\n", sep = "")
     u <- aggregate(list(lower = u$lower, upper = u$upper,
                         users = u$clients),
@@ -835,12 +834,11 @@ plot_userstats <- function(start, end, node, variable, value, events,
                    FUN = sum)
   } else {
     if (value != 'all') {
-      u <- u[u$country == value & u$node == 'bridge', ]
+      u <- u[u$country == value, ]
       title <- paste("Bridge users from ", countryname(value),
                      "\n", sep = "")
     } else {
-      u <- u[u$country == '' & u$transport == '' & u$version == '' &
-             u$node == 'bridge', ]
+      u <- u[u$country == '' & u$transport == '' & u$version == '', ]
       title <- "Bridge users\n"
     }
     u <- aggregate(list(lower = u$lower, upper = u$upper,





More information about the tor-commits mailing list