[tor-commits] [metrics-web/release] Tune R processing in advbwdist module.

Wed May 30 13:45:11 UTC 2018

commit ab7d546a9dae35efdfc2c1f8c4a09e473df72747
Author: iwakeh <iwakeh at torproject.org>
Date:   Tue Feb 13 19:53:15 2018 +0000

    Tune R processing in advbwdist module.
    
    Processing advbwdist-validafter.csv (350M) took 150 seconds and used up to 7G.
    Performing pre-processing separately, helping R by defining read types, and
    avoiding multiple casting operations led to halving the processing time
    (to 77 seconds) and reducing the necessary memory to about 25% (approx. 1.8G).
    The resulting advbwdist.csv files are identical.
    
    Avoid casting to 'Date' and make the implicit cast explicit.  This reliably
    saves 10 seconds of processing time and reduces used memory to less than 1.65G.
    
    Total: processing time down to 44% and memory consumption down to 24%.
    
    Also indent source code for readability.
---
 src/main/R/advbwdist/aggregate.R | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/main/R/advbwdist/aggregate.R b/src/main/R/advbwdist/aggregate.R
index ee52a64..1c67dff 100644
--- a/src/main/R/advbwdist/aggregate.R
+++ b/src/main/R/advbwdist/aggregate.R
@@ -1,16 +1,25 @@
 require(reshape)
-t <- read.csv("stats/advbwdist-validafter.csv", stringsAsFactors = FALSE)
-t <- t[t$valid_after < paste(Sys.Date() - 1, "23:59:59"), ]
-t <- aggregate(list(advbw = as.numeric(t$advbw)),
-  by = list(date = as.Date(cut.Date(as.Date(t$valid_after), "day")),
-  isexit = !is.na(t$isexit), relay = ifelse(is.na(t$relay), -1, t$relay),
-  percentile = ifelse(is.na(t$percentile), -1, t$percentile)),
-  FUN = median)
-t <- data.frame(date = t$date, isexit = ifelse(t$isexit, "t", ""),
-  relay = ifelse(t$relay < 0, NA, t$relay),
-  percentile = ifelse(t$percentile < 0, NA, t$percentile),
-  advbw = floor(t$advbw))
+t <- read.csv("stats/advbwdist-validafter.csv",
+  colClasses = c("character", "logical", "integer", "integer", "integer"),
+  stringsAsFactors = FALSE)
+
+currSysDate <- paste(Sys.Date() - 1, "23:59:59")
+t <- t[t$valid_after < currSysDate, ]
+t$date <- as.factor(substr(t$valid_after, 1, 10))
+t$isexit <- !is.na(t$isexit)
+t$relay <- ifelse(is.na(t$relay), -1, t$relay)
+t$percentile <- ifelse(is.na(t$percentile), -1, t$percentile)
+
+t <- aggregate(list(advbw = t$advbw), by = list(date = t$date,
+    isexit = t$isexit, relay = t$relay, percentile = t$percentile),
+    FUN = median)
+
+t$isexit <- ifelse(t$isexit, "t", "")
+t$relay <- ifelse(t$relay < 0, NA, t$relay)
+t$percentile <- ifelse(t$percentile < 0, NA, t$percentile)
+t$advbw <- floor(t$advbw)
+
 t <- t[order(t$date, t$isexit, t$relay, t$percentile), ]
-write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE,
-  na = "")
+
+write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE, na = "")