commit ab7d546a9dae35efdfc2c1f8c4a09e473df72747
Author: iwakeh <iwakeh@torproject.org>
Date:   Tue Feb 13 19:53:15 2018 +0000
Tune R processing in advbwdist module.
Processing advbwdist-validafter.csv (350M) took 150 seconds and used up to 7G of memory. Performing pre-processing separately, helping R by defining read types, and avoiding multiple casting operations halved the processing time (to 77 seconds) and reduced the necessary memory to about 25% (approx. 1.8G). The resulting advbwdist.csv files are identical.
Avoid casting to 'Date' and make the implicit cast explicit. This reliably saves 10 seconds of processing time and reduces memory use to less than 1.65G.
Total: processing time down to 44% and memory consumption down to 24%.
Also indent source code for readability.
---
 src/main/R/advbwdist/aggregate.R | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)
diff --git a/src/main/R/advbwdist/aggregate.R b/src/main/R/advbwdist/aggregate.R index ee52a64..1c67dff 100644 --- a/src/main/R/advbwdist/aggregate.R +++ b/src/main/R/advbwdist/aggregate.R @@ -1,16 +1,25 @@ require(reshape) -t <- read.csv("stats/advbwdist-validafter.csv", stringsAsFactors = FALSE) -t <- t[t$valid_after < paste(Sys.Date() - 1, "23:59:59"), ] -t <- aggregate(list(advbw = as.numeric(t$advbw)), - by = list(date = as.Date(cut.Date(as.Date(t$valid_after), "day")), - isexit = !is.na(t$isexit), relay = ifelse(is.na(t$relay), -1, t$relay), - percentile = ifelse(is.na(t$percentile), -1, t$percentile)), - FUN = median) -t <- data.frame(date = t$date, isexit = ifelse(t$isexit, "t", ""), - relay = ifelse(t$relay < 0, NA, t$relay), - percentile = ifelse(t$percentile < 0, NA, t$percentile), - advbw = floor(t$advbw)) +t <- read.csv("stats/advbwdist-validafter.csv", + colClasses = c("character", "logical", "integer", "integer", "integer"), + stringsAsFactors = FALSE) + +currSysDate <- paste(Sys.Date() - 1, "23:59:59") +t <- t[t$valid_after < currSysDate, ] +t$date <- as.factor(substr(t$valid_after, 1, 10)) +t$isexit <- !is.na(t$isexit) +t$relay <- ifelse(is.na(t$relay), -1, t$relay) +t$percentile <- ifelse(is.na(t$percentile), -1, t$percentile) + +t <- aggregate(list(advbw = t$advbw), by = list(date = t$date, + isexit = t$isexit, relay = t$relay, percentile = t$percentile), + FUN = median) + +t$isexit <- ifelse(t$isexit, "t", "") +t$relay <- ifelse(t$relay < 0, NA, t$relay) +t$percentile <- ifelse(t$percentile < 0, NA, t$percentile) +t$advbw <- floor(t$advbw) + t <- t[order(t$date, t$isexit, t$relay, t$percentile), ] -write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE, - na = "") + +write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE, na = "")
tor-commits@lists.torproject.org