[tor-commits] [metrics-web/release] Refactor even more of the graphing code.

karsten at torproject.org karsten at torproject.org
Wed May 30 13:45:12 UTC 2018


commit 5c8e727b61f516ddb30d2400d6f3c97462443801
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Mar 1 12:27:07 2018 +0100

    Refactor even more of the graphing code.
---
 src/main/R/rserver/graphs.R | 428 +++++++++++++++++---------------------------
 1 file changed, 167 insertions(+), 261 deletions(-)

diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 8847967..93ff919 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -329,28 +329,20 @@ stats_dir = "/srv/metrics.torproject.org/metrics/shared/stats/"
 rdata_dir = "/srv/metrics.torproject.org/metrics/shared/RData/"
 
 prepare_networksize <- function(start, end) {
-  s <- read.csv(paste(stats_dir, "servers.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  s <- s[s$date >= start & s$date <= end & s$flag == '' &
-         s$country == '' & s$version == '' & s$platform == '' &
-         s$ec2bridge == '', ]
-  s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), relays = s$relays,
-                  bridges = s$bridges)
-  s
+  read.csv(paste(stats_dir, "servers.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), flag == '',
+      country == '', version == '', platform == '', ec2bridge == '') %>%
+    select(date, relays, bridges)
 }
 
 plot_networksize <- function(start, end, path) {
-  s <- prepare_networksize(start, end)
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by="1 day")
-  missing <- setdiff(dates, as.Date(s$date, origin = "1970-01-01"))
-  if (length(missing) > 0)
-    s <- rbind(s,
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        relays = NA, bridges = NA))
-  networksize <- gather(s, variable, value, -date)
-  ggplot(networksize, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
-    colour = variable)) + geom_line() +
+  prepare_networksize(start, end) %>%
+    gather(variable, value, -date) %>%
+    complete(date = full_seq(date, period = 1),
+      variable = c("relays", "bridges")) %>%
+    ggplot(aes(x = date, y = value, colour = variable)) +
+    geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
@@ -367,14 +359,11 @@ write_networksize <- function(start, end, path) {
 }
 
 prepare_versions <- function(start, end) {
-  s <- read.csv(paste(stats_dir, "servers.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  s <- s[s$date >= start & s$date <= end & s$flag == '' &
-         s$country == '' & s$version != '' & s$platform == '' &
-         s$ec2bridge == '', ]
-  s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), version = s$version,
-                  relays = s$relays)
-  s
+  read.csv(paste(stats_dir, "servers.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), flag == '',
+      country == '', version != '', platform == '', ec2bridge == '') %>%
+    select(date, version, relays)
 }
 
 plot_versions <- function(start, end, path) {
@@ -389,8 +378,7 @@ plot_versions <- function(start, end, path) {
     stringsAsFactors = FALSE)
   versions <- s[s$version %in% known_versions, ]
   visible_versions <- sort(unique(versions$version))
-  ggplot(versions, aes(x = as.Date(date, "%Y-%m-%d"), y = relays,
-      colour = version)) +
+  ggplot(versions, aes(x = date, y = relays, colour = version)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -410,21 +398,18 @@ write_versions <- function(start, end, path) {
 }
 
 prepare_platforms <- function(start, end) {
-  s <- read.csv(paste(stats_dir, "servers.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  s <- s[s$date >= start & s$date <= end & s$flag == '' &
-         s$country == '' & s$version == '' & s$platform != '' &
-         s$ec2bridge == '', ]
-  platforms <- data.frame(date = as.Date(s$date, "%Y-%m-%d"),
-      variable = ifelse(s$platform == "Darwin", "macOS", s$platform),
-      value = s$relays)
-  platforms
+  read.csv(paste(stats_dir, "servers.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), flag == '',
+      country == '', version == '', platform != '', ec2bridge == '') %>%
+    select(date, platform, relays) %>%
+    mutate(platform = ifelse(platform == "Darwin", "macOS",
+      as.character(platform)))
 }
 
 plot_platforms <- function(start, end, path) {
-  platforms <- prepare_platforms(start, end)
-  ggplot(platforms, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
-      colour = variable)) +
+  prepare_platforms(start, end) %>%
+    ggplot(aes(x = date, y = relays, colour = platform)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -439,33 +424,31 @@ plot_platforms <- function(start, end, path) {
 
 write_platforms <- function(start, end, path) {
   prepare_platforms(start, end) %>%
-    spread(variable, value) %>%
+    spread(platform, relays) %>%
     write.csv(path, quote = FALSE, row.names = FALSE)
 }
 
 prepare_bandwidth <- function(start, end) {
-  b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  b <- b[b$date >= start & b$date <= end & b$isexit == '' &
-         b$isguard == '', ]
-  b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
-                  bwadv = b$advbw * 8 / 1e9,
-                  bwhist = (b$bwread + b$bwwrite) * 8 / 2e9)
-  b
+  read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), isexit == '',
+      isguard == '') %>%
+    mutate(advbw = advbw * 8 / 1e9) %>%
+    mutate(bwhist = (bwread + bwwrite) * 8 / 2e9) %>%
+    select(date, advbw, bwhist)
 }
 
 plot_bandwidth <- function(start, end, path) {
-  b <- prepare_bandwidth(start, end)
-  bandwidth <- gather(b, variable, value, -date)
-  ggplot(bandwidth, aes(x = as.Date(date, "%Y-%m-%d"),
-      y = value, colour = variable)) +
+  prepare_bandwidth(start, end) %>%
+    gather(variable, value, -date) %>%
+    ggplot(aes(x = date, y = value, colour = variable)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = unit_format(unit = "Gbit/s"),
       limits = c(0, NA)) +
     scale_colour_hue(name = "", h.start = 90,
-        breaks = c("bwadv", "bwhist"),
+        breaks = c("advbw", "bwhist"),
         labels = c("Advertised bandwidth", "Bandwidth history")) +
     ggtitle("Total relay bandwidth") +
     labs(caption = copyright_notice) +
@@ -479,37 +462,22 @@ write_bandwidth <- function(start, end, path) {
 }
 
 prepare_bwhist_flags <- function(start, end) {
-  b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  b <- b[b$date >= start & b$date <= end & b$isexit != '' &
-         b$isguard != '', ]
-  bw <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
-                  isexit = b$isexit == 't', isguard = b$isguard == 't',
-                  read = b$bwread, written = b$bwwrite)
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by = "1 day")
-  missing <- setdiff(dates, as.Date(bw$date, origin = "1970-01-01"))
-  if (length(missing) > 0)
-    bw <- rbind(bw,
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        isexit = FALSE, isguard = FALSE, read = NA, written = NA),
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        isexit = FALSE, isguard = TRUE, read = NA, written = NA),
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        isexit = TRUE, isguard = FALSE, read = NA, written = NA),
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        isexit = TRUE, isguard = TRUE, read = NA, written = NA))
-  bw <- data.frame(date = bw$date, variable = ifelse(bw$isexit,
-        ifelse(bw$isguard, "guard_and_exit", "exit_only"),
-        ifelse(bw$isguard, "guard_only", "middle_only")),
-        value = (bw$read + bw$written) * 8 / 2e9)
-  bw
+  read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), isexit != '',
+      isguard != '') %>%
+    mutate(variable = ifelse(isexit == 't',
+        ifelse(isguard == 't', "guard_and_exit", "exit_only"),
+        ifelse(isguard == 't', "guard_only", "middle_only")),
+      value = (bwread + bwwrite) * 8 / 2e9) %>%
+    select(date, variable, value)
 }
 
 plot_bwhist_flags <- function(start, end, path) {
-  bw <- prepare_bwhist_flags(start, end)
-  ggplot(bw, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
-      colour = variable)) +
+  prepare_bwhist_flags(start, end) %>%
+    complete(date = full_seq(date, period = 1),
+      variable = unique(variable)) %>%
+    ggplot(aes(x = date, y = value, colour = variable)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -532,21 +500,19 @@ write_bwhist_flags <- function(start, end, path) {
 }
 
 prepare_dirbytes <- function(start, end, path) {
-  b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  b <- b[b$date >= start & b$date <= end & b$isexit == '' &
-         b$isguard == '', ]
-  b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
-                  dirread = b$dirread * 8 / 1e9,
-                  dirwrite = b$dirwrite * 8 / 1e9)
-  b
+  read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), isexit == '',
+      isguard == '') %>%
+    mutate(dirread = dirread * 8 / 1e9,
+      dirwrite = dirwrite * 8 / 1e9) %>%
+    select(date, dirread, dirwrite)
 }
 
 plot_dirbytes <- function(start, end, path) {
-  b <- prepare_dirbytes(start, end)
-  dir <- gather(b, variable, value, -date)
-  ggplot(dir, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
-      colour = variable)) +
+  prepare_dirbytes(start, end) %>%
+    gather(variable, value, -date) %>%
+    ggplot(aes(x = date, y = value, colour = variable)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -567,33 +533,20 @@ write_dirbytes <- function(start, end, path) {
 }
 
 prepare_relayflags <- function(start, end, flags) {
-  s <- read.csv(paste(stats_dir, "servers.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  s <- s[s$date >= start & s$date <= end & s$country == '' &
-         s$version == '' & s$platform == '' & s$ec2bridge == '', ]
-  s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"),
-                  variable = ifelse(s$flag == '', 'Running', s$flag),
-                  value = s$relays)
-  networksize <- s[s$variable %in% flags, ]
-  networksize
+  read.csv(paste(stats_dir, "servers.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end), country == '',
+      version == '', platform == '', ec2bridge == '') %>%
+    mutate(flag = ifelse(flag == '', 'Running', as.character(flag))) %>%
+    filter(flag %in% flags) %>%
+    select(date, flag, relays)
 }
 
 plot_relayflags <- function(start, end, flags, path) {
-  networksize <- prepare_relayflags(start, end, flags)
-  networksize <- rbind(data.frame(
-    date = as.Date(end) + 1,
-    variable = c("Running", "Exit", "Guard", "Fast", "Stable", "HSDir"),
-    value = NA), networksize)
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by="1 day")
-  missing <- setdiff(dates, networksize$date)
-  if (length(missing) > 0)
-    networksize <- rbind(data.frame(
-      date = as.Date(rep(missing, 6), origin = "1970-01-01"),
-      variable = c("Running", "Exit", "Guard", "Fast", "Stable", "HSDir"),
-      value = rep(NA, length(missing) * 6)), networksize)
-  ggplot(networksize, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
-    colour = as.factor(variable))) + geom_line() +
+  prepare_relayflags(start, end, flags) %>%
+    complete(date = full_seq(date, period = 1), flag = unique(flag)) %>%
+    ggplot(aes(x = date, y = relays, colour = as.factor(flag))) +
+    geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
@@ -616,32 +569,24 @@ plot_torperf <- function(start, end, source, server, filesize, path) {
   filesizeVal <- ifelse(filesize == '50kb', 50 * 1024,
           ifelse(filesize == '1mb', 1024 * 1024, 5 * 1024 * 1024))
   t <- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
-    stringsAsFactors = FALSE)
+    colClasses = c("date" = "Date"))
   known_sources <- c("all", unique(t[t$source != "", "source"]))
   colours <- data.frame(source = known_sources,
       colour = brewer.pal(length(known_sources), "Paired"),
       stringsAsFactors = FALSE)
-  t <- t[t$date >= start & t$date <= end & t$filesize == filesizeVal &
-         t$source == ifelse(source == 'all', '', source) &
-         t$server == server, ]
-  torperf <- data.frame(date = as.Date(t$date, "%Y-%m-%d"),
-                        q1 = t$q1, md = t$md, q3 = t$q3)
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by="1 day")
-  missing <- setdiff(dates, torperf$date)
-  if (length(missing) > 0)
-    torperf <- rbind(torperf,
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        q1 = NA, md = NA, q3 = NA))
   colour <- colours[colours$source == source, "colour"]
   filesizes <- data.frame(filesizes = c("5mb", "1mb", "50kb"),
       label = c("5 MiB", "1 MiB", "50 KiB"), stringsAsFactors = FALSE)
   filesizeStr <- filesizes[filesizes$filesize == filesize, "label"]
-  ggplot(torperf, aes(x = as.Date(date, "%Y-%m-%d"), y = md/1e3,
-      fill = "line")) +
+  t[t$date >= as.Date(start) & t$date <= as.Date(end) &
+         t$filesize == filesizeVal &
+         t$source == ifelse(source == 'all', '', source) &
+         t$server == server, ] %>%
+    transmute(date, q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>%
+    complete(date = full_seq(date, period = 1)) %>%
+    ggplot(aes(x = date, y = md, fill = "line")) +
     geom_line(colour = colour, size = 0.75) +
-    geom_ribbon(data = torperf, aes(x = date, ymin = q1/1e3,
-      ymax = q3/1e3, fill = "ribbon")) +
+    geom_ribbon(aes(x = date, ymin = q1, ymax = q3, fill = "ribbon")) +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = unit_format(unit = "s"),
@@ -671,8 +616,7 @@ write_torperf <- function(start, end, source_, server_, filesize_, path) {
         ifelse(filesize_ == '1mb', 1024 * 1024, 5 * 1024 * 1024)),
       source == ifelse(source_ == 'all', '', source_),
       server == server_) %>%
-    select(date, q1, md, q3) %>%
-    mutate(q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>%
+    transmute(date, q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>%
     write.csv(path, quote = FALSE, row.names = FALSE)
 }
 
@@ -680,31 +624,21 @@ prepare_torperf_failures <- function(start, end, source, server, filesize) {
   filesizeVal <- ifelse(filesize == '50kb', 50 * 1024,
           ifelse(filesize == '1mb', 1024 * 1024, 5 * 1024 * 1024))
   t <- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  t <- t[t$date >= start & t$date <= end & t$filesize == filesizeVal &
+    colClasses = c("date" = "Date"))
+  t[t$date >= start & t$date <= end & t$filesize == filesizeVal &
          t$source == ifelse(source == 'all', '', source) &
-         t$server == server & t$requests > 0, ]
-  torperf <- data.frame(date = as.Date(t$date, "%Y-%m-%d"),
-                        timeouts = t$timeouts / t$requests,
-                        failures = t$failures / t$requests)
-  torperf
+         t$server == server & t$requests > 0, ] %>%
+  transmute(date, timeouts = timeouts / requests,
+    failures = failures / requests)
 }
 
 plot_torperf_failures <- function(start, end, source, server, filesize, path) {
-  torperf <- prepare_torperf_failures(start, end, source, server, filesize)
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by="1 day")
-  missing <- setdiff(dates, torperf$date)
-  if (length(missing) > 0)
-    torperf <- rbind(torperf,
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        timeouts = NA, failures = NA))
-  torperf <- gather(torperf, variable, value, -date)
   filesizes <- data.frame(filesizes = c("5mb", "1mb", "50kb"),
       label = c("5 MiB", "1 MiB", "50 KiB"), stringsAsFactors = FALSE)
   filesizeStr <- filesizes[filesizes$filesize == filesize, "label"]
-  ggplot(torperf, aes(x = as.Date(date, "%Y-%m-%d"), y = value,
-    colour = variable)) +
+  prepare_torperf_failures(start, end, source, server, filesize) %>%
+    gather(variable, value, -date) %>%
+    ggplot(aes(x = date, y = value, colour = variable)) +
     geom_point(size = 2) +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -726,21 +660,17 @@ write_torperf_failures <- function(start, end, source, server, filesize, path) {
 }
 
 prepare_connbidirect <- function(start, end) {
-  c <- read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  c <- c[c$date >= start & c$date <= end, ]
-  c <- data.frame(date = as.Date(c$date),
-                  direction = factor(c$direction,
-                              levels = c("both", "write", "read")),
-                  quantile = paste("X", c$quantile, sep = ""),
-                  fraction = c$fraction / 100)
-  c <- spread(c, quantile, fraction)
-  c
+  read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""),
+    colClasses = c("date" = "Date", "direction" = "factor")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end)) %>%
+    mutate(quantile = paste("X", quantile, sep = ""),
+      fraction = fraction / 100) %>%
+    spread(quantile, fraction)
 }
 
 plot_connbidirect <- function(start, end, path) {
-  c <- prepare_connbidirect(start, end)
-  ggplot(c, aes(x = date, y = X0.5, colour = direction)) +
+  prepare_connbidirect(start, end) %>%
+    ggplot(aes(x = date, y = X0.5, colour = direction)) +
     geom_line(size = 0.75) +
     geom_ribbon(aes(x = date, ymin = X0.25, ymax = X0.75,
                 fill = direction), alpha = 0.5, show.legend = FALSE) +
@@ -772,10 +702,10 @@ write_connbidirect <- function(start, end, path) {
 
 prepare_bandwidth_flags <- function(start, end) {
   b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
-    stringsAsFactors = FALSE)
+    colClasses = c("date" = "Date"))
   b <- b[b$date >= start & b$date <= end & b$isexit != '' &
          b$isguard != '', ]
-  b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"),
+  b <- data.frame(date = b$date,
                   isexit = b$isexit == 't', isguard = b$isguard == 't',
                   advbw = b$advbw * 8 / 1e9,
                   bwhist = (b$bwread + b$bwwrite) * 8 / 2e9)
@@ -789,21 +719,6 @@ prepare_bandwidth_flags <- function(start, end) {
                  na.rm = TRUE, na.action = NULL)
   b <- gather(b, type, value, -c(date, flag))
   bandwidth <- b[b$value > 0, ]
-  dates <- seq(from = as.Date(start, "%Y-%m-%d"),
-      to = as.Date(end, "%Y-%m-%d"), by = "1 day")
-  missing <- setdiff(dates, as.Date(bandwidth$date,
-    origin = "1970-01-01"))
-  if (length(missing) > 0) {
-    bandwidth <- rbind(bandwidth,
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        type = "advbw", flag = "exit", value = NA),
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        type = "bwhist", flag = "exit", value = NA),
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        type = "advbw", flag = "guard", value = NA),
-        data.frame(date = as.Date(missing, origin = "1970-01-01"),
-        type = "bwhist", flag = "guard", value = NA))
-  }
   bandwidth <- data.frame(date = bandwidth$date,
     variable = as.factor(paste(bandwidth$flag, "_", bandwidth$type,
     sep = "")), value = bandwidth$value)
@@ -813,9 +728,10 @@ prepare_bandwidth_flags <- function(start, end) {
 }
 
 plot_bandwidth_flags <- function(start, end, path) {
-  bandwidth <- prepare_bandwidth_flags(start, end)
-  ggplot(bandwidth, aes(x = as.Date(date, "%Y-%m-%d"),
-      y = value, colour = variable)) +
+  prepare_bandwidth_flags(start, end) %>%
+    complete(date = full_seq(date, period = 1),
+      variable = unique(variable)) %>%
+    ggplot(aes(x = date, y = value, colour = variable)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1107,29 +1023,27 @@ write_userstats_bridge_combined <- function(start, end, country, path) {
 }
 
 prepare_advbwdist_perc <- function(start, end, p) {
-  t <- read.csv(paste(stats_dir,
-                "advbwdist.csv", sep = ""), stringsAsFactors = FALSE)
-  t <- t[t$date >= start & t$date <= end &
-         t$percentile %in% as.numeric(p), ]
-  t <- data.frame(date = t$date, percentile = as.factor(t$percentile),
-                  variable = ifelse(t$isexit != "t", "all", "exits"),
-                  advbw = t$advbw * 8 / 1e9)
-  t
+  read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end),
+      percentile %in% as.numeric(p)) %>%
+    transmute(date, percentile = as.factor(percentile),
+      variable = ifelse(isexit != "t", "all", "exits"),
+      advbw = advbw * 8 / 1e9)
 }
 
 plot_advbwdist_perc <- function(start, end, p, path) {
-  t <- prepare_advbwdist_perc(start, end, p)
-  t$variable <- ifelse(t$variable == "all", "All relays",
-                       "Exits only")
-  ggplot(t, aes(x = as.Date(date), y = advbw, colour = percentile)) +
+  prepare_advbwdist_perc(start, end, p) %>%
+    mutate(variable = ifelse(variable == "all", "All relays",
+      "Exits only")) %>%
+    ggplot(aes(x = date, y = advbw, colour = percentile)) +
     facet_grid(variable ~ .) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = unit_format(unit = "Gbit/s"),
       limits = c(0, NA)) +
-    scale_colour_hue(name = "Percentile",
-        breaks = rev(levels(t$percentile))) +
+    scale_colour_hue(name = "Percentile") +
     ggtitle("Advertised bandwidth distribution") +
     labs(caption = copyright_notice)
   ggsave(filename = path, width = 8, height = 5, dpi = 150)
@@ -1143,27 +1057,27 @@ write_advbwdist_perc <- function(start, end, p, path) {
 }
 
 prepare_advbwdist_relay <- function(start, end, n) {
-  t <- read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  t <- t[t$date >= start & t$date <= end & t$relay %in% as.numeric(n), ]
-  t <- data.frame(date = t$date, relay = as.factor(t$relay),
-                  variable = ifelse(t$isexit != "t", "all", "exits"),
-                  advbw = t$advbw * 8 / 1e9)
-  t
+  read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end),
+      relay %in% as.numeric(n)) %>%
+    transmute(date, relay = as.factor(relay),
+      variable = ifelse(isexit != "t", "all", "exits"),
+      advbw = advbw * 8 / 1e9)
 }
 
 plot_advbwdist_relay <- function(start, end, n, path) {
-  t <- prepare_advbwdist_relay(start, end, n)
-  t$variable <- ifelse(t$variable == "all", "All relays",
-                       "Exits only")
-  ggplot(t, aes(x = as.Date(date), y = advbw, colour = relay)) +
+  prepare_advbwdist_relay(start, end, n) %>%
+    mutate(variable = ifelse(variable == "all", "All relays",
+      "Exits only")) %>%
+    ggplot(aes(x = date, y = advbw, colour = relay)) +
     facet_grid(variable ~ .) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = unit_format(unit = "Gbit/s"),
       limits = c(0, NA)) +
-    scale_colour_hue(name = "n", breaks = levels(t$relay)) +
+    scale_colour_hue(name = "n") +
     ggtitle("Advertised bandwidth of n-th fastest relays") +
     labs(caption = copyright_notice)
   ggsave(filename = path, width = 8, height = 5, dpi = 150)
@@ -1177,17 +1091,16 @@ write_advbwdist_relay <- function(start, end, n, path) {
 }
 
 prepare_hidserv_dir_onions_seen <- function(start, end) {
-  h <- read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  h <- h[h$date >= start & h$date <= end & h$type == "dir-onions-seen", ]
-  h <- data.frame(date = as.Date(h$date, "%Y-%m-%d"),
-                  onions = ifelse(h$frac >= 0.01, h$wiqm, NA))
-  h
+  read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end),
+      type == "dir-onions-seen") %>%
+    transmute(date = date, onions = ifelse(frac >= 0.01, wiqm, NA))
 }
 
 plot_hidserv_dir_onions_seen <- function(start, end, path) {
-  h <- prepare_hidserv_dir_onions_seen(start, end)
-  ggplot(h, aes(x = as.Date(date, origin = "1970-01-01"), y = onions)) +
+  prepare_hidserv_dir_onions_seen(start, end) %>%
+    ggplot(aes(x = date, y = onions)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1203,19 +1116,17 @@ write_hidserv_dir_onions_seen <- function(start, end, path) {
 }
 
 prepare_hidserv_rend_relayed_cells <- function(start, end) {
-  h <- read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  h <- h[h$date >= start & h$date <= end &
-         h$type == "rend-relayed-cells", ]
-  h <- data.frame(date = as.Date(h$date, "%Y-%m-%d"),
-                  relayed = ifelse(h$frac >= 0.01,
-                                   h$wiqm * 8 * 512 / (86400 * 1e9), NA))
-  h
+  read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end),
+      type == "rend-relayed-cells") %>%
+    transmute(date,
+      relayed = ifelse(frac >= 0.01, wiqm * 8 * 512 / (86400 * 1e9), NA))
 }
 
 plot_hidserv_rend_relayed_cells <- function(start, end, path) {
-  h <- prepare_hidserv_rend_relayed_cells(start, end)
-  ggplot(h, aes(x = as.Date(date, origin = "1970-01-01"), y = relayed)) +
+  prepare_hidserv_rend_relayed_cells(start, end) %>%
+    ggplot(aes(x = date, y = relayed)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -1232,18 +1143,15 @@ write_hidserv_rend_relayed_cells <- function(start, end, path) {
 }
 
 prepare_hidserv_frac_reporting <- function(start, end) {
-  h <- read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  h <- h[h$date >= start & h$date <= end, ]
-  h <- data.frame(date = as.Date(h$date, "%Y-%m-%d"),
-                  frac = h$frac, type = h$type)
-  h
+  read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
+    colClasses = c("date" = "Date")) %>%
+    filter(date >= as.Date(start), date <= as.Date(end)) %>%
+    select(date, frac, type)
 }
 
 plot_hidserv_frac_reporting <- function(start, end, path) {
-  h <- prepare_hidserv_frac_reporting(start, end)
-  ggplot(h, aes(x = as.Date(date, origin = "1970-01-01"), y = frac,
-      colour = type)) +
+  prepare_hidserv_frac_reporting(start, end) %>%
+    ggplot(aes(x = date, y = frac, colour = type)) +
     geom_line() +
     geom_hline(yintercept = 0.01, linetype = 2) +
     scale_x_date(name = "", breaks = custom_breaks,
@@ -1269,10 +1177,9 @@ write_hidserv_frac_reporting <- function(start, end, path) {
 
 prepare_webstats_tb <- function(start, end) {
   load(paste(rdata_dir, "webstats-tb.RData", sep = ""))
-  d <- data
-  d <- d[d$log_date >= start & d$log_date <= end, ]
-  d$request_type <- factor(d$request_type)
-  d
+  data %>%
+    filter(log_date >= as.Date(start), log_date <= as.Date(end)) %>%
+    mutate(request_type = factor(request_type))
 }
 
 plot_webstats_tb <- function(start, end, path) {
@@ -1306,17 +1213,17 @@ write_webstats_tb <- function(start, end, path) {
 }
 
 prepare_webstats_tb_platform <- function(start, end) {
-  d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""),
-    stringsAsFactors = FALSE)
-  d <- d[d$log_date >= start & d$log_date <= end & d$request_type == 'tbid', ]
-  d <- aggregate(list(count = d$count), by = list(log_date = as.Date(d$log_date),
-    platform = d$platform), FUN = sum)
-  d
+  read.csv(paste(stats_dir, "webstats.csv", sep = ""),
+    colClasses = c("log_date" = "Date")) %>%
+    filter(log_date >= as.Date(start), log_date <= as.Date(end),
+      request_type == 'tbid') %>%
+    group_by(log_date, platform) %>%
+    summarize(count = sum(count))
 }
 
 plot_webstats_tb_platform <- function(start, end, path) {
-  d <- prepare_webstats_tb_platform(start, end)
-  ggplot(d, aes(x = log_date, y = count, colour = platform)) +
+  prepare_webstats_tb_platform(start, end) %>%
+    ggplot(aes(x = log_date, y = count, colour = platform)) +
     geom_point() +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
@@ -1342,13 +1249,13 @@ write_webstats_tb_platform <- function(start, end, path) {
 
 plot_webstats_tb_locale <- function(start, end, path) {
   d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""),
-    stringsAsFactors = FALSE)
+    colClasses = c("log_date" = "Date"))
   d <- d[d$log_date >= start & d$log_date <= end & d$request_type == 'tbid', ]
   e <- d
   e <- aggregate(list(count = e$count), by = list(locale = e$locale), FUN = sum)
   e <- e[order(e$count, decreasing = TRUE), ]
   e <- e[1:5, ]
-  d <- aggregate(list(count = d$count), by = list(log_date = as.Date(d$log_date),
+  d <- aggregate(list(count = d$count), by = list(log_date = d$log_date,
     locale = ifelse(d$locale %in% e$locale, d$locale, '(other)')), FUN = sum)
   ggplot(d, aes(x = log_date, y = count, colour = locale)) +
     geom_point() +
@@ -1373,13 +1280,13 @@ plot_webstats_tb_locale <- function(start, end, path) {
 # breaks and labels. Left as future work.
 write_webstats_tb_locale <- function(start, end, path) {
   d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""),
-    stringsAsFactors = FALSE)
+    colClasses = c("log_date" = "Date"))
   d <- d[d$log_date >= start & d$log_date <= end & d$request_type == 'tbid', ]
   e <- d
   e <- aggregate(list(count = e$count), by = list(locale = e$locale), FUN = sum)
   e <- e[order(e$count, decreasing = TRUE), ]
   e <- e[1:5, ]
-  d <- aggregate(list(count = d$count), by = list(log_date = as.Date(d$log_date),
+  d <- aggregate(list(count = d$count), by = list(log_date = d$log_date,
     locale = ifelse(d$locale %in% e$locale, d$locale, 'other')), FUN = sum)
   d %>%
     mutate(locale = tolower(locale)) %>%
@@ -1390,10 +1297,9 @@ write_webstats_tb_locale <- function(start, end, path) {
 
 prepare_webstats_tm <- function(start, end) {
   load(paste(rdata_dir, "webstats-tm.RData", sep = ""))
-  d <- data
-  d <- d[d$log_date >= start & d$log_date <= end, ]
-  d$request_type <- factor(d$request_type)
-  d
+  data %>%
+    filter(log_date >= as.Date(start), log_date <= as.Date(end)) %>%
+    mutate(request_type = factor(request_type))
 }
 
 plot_webstats_tm <- function(start, end, path) {





More information about the tor-commits mailing list