[tor-commits] [metrics-web/release] Split up huge plot_userstats function.

karsten at torproject.org karsten at torproject.org
Sat Nov 9 21:45:06 UTC 2019


commit f55e63d986ed9c1054ce19ff0d4a19b1c0bce26d
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Jan 10 09:54:39 2019 +0100

    Split up huge plot_userstats function.
    
    The mere size of this function made it hard to impossible to refactor
    things to using more recent R packages dplyr and tidyr. Now there are
    four plot_userstats_* functions with accompanying prepare_userstats_*
    that make the corresponding write_userstats_* functions really small.
---
 src/main/R/rserver/graphs.R | 269 +++++++++++++++++++-------------------------
 1 file changed, 115 insertions(+), 154 deletions(-)

diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index d3ea90a..ba8862c 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -751,9 +751,9 @@ write_bandwidth_flags <- function(start_p = NULL, end_p = NULL, path_p) {
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
 }
 
-plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
-    events_p, path_p) {
-  c <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
+prepare_userstats_relay_country <- function(start_p, end_p, country_p,
+    events_p) {
+  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
       col_types = cols(
         date = col_date(format = ""),
         node = col_character(),
@@ -763,97 +763,26 @@ plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
         lower = col_double(),
         upper = col_double(),
         clients = col_double(),
-        frac = col_skip()),
+        frac = col_double()),
       na = character()) %>%
-    filter(node == node_p)
-  u <- c[c$date >= start_p & c$date <= end_p, c("date", "country", "transport",
-      "version", "lower", "upper", "clients")]
-  u <- rbind(u, data.frame(date = start_p,
-      country = ifelse(variable_p == "country" & value_p != "all", value_p, ""),
-      transport = ifelse(variable_p == "transport", value_p, ""),
-      version = ifelse(variable_p == "version", value_p, ""),
-      lower = 0, upper = 0, clients = 0))
-  if (node_p == "relay") {
-    if (value_p != "all") {
-      u <- u[u$country == value_p, ]
-      title <- paste("Directly connecting users from", countryname(value_p))
-    } else {
-      u <- u[u$country == "", ]
-      title <- "Directly connecting users"
-    }
-    u <- aggregate(list(lower = u$lower, upper = u$upper,
-                        users = u$clients),
-                   by = list(date = as.Date(u$date, "%Y-%m-%d"),
-                             value = u$country),
-                   FUN = sum)
-  } else if (variable_p == "transport") {
-    if ("!<OR>" %in% value_p) {
-      n <- u[u$transport != "" & u$transport != "<OR>", ]
-      n <- aggregate(list(lower = n$lower, upper = n$upper,
-                          clients = n$clients),
-                     by = list(date = n$date),
-                     FUN = sum)
-      u <- rbind(u, data.frame(date = n$date,
-                               country = "", transport = "!<OR>",
-                               version = "", lower = n$lower,
-                               upper = n$upper, clients = n$clients))
-    }
-    if (length(value_p) > 1) {
-      u <- u[u$transport %in% value_p, ]
-      u <- aggregate(list(lower = u$lower, upper = u$upper,
-                          users = u$clients),
-                     by = list(date = as.Date(u$date, "%Y-%m-%d"),
-                               value = u$transport),
-                     FUN = sum)
-      title <- paste("Bridge users by transport")
-    } else {
-      u <- u[u$transport == value_p, ]
-      u <- aggregate(list(lower = u$lower, upper = u$upper,
-                          users = u$clients),
-                     by = list(date = as.Date(u$date, "%Y-%m-%d"),
-                               value = u$transport),
-                     FUN = sum)
-      title <- paste("Bridge users using",
-               ifelse(value_p == "<??>", "unknown pluggable transport(s)",
-               ifelse(value_p == "<OR>", "default OR protocol",
-               ifelse(value_p == "!<OR>", "any pluggable transport",
-               ifelse(value_p == "fte", "FTE",
-               ifelse(value_p == "websocket", "Flash proxy/websocket",
-               paste("transport", value_p)))))))
-    }
-  } else if (variable_p == "version") {
-    u <- u[u$version == value_p, ]
-    title <- paste("Bridge users using IP", value_p, sep = "")
-    u <- aggregate(list(lower = u$lower, upper = u$upper,
-                        users = u$clients),
-                   by = list(date = as.Date(u$date, "%Y-%m-%d"),
-                             value = u$version),
-                   FUN = sum)
-  } else {
-    if (value_p != "all") {
-      u <- u[u$country == value_p, ]
-      title <- paste("Bridge users from", countryname(value_p))
-    } else {
-      u <- u[u$country == "" & u$transport == "" & u$version == "", ]
-      title <- "Bridge users"
-    }
-    u <- aggregate(list(lower = u$lower, upper = u$upper,
-                        users = u$clients),
-                   by = list(date = as.Date(u$date, "%Y-%m-%d"),
-                             value = u$country),
-                   FUN = sum)
-  }
-  u <- merge(x = u, all.y = TRUE, y = data.frame(expand.grid(
-             date = seq(from = as.Date(start_p, "%Y-%m-%d"),
-             to = as.Date(end_p, "%Y-%m-%d"), by = "1 day"),
-             value = ifelse(value_p == "all", "", value_p))))
-  if (length(value_p) > 1) {
-    plot <- ggplot(u, aes(x = date, y = users, colour = value))
-  } else {
-    plot <- ggplot(u, aes(x = date, y = users))
-  }
+    filter(node == "relay") %>%
+    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
+    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
+    filter(if (!is.null(country_p))
+      country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
+    filter(transport == "") %>%
+    filter(version == "") %>%
+    select(date, country, clients, lower, upper, frac) %>%
+    rename(users = clients)
+}
+
+plot_userstats_relay_country <- function(start_p, end_p, country_p, events_p,
+    path_p) {
+  u <- prepare_userstats_relay_country(start_p, end_p, country_p, events_p) %>%
+    complete(date = full_seq(date, period = 1))
+  plot <- ggplot(u, aes(x = date, y = users))
   if (length(na.omit(u$users)) > 0 & events_p != "off" &
-      variable_p == "country" & length(value_p) == 1 && value_p != "all") {
+      country_p != "all") {
     upturns <- u[u$users > u$upper, c("date", "users")]
     downturns <- u[u$users < u$lower, c("date", "users")]
     if (events_p == "on") {
@@ -875,69 +804,20 @@ plot_userstats <- function(start_p, end_p, node_p, variable_p, value_p,
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
     scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
-    ggtitle(title) +
+    ggtitle(paste("Directly connecting users",
+        ifelse(country_p == "all", "",
+        paste(" from", countryname(country_p))), sep = "")) +
     labs(caption = copyright_notice)
-  if (length(value_p) > 1) {
-    plot <- plot +
-      scale_colour_hue(name = "", breaks = value_p,
-            labels = ifelse(value_p == "<??>", "Unknown PT",
-                     ifelse(value_p == "<OR>", "Default OR protocol",
-                     ifelse(value_p == "!<OR>", "Any PT",
-                     ifelse(value_p == "fte", "FTE",
-                     ifelse(value_p == "websocket", "Flash proxy/websocket",
-                     value_p))))))
-  }
   ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
 }
 
-plot_userstats_relay_country <- function(start_p, end_p, country_p, events_p,
-    path_p) {
-  plot_userstats(start_p, end_p, "relay", "country", country_p, events_p,
-    path_p)
-}
-
-plot_userstats_bridge_country <- function(start_p, end_p, country_p, path_p) {
-  plot_userstats(start_p, end_p, "bridge", "country", country_p, "off", path_p)
-}
-
-plot_userstats_bridge_transport <- function(start_p, end_p, transport_p,
-    path_p) {
-  plot_userstats(start_p, end_p, "bridge", "transport", transport_p, "off",
-    path_p)
-}
-
-plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
-  plot_userstats(start_p, end_p, "bridge", "version", version_p, "off", path_p)
-}
-
 write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
     country_p = NULL, events_p = NULL, path_p) {
-  read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
-      col_types = cols(
-        date = col_date(format = ""),
-        node = col_character(),
-        country = col_character(),
-        transport = col_character(),
-        version = col_character(),
-        lower = col_double(),
-        upper = col_double(),
-        clients = col_double(),
-        frac = col_double()),
-      na = character()) %>%
-    filter(node == "relay") %>%
-    filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
-    filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
-    filter(if (!is.null(country_p))
-      country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
-    filter(transport == "") %>%
-    filter(version == "") %>%
-    select(date, country, clients, lower, upper, frac) %>%
-    rename(users = clients) %>%
+  prepare_userstats_relay_country(start_p, end_p, country_p, events_p) %>%
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
 }
 
-write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
-    country_p = NULL, path_p) {
+prepare_userstats_bridge_country <- function(start_p, end_p, country_p) {
   read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
       col_types = cols(
         date = col_date(format = ""),
@@ -958,12 +838,32 @@ write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
     filter(transport == "") %>%
     filter(version == "") %>%
     select(date, country, clients, frac) %>%
-    rename(users = clients) %>%
+    rename(users = clients)
+}
+
+plot_userstats_bridge_country <- function(start_p, end_p, country_p, path_p) {
+  prepare_userstats_bridge_country(start_p, end_p, country_p) %>%
+    complete(date = full_seq(date, period = 1)) %>%
+    ggplot(aes(x = date, y = users)) +
+    geom_line() +
+    scale_x_date(name = "", breaks = custom_breaks,
+      labels = custom_labels, minor_breaks = custom_minor_breaks) +
+    scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+    ggtitle(paste("Bridge users",
+        ifelse(country_p == "all", "",
+        paste(" from", countryname(country_p))), sep = "")) +
+    labs(caption = copyright_notice)
+  ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
+    country_p = NULL, path_p) {
+  prepare_userstats_bridge_country(start_p, end_p, country_p) %>%
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
 }
 
-write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
-    transport_p = NULL, path_p) {
+prepare_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
+    transport_p = NULL) {
   u <- read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
       col_types = cols(
         date = col_date(format = ""),
@@ -992,15 +892,58 @@ write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
   }
   u %>%
     filter(if (!is.null(transport_p)) transport %in% transport_p else TRUE) %>%
-    group_by(date, transport) %>%
     select(date, transport, clients, frac) %>%
     rename(users = clients) %>%
-    arrange(date, transport) %>%
+    arrange(date, transport)
+}
+
+plot_userstats_bridge_transport <- function(start_p, end_p, transport_p,
+    path_p) {
+  if (length(transport_p) > 1) {
+    title <- paste("Bridge users by transport")
+  } else {
+    title <- paste("Bridge users using",
+             ifelse(transport_p == "<??>", "unknown pluggable transport(s)",
+             ifelse(transport_p == "<OR>", "default OR protocol",
+             ifelse(transport_p == "!<OR>", "any pluggable transport",
+             ifelse(transport_p == "fte", "FTE",
+             ifelse(transport_p == "websocket", "Flash proxy/websocket",
+             paste("transport", transport_p)))))))
+  }
+  u <- prepare_userstats_bridge_transport(start_p, end_p, transport_p) %>%
+    complete(date = full_seq(date, period = 1), nesting(transport))
+  if (length(transport_p) > 1) {
+    plot <- ggplot(u, aes(x = date, y = users, colour = transport))
+  } else {
+    plot <- ggplot(u, aes(x = date, y = users))
+  }
+  plot <- plot +
+    geom_line() +
+    scale_x_date(name = "", breaks = custom_breaks,
+      labels = custom_labels, minor_breaks = custom_minor_breaks) +
+    scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+    ggtitle(title) +
+    labs(caption = copyright_notice)
+  if (length(transport_p) > 1) {
+    plot <- plot +
+      scale_colour_hue(name = "", breaks = transport_p,
+            labels = ifelse(transport_p == "<??>", "Unknown PT",
+                     ifelse(transport_p == "<OR>", "Default OR protocol",
+                     ifelse(transport_p == "!<OR>", "Any PT",
+                     ifelse(transport_p == "fte", "FTE",
+                     ifelse(transport_p == "websocket", "Flash proxy/websocket",
+                     transport_p))))))
+  }
+  ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+write_userstats_bridge_transport <- function(start_p = NULL, end_p = NULL,
+    transport_p = NULL, path_p) {
+  prepare_userstats_bridge_transport(start_p, end_p, transport_p) %>%
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
 }
 
-write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
-    version_p = NULL, path_p) {
+prepare_userstats_bridge_version <- function(start_p, end_p, version_p) {
   read_csv(file = paste(stats_dir, "clients.csv", sep = ""),
       col_types = cols(
         date = col_date(format = ""),
@@ -1019,7 +962,25 @@ write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
     filter(is.na(transport)) %>%
     filter(if (!is.null(version_p)) version == version_p else TRUE) %>%
     select(date, version, clients, frac) %>%
-    rename(users = clients) %>%
+    rename(users = clients)
+}
+
+plot_userstats_bridge_version <- function(start_p, end_p, version_p, path_p) {
+  prepare_userstats_bridge_version(start_p, end_p, version_p) %>%
+    complete(date = full_seq(date, period = 1)) %>%
+    ggplot(aes(x = date, y = users)) +
+    geom_line() +
+    scale_x_date(name = "", breaks = custom_breaks,
+      labels = custom_labels, minor_breaks = custom_minor_breaks) +
+    scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
+    ggtitle(paste("Bridge users using IP", version_p, sep = "")) +
+    labs(caption = copyright_notice)
+  ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
+}
+
+write_userstats_bridge_version <- function(start_p = NULL, end_p = NULL,
+    version_p = NULL, path_p) {
+  prepare_userstats_bridge_version(start_p, end_p, version_p) %>%
     write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
 }
 





More information about the tor-commits mailing list