[tor-commits] [metrics-web/release] Switch to readr's read_csv() everywhere.

karsten at torproject.org karsten at torproject.org
Sat Nov 9 21:45:06 UTC 2019


commit a94a3844644041f7c1f6e0a4451e19ce12cae9e8
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Jan 10 22:32:28 2019 +0100

    Switch to readr's read_csv() everywhere.
---
 src/main/R/rserver/graphs.R | 230 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 175 insertions(+), 55 deletions(-)

diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 82a51e7..205afbe 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -359,8 +359,11 @@ write_data <- function(FUN, ..., path_p) {
 options(readr.show_progress = FALSE)
 
 prepare_networksize <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "networksize.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "networksize.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        relays = col_double(),
+        bridges = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
 }
@@ -416,8 +419,11 @@ plot_versions <- function(start_p, end_p, path_p) {
 }
 
 prepare_platforms <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "platforms.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "platforms.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        platform = col_factor(levels = NULL),
+        relays = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     mutate(platform = tolower(platform)) %>%
@@ -443,12 +449,19 @@ plot_platforms <- function(start_p, end_p, path_p) {
 }
 
 prepare_dirbytes <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "bandwidth.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        isexit = col_logical(),
+        isguard = col_logical(),
+        bwread = col_skip(),
+        bwwrite = col_skip(),
+        dirread = col_double(),
+        dirwrite = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
-    filter(isexit == "") %>%
-    filter(isguard == "") %>%
+    filter(is.na(isexit)) %>%
+    filter(is.na(isguard)) %>%
     mutate(dirread = dirread * 8 / 1e9,
       dirwrite = dirwrite * 8 / 1e9) %>%
     select(date, dirread, dirwrite)
@@ -473,8 +486,11 @@ plot_dirbytes <- function(start_p, end_p, path_p) {
 }
 
 prepare_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL) {
-  read.csv(paste(stats_dir, "relayflags.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "relayflags.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        flag = col_factor(levels = NULL),
+        relays = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(flag_p)) flag %in% flag_p else TRUE)
@@ -483,7 +499,7 @@ prepare_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL) {
 plot_relayflags <- function(start_p, end_p, flag_p, path_p) {
   prepare_relayflags(start_p, end_p, flag_p) %>%
     complete(date = full_seq(date, period = 1), flag = unique(flag)) %>%
-    ggplot(aes(x = date, y = relays, colour = as.factor(flag))) +
+    ggplot(aes(x = date, y = relays, colour = flag)) +
     geom_line() +
     scale_x_date(name = "", breaks = custom_breaks,
       labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -498,8 +514,18 @@ plot_relayflags <- function(start_p, end_p, flag_p, path_p) {
 
 prepare_torperf <- function(start_p = NULL, end_p = NULL, server_p = NULL,
     filesize_p = NULL) {
-  read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
-    colClasses = c("date" = "Date", "source" = "character")) %>%
+  read_csv(file = paste(stats_dir, "torperf-1.1.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        filesize = col_double(),
+        source = col_character(),
+        server = col_character(),
+        q1 = col_double(),
+        md = col_double(),
+        q3 = col_double(),
+        timeouts = col_skip(),
+        failures = col_skip(),
+        requests = col_skip())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(server_p)) server == server_p else TRUE) %>%
@@ -535,8 +561,18 @@ plot_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
 
 prepare_torperf_failures <- function(start_p = NULL, end_p = NULL,
     server_p = NULL, filesize_p = NULL) {
-  read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "torperf-1.1.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        filesize = col_double(),
+        source = col_character(),
+        server = col_character(),
+        q1 = col_skip(),
+        md = col_skip(),
+        q3 = col_skip(),
+        timeouts = col_double(),
+        failures = col_double(),
+        requests = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(filesize_p))
@@ -573,8 +609,14 @@ plot_torperf_failures <- function(start_p, end_p, server_p, filesize_p,
 }
 
 prepare_onionperf_buildtimes <- function(start_p = NULL, end_p = NULL) {
-    read.csv(paste(stats_dir, "buildtimes.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "buildtimes.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        source = col_character(),
+        position = col_double(),
+        q1 = col_double(),
+        md = col_double(),
+        q3 = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
 }
@@ -604,8 +646,14 @@ plot_onionperf_buildtimes <- function(start_p, end_p, path_p) {
 
 prepare_onionperf_latencies <- function(start_p = NULL, end_p = NULL,
     server_p = NULL) {
-  read.csv(paste(stats_dir, "latencies.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "latencies.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        source = col_character(),
+        server = col_character(),
+        q1 = col_double(),
+        md = col_double(),
+        q3 = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(server_p)) server == server_p else TRUE)
@@ -631,8 +679,12 @@ plot_onionperf_latencies <- function(start_p, end_p, server_p, path_p) {
 }
 
 prepare_connbidirect <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""),
-    colClasses = c("date" = "Date", "direction" = "factor")) %>%
+  read_csv(file = paste(stats_dir, "connbidirect2.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        direction = col_factor(),
+        quantile = col_double(),
+        fraction = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     mutate(quantile = paste("X", quantile, sep = ""),
@@ -665,19 +717,30 @@ plot_connbidirect <- function(start_p, end_p, path_p) {
 }
 
 prepare_bandwidth_flags <- function(start_p = NULL, end_p = NULL) {
-  advbw <- read.csv(paste(stats_dir, "advbw.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  advbw <- read_csv(file = paste(stats_dir, "advbw.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        isexit = col_logical(),
+        isguard = col_logical(),
+        advbw = col_double())) %>%
     transmute(date, have_guard_flag = isguard, have_exit_flag = isexit,
       variable = "advbw", value = advbw * 8 / 1e9)
-  bwhist <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  bwhist <- read_csv(file = paste(stats_dir, "bandwidth.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        isexit = col_logical(),
+        isguard = col_logical(),
+        bwread = col_double(),
+        bwwrite = col_double(),
+        dirread = col_double(),
+        dirwrite = col_double())) %>%
     transmute(date, have_guard_flag = isguard, have_exit_flag = isexit,
       variable = "bwhist", value = (bwread + bwwrite) * 8 / 2e9)
   rbind(advbw, bwhist) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
-    filter(have_exit_flag != "") %>%
-    filter(have_guard_flag != "") %>%
+    filter(!is.na(have_exit_flag)) %>%
+    filter(!is.na(have_guard_flag)) %>%
     spread(variable, value)
 }
 
@@ -685,7 +748,8 @@ plot_bandwidth_flags <- function(start_p, end_p, path_p) {
   prepare_bandwidth_flags(start_p, end_p) %>%
     gather(variable, value, c(advbw, bwhist)) %>%
     unite(flags, have_guard_flag, have_exit_flag) %>%
-    mutate(flags = factor(flags, levels = c("f_t", "t_t", "t_f", "f_f"),
+    mutate(flags = factor(flags,
+      levels = c("FALSE_TRUE", "TRUE_TRUE", "TRUE_FALSE", "FALSE_FALSE"),
       labels = c("Exit only", "Guard and Exit", "Guard only",
       "Neither Guard nor Exit"))) %>%
     mutate(variable = ifelse(variable == "advbw",
@@ -968,14 +1032,19 @@ plot_userstats_bridge_combined <- function(start_p, end_p, country_p, path_p) {
 }
 
 prepare_advbwdist_perc <- function(start_p = NULL, end_p = NULL, p_p = NULL) {
-  read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "advbwdist.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        isexit = col_logical(),
+        relay = col_skip(),
+        percentile = col_integer(),
+        advbw = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(p_p)) percentile %in% as.numeric(p_p) else
       percentile != "") %>%
     transmute(date, percentile = as.factor(percentile),
-      variable = ifelse(isexit == "t", "exits", "all"),
+      variable = ifelse(is.na(isexit), "all", "exits"),
       advbw = advbw * 8 / 1e9) %>%
     spread(variable, advbw) %>%
     rename(p = percentile)
@@ -1000,14 +1069,19 @@ plot_advbwdist_perc <- function(start_p, end_p, p_p, path_p) {
 }
 
 prepare_advbwdist_relay <- function(start_p = NULL, end_p = NULL, n_p = NULL) {
-  read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "advbwdist.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        isexit = col_logical(),
+        relay = col_integer(),
+        percentile = col_skip(),
+        advbw = col_double())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(if (!is.null(n_p)) relay %in% as.numeric(n_p) else
       relay != "") %>%
     transmute(date, relay = as.factor(relay),
-      variable = ifelse(isexit != "t", "all", "exits"),
+      variable = ifelse(is.na(isexit), "all", "exits"),
       advbw = advbw * 8 / 1e9) %>%
     spread(variable, advbw) %>%
     rename(n = relay)
@@ -1032,8 +1106,15 @@ plot_advbwdist_relay <- function(start_p, end_p, n_p, path_p) {
 }
 
 prepare_hidserv_dir_onions_seen <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "hidserv.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        type = col_factor(),
+        wmean = col_skip(),
+        wmedian = col_skip(),
+        wiqm = col_double(),
+        frac = col_double(),
+        stats = col_skip())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(type == "dir-onions-seen") %>%
@@ -1053,8 +1134,15 @@ plot_hidserv_dir_onions_seen <- function(start_p, end_p, path_p) {
 }
 
 prepare_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
-    colClasses = c("date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "hidserv.csv", sep = ""),
+      col_types = cols(
+        date = col_date(format = ""),
+        type = col_factor(),
+        wmean = col_skip(),
+        wmedian = col_skip(),
+        wiqm = col_double(),
+        frac = col_double(),
+        stats = col_skip())) %>%
     filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
     filter(type == "rend-relayed-cells") %>%
@@ -1257,8 +1345,17 @@ plot_webstats_tm <- function(start_p, end_p, path_p) {
 }
 
 prepare_relays_ipv6 <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
-    colClasses = c("valid_after_date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "ipv6servers.csv", sep = ""),
+      col_types = cols(
+        valid_after_date = col_date(format = ""),
+        server = col_factor(),
+        guard_relay = col_skip(),
+        exit_relay = col_skip(),
+        announced_ipv6 = col_logical(),
+        exiting_ipv6_relay = col_logical(),
+        reachable_ipv6_relay = col_logical(),
+        server_count_sum_avg = col_double(),
+        advertised_bandwidth_bytes_sum_avg = col_skip())) %>%
     filter(if (!is.null(start_p))
         valid_after_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p))
@@ -1266,9 +1363,9 @@ prepare_relays_ipv6 <- function(start_p = NULL, end_p = NULL) {
     filter(server == "relay") %>%
     group_by(valid_after_date) %>%
     summarize(total = sum(server_count_sum_avg),
-      announced = sum(server_count_sum_avg[announced_ipv6 == "t"]),
-      reachable = sum(server_count_sum_avg[reachable_ipv6_relay == "t"]),
-      exiting = sum(server_count_sum_avg[exiting_ipv6_relay == "t"])) %>%
+      announced = sum(server_count_sum_avg[announced_ipv6]),
+      reachable = sum(server_count_sum_avg[reachable_ipv6_relay]),
+      exiting = sum(server_count_sum_avg[exiting_ipv6_relay])) %>%
     complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
     gather(total, announced, reachable, exiting, key = "category",
       value = "count") %>%
@@ -1295,8 +1392,17 @@ plot_relays_ipv6 <- function(start_p, end_p, path_p) {
 }
 
 prepare_bridges_ipv6 <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
-    colClasses = c("valid_after_date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "ipv6servers.csv", sep = ""),
+      col_types = cols(
+        valid_after_date = col_date(format = ""),
+        server = col_factor(),
+        guard_relay = col_skip(),
+        exit_relay = col_skip(),
+        announced_ipv6 = col_logical(),
+        exiting_ipv6_relay = col_skip(),
+        reachable_ipv6_relay = col_skip(),
+        server_count_sum_avg = col_double(),
+        advertised_bandwidth_bytes_sum_avg = col_skip())) %>%
     filter(if (!is.null(start_p))
         valid_after_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p))
@@ -1304,7 +1410,7 @@ prepare_bridges_ipv6 <- function(start_p = NULL, end_p = NULL) {
     filter(server == "bridge") %>%
     group_by(valid_after_date) %>%
     summarize(total = sum(server_count_sum_avg),
-      announced = sum(server_count_sum_avg[announced_ipv6 == "t"])) %>%
+      announced = sum(server_count_sum_avg[announced_ipv6])) %>%
     complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
     rename(date = valid_after_date)
 }
@@ -1327,8 +1433,17 @@ plot_bridges_ipv6 <- function(start_p, end_p, path_p) {
 }
 
 prepare_advbw_ipv6 <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
-    colClasses = c("valid_after_date" = "Date")) %>%
+  read_csv(file = paste(stats_dir, "ipv6servers.csv", sep = ""),
+      col_types = cols(
+        valid_after_date = col_date(format = ""),
+        server = col_factor(),
+        guard_relay = col_logical(),
+        exit_relay = col_logical(),
+        announced_ipv6 = col_logical(),
+        exiting_ipv6_relay = col_logical(),
+        reachable_ipv6_relay = col_logical(),
+        server_count_sum_avg = col_skip(),
+        advertised_bandwidth_bytes_sum_avg = col_double())) %>%
     filter(if (!is.null(start_p))
         valid_after_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p))
@@ -1338,14 +1453,14 @@ prepare_advbw_ipv6 <- function(start_p = NULL, end_p = NULL) {
         advertised_bandwidth_bytes_sum_avg * 8 / 1e9) %>%
     group_by(valid_after_date) %>%
     summarize(total = sum(advertised_bandwidth_bytes_sum_avg),
-      total_guard = sum(advertised_bandwidth_bytes_sum_avg[guard_relay != "f"]),
-      total_exit = sum(advertised_bandwidth_bytes_sum_avg[exit_relay != "f"]),
+      total_guard = sum(advertised_bandwidth_bytes_sum_avg[guard_relay]),
+      total_exit = sum(advertised_bandwidth_bytes_sum_avg[exit_relay]),
       reachable_guard = sum(advertised_bandwidth_bytes_sum_avg[
-        reachable_ipv6_relay != "f" & guard_relay != "f"]),
+        reachable_ipv6_relay & guard_relay]),
       reachable_exit = sum(advertised_bandwidth_bytes_sum_avg[
-        reachable_ipv6_relay != "f" & exit_relay != "f"]),
+        reachable_ipv6_relay & exit_relay]),
       exiting = sum(advertised_bandwidth_bytes_sum_avg[
-        exiting_ipv6_relay != "f"])) %>%
+        exiting_ipv6_relay])) %>%
     complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
     rename(date = valid_after_date)
 }
@@ -1372,8 +1487,13 @@ plot_advbw_ipv6 <- function(start_p, end_p, path_p) {
 }
 
 prepare_totalcw <- function(start_p = NULL, end_p = NULL) {
-  read.csv(paste(stats_dir, "totalcw.csv", sep = ""),
-    colClasses = c("valid_after_date" = "Date", "nickname" = "character")) %>%
+  read_csv(file = paste(stats_dir, "totalcw.csv", sep = ""),
+      col_types = cols(
+        valid_after_date = col_date(format = ""),
+        nickname = col_character(),
+        have_guard_flag = col_logical(),
+        have_exit_flag = col_logical(),
+        measured_sum_avg = col_double())) %>%
     filter(if (!is.null(start_p))
         valid_after_date >= as.Date(start_p) else TRUE) %>%
     filter(if (!is.null(end_p))





More information about the tor-commits mailing list