commit 5c8e727b61f516ddb30d2400d6f3c97462443801 Author: Karsten Loesing karsten.loesing@gmx.net Date: Thu Mar 1 12:27:07 2018 +0100
Refactor even more of the graphing code. --- src/main/R/rserver/graphs.R | 428 +++++++++++++++++--------------------------- 1 file changed, 167 insertions(+), 261 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R index 8847967..93ff919 100644 --- a/src/main/R/rserver/graphs.R +++ b/src/main/R/rserver/graphs.R @@ -329,28 +329,20 @@ stats_dir = "/srv/metrics.torproject.org/metrics/shared/stats/" rdata_dir = "/srv/metrics.torproject.org/metrics/shared/RData/"
prepare_networksize <- function(start, end) { - s <- read.csv(paste(stats_dir, "servers.csv", sep = ""), - stringsAsFactors = FALSE) - s <- s[s$date >= start & s$date <= end & s$flag == '' & - s$country == '' & s$version == '' & s$platform == '' & - s$ec2bridge == '', ] - s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), relays = s$relays, - bridges = s$bridges) - s + read.csv(paste(stats_dir, "servers.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), flag == '', + country == '', version == '', platform == '', ec2bridge == '') %>% + select(date, relays, bridges) }
plot_networksize <- function(start, end, path) { - s <- prepare_networksize(start, end) - dates <- seq(from = as.Date(start, "%Y-%m-%d"), - to = as.Date(end, "%Y-%m-%d"), by="1 day") - missing <- setdiff(dates, as.Date(s$date, origin = "1970-01-01")) - if (length(missing) > 0) - s <- rbind(s, - data.frame(date = as.Date(missing, origin = "1970-01-01"), - relays = NA, bridges = NA)) - networksize <- gather(s, variable, value, -date) - ggplot(networksize, aes(x = as.Date(date, "%Y-%m-%d"), y = value, - colour = variable)) + geom_line() + + prepare_networksize(start, end) %>% + gather(variable, value, -date) %>% + complete(date = full_seq(date, period = 1), + variable = c("relays", "bridges")) %>% + ggplot(aes(x = date, y = value, colour = variable)) + + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) + @@ -367,14 +359,11 @@ write_networksize <- function(start, end, path) { }
prepare_versions <- function(start, end) { - s <- read.csv(paste(stats_dir, "servers.csv", sep = ""), - stringsAsFactors = FALSE) - s <- s[s$date >= start & s$date <= end & s$flag == '' & - s$country == '' & s$version != '' & s$platform == '' & - s$ec2bridge == '', ] - s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), version = s$version, - relays = s$relays) - s + read.csv(paste(stats_dir, "servers.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), flag == '', + country == '', version != '', platform == '', ec2bridge == '') %>% + select(date, version, relays) }
plot_versions <- function(start, end, path) { @@ -389,8 +378,7 @@ plot_versions <- function(start, end, path) { stringsAsFactors = FALSE) versions <- s[s$version %in% known_versions, ] visible_versions <- sort(unique(versions$version)) - ggplot(versions, aes(x = as.Date(date, "%Y-%m-%d"), y = relays, - colour = version)) + + ggplot(versions, aes(x = date, y = relays, colour = version)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -410,21 +398,18 @@ write_versions <- function(start, end, path) { }
prepare_platforms <- function(start, end) { - s <- read.csv(paste(stats_dir, "servers.csv", sep = ""), - stringsAsFactors = FALSE) - s <- s[s$date >= start & s$date <= end & s$flag == '' & - s$country == '' & s$version == '' & s$platform != '' & - s$ec2bridge == '', ] - platforms <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), - variable = ifelse(s$platform == "Darwin", "macOS", s$platform), - value = s$relays) - platforms + read.csv(paste(stats_dir, "servers.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), flag == '', + country == '', version == '', platform != '', ec2bridge == '') %>% + select(date, platform, relays) %>% + mutate(platform = ifelse(platform == "Darwin", "macOS", + as.character(platform))) }
plot_platforms <- function(start, end, path) { - platforms <- prepare_platforms(start, end) - ggplot(platforms, aes(x = as.Date(date, "%Y-%m-%d"), y = value, - colour = variable)) + + prepare_platforms(start, end) %>% + ggplot(aes(x = date, y = relays, colour = platform)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -439,33 +424,31 @@ plot_platforms <- function(start, end, path) {
write_platforms <- function(start, end, path) { prepare_platforms(start, end) %>% - spread(variable, value) %>% + spread(platform, relays) %>% write.csv(path, quote = FALSE, row.names = FALSE) }
prepare_bandwidth <- function(start, end) { - b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), - stringsAsFactors = FALSE) - b <- b[b$date >= start & b$date <= end & b$isexit == '' & - b$isguard == '', ] - b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"), - bwadv = b$advbw * 8 / 1e9, - bwhist = (b$bwread + b$bwwrite) * 8 / 2e9) - b + read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), isexit == '', + isguard == '') %>% + mutate(advbw = advbw * 8 / 1e9) %>% + mutate(bwhist = (bwread + bwwrite) * 8 / 2e9) %>% + select(date, advbw, bwhist) }
plot_bandwidth <- function(start, end, path) { - b <- prepare_bandwidth(start, end) - bandwidth <- gather(b, variable, value, -date) - ggplot(bandwidth, aes(x = as.Date(date, "%Y-%m-%d"), - y = value, colour = variable)) + + prepare_bandwidth(start, end) %>% + gather(variable, value, -date) %>% + ggplot(aes(x = date, y = value, colour = variable)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + scale_y_continuous(name = "", labels = unit_format(unit = "Gbit/s"), limits = c(0, NA)) + scale_colour_hue(name = "", h.start = 90, - breaks = c("bwadv", "bwhist"), + breaks = c("advbw", "bwhist"), labels = c("Advertised bandwidth", "Bandwidth history")) + ggtitle("Total relay bandwidth") + labs(caption = copyright_notice) + @@ -479,37 +462,22 @@ write_bandwidth <- function(start, end, path) { }
prepare_bwhist_flags <- function(start, end) { - b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), - stringsAsFactors = FALSE) - b <- b[b$date >= start & b$date <= end & b$isexit != '' & - b$isguard != '', ] - bw <- data.frame(date = as.Date(b$date, "%Y-%m-%d"), - isexit = b$isexit == 't', isguard = b$isguard == 't', - read = b$bwread, written = b$bwwrite) - dates <- seq(from = as.Date(start, "%Y-%m-%d"), - to = as.Date(end, "%Y-%m-%d"), by = "1 day") - missing <- setdiff(dates, as.Date(bw$date, origin = "1970-01-01")) - if (length(missing) > 0) - bw <- rbind(bw, - data.frame(date = as.Date(missing, origin = "1970-01-01"), - isexit = FALSE, isguard = FALSE, read = NA, written = NA), - data.frame(date = as.Date(missing, origin = "1970-01-01"), - isexit = FALSE, isguard = TRUE, read = NA, written = NA), - data.frame(date = as.Date(missing, origin = "1970-01-01"), - isexit = TRUE, isguard = FALSE, read = NA, written = NA), - data.frame(date = as.Date(missing, origin = "1970-01-01"), - isexit = TRUE, isguard = TRUE, read = NA, written = NA)) - bw <- data.frame(date = bw$date, variable = ifelse(bw$isexit, - ifelse(bw$isguard, "guard_and_exit", "exit_only"), - ifelse(bw$isguard, "guard_only", "middle_only")), - value = (bw$read + bw$written) * 8 / 2e9) - bw + read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), isexit != '', + isguard != '') %>% + mutate(variable = ifelse(isexit == 't', + ifelse(isguard == 't', "guard_and_exit", "exit_only"), + ifelse(isguard == 't', "guard_only", "middle_only")), + value = (bwread + bwwrite) * 8 / 2e9) %>% + select(date, variable, value) }
plot_bwhist_flags <- function(start, end, path) { - bw <- prepare_bwhist_flags(start, end) - ggplot(bw, aes(x = as.Date(date, "%Y-%m-%d"), y = value, - colour = variable)) + + prepare_bwhist_flags(start, end) %>% + complete(date = full_seq(date, period = 1), + variable = unique(variable)) %>% + ggplot(aes(x = date, y = value, colour = variable)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -532,21 +500,19 @@ write_bwhist_flags <- function(start, end, path) { }
prepare_dirbytes <- function(start, end, path) { - b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), - stringsAsFactors = FALSE) - b <- b[b$date >= start & b$date <= end & b$isexit == '' & - b$isguard == '', ] - b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"), - dirread = b$dirread * 8 / 1e9, - dirwrite = b$dirwrite * 8 / 1e9) - b + read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), isexit == '', + isguard == '') %>% + mutate(dirread = dirread * 8 / 1e9, + dirwrite = dirwrite * 8 / 1e9) %>% + select(date, dirread, dirwrite) }
plot_dirbytes <- function(start, end, path) { - b <- prepare_dirbytes(start, end) - dir <- gather(b, variable, value, -date) - ggplot(dir, aes(x = as.Date(date, "%Y-%m-%d"), y = value, - colour = variable)) + + prepare_dirbytes(start, end) %>% + gather(variable, value, -date) %>% + ggplot(aes(x = date, y = value, colour = variable)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -567,33 +533,20 @@ write_dirbytes <- function(start, end, path) { }
prepare_relayflags <- function(start, end, flags) { - s <- read.csv(paste(stats_dir, "servers.csv", sep = ""), - stringsAsFactors = FALSE) - s <- s[s$date >= start & s$date <= end & s$country == '' & - s$version == '' & s$platform == '' & s$ec2bridge == '', ] - s <- data.frame(date = as.Date(s$date, "%Y-%m-%d"), - variable = ifelse(s$flag == '', 'Running', s$flag), - value = s$relays) - networksize <- s[s$variable %in% flags, ] - networksize + read.csv(paste(stats_dir, "servers.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), country == '', + version == '', platform == '', ec2bridge == '') %>% + mutate(flag = ifelse(flag == '', 'Running', as.character(flag))) %>% + filter(flag %in% flags) %>% + select(date, flag, relays) }
plot_relayflags <- function(start, end, flags, path) { - networksize <- prepare_relayflags(start, end, flags) - networksize <- rbind(data.frame( - date = as.Date(end) + 1, - variable = c("Running", "Exit", "Guard", "Fast", "Stable", "HSDir"), - value = NA), networksize) - dates <- seq(from = as.Date(start, "%Y-%m-%d"), - to = as.Date(end, "%Y-%m-%d"), by="1 day") - missing <- setdiff(dates, networksize$date) - if (length(missing) > 0) - networksize <- rbind(data.frame( - date = as.Date(rep(missing, 6), origin = "1970-01-01"), - variable = c("Running", "Exit", "Guard", "Fast", "Stable", "HSDir"), - value = rep(NA, length(missing) * 6)), networksize) - ggplot(networksize, aes(x = as.Date(date, "%Y-%m-%d"), y = value, - colour = as.factor(variable))) + geom_line() + + prepare_relayflags(start, end, flags) %>% + complete(date = full_seq(date, period = 1), flag = unique(flag)) %>% + ggplot(aes(x = date, y = relays, colour = as.factor(flag))) + + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) + @@ -616,32 +569,24 @@ plot_torperf <- function(start, end, source, server, filesize, path) { filesizeVal <- ifelse(filesize == '50kb', 50 * 1024, ifelse(filesize == '1mb', 1024 * 1024, 5 * 1024 * 1024)) t <- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""), - stringsAsFactors = FALSE) + colClasses = c("date" = "Date")) known_sources <- c("all", unique(t[t$source != "", "source"])) colours <- data.frame(source = known_sources, colour = brewer.pal(length(known_sources), "Paired"), stringsAsFactors = FALSE) - t <- t[t$date >= start & t$date <= end & t$filesize == filesizeVal & - t$source == ifelse(source == 'all', '', source) & - t$server == server, ] - torperf <- data.frame(date = as.Date(t$date, "%Y-%m-%d"), - q1 = t$q1, md = t$md, q3 = t$q3) - dates <- seq(from = as.Date(start, "%Y-%m-%d"), - to = as.Date(end, "%Y-%m-%d"), by="1 day") - missing <- setdiff(dates, torperf$date) - if (length(missing) > 0) - torperf <- rbind(torperf, - data.frame(date = as.Date(missing, origin = "1970-01-01"), - q1 = NA, md = NA, q3 = NA)) colour <- colours[colours$source == source, "colour"] filesizes <- data.frame(filesizes = c("5mb", "1mb", "50kb"), label = c("5 MiB", "1 MiB", "50 KiB"), stringsAsFactors = FALSE) filesizeStr <- filesizes[filesizes$filesize == filesize, "label"] - ggplot(torperf, aes(x = as.Date(date, "%Y-%m-%d"), y = md/1e3, - fill = "line")) + + t[t$date >= as.Date(start) & t$date <= as.Date(end) & + t$filesize == filesizeVal & + t$source == ifelse(source == 'all', '', source) & + t$server == server, ] %>% + transmute(date, q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>% + complete(date = full_seq(date, period = 1)) %>% + ggplot(aes(x = date, y = md, fill = "line")) + geom_line(colour = colour, size = 0.75) + - geom_ribbon(data = torperf, aes(x = date, ymin = q1/1e3, - ymax = q3/1e3, fill = "ribbon")) + + geom_ribbon(aes(x = date, ymin = q1, ymax = q3, fill = "ribbon")) + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + scale_y_continuous(name = "", labels = unit_format(unit = "s"), @@ -671,8 +616,7 @@ write_torperf <- function(start, end, source_, server_, filesize_, path) { ifelse(filesize_ == '1mb', 1024 * 1024, 5 * 1024 * 1024)), source == ifelse(source_ == 'all', '', source_), server == server_) %>% - select(date, q1, md, q3) %>% - mutate(q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>% + transmute(date, q1 = q1 / 1e3, md = md / 1e3, q3 = q3 / 1e3) %>% write.csv(path, quote = FALSE, row.names = FALSE) }
@@ -680,31 +624,21 @@ prepare_torperf_failures <- function(start, end, source, server, filesize) { filesizeVal <- ifelse(filesize == '50kb', 50 * 1024, ifelse(filesize == '1mb', 1024 * 1024, 5 * 1024 * 1024)) t <- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""), - stringsAsFactors = FALSE) - t <- t[t$date >= start & t$date <= end & t$filesize == filesizeVal & + colClasses = c("date" = "Date")) + t[t$date >= start & t$date <= end & t$filesize == filesizeVal & t$source == ifelse(source == 'all', '', source) & - t$server == server & t$requests > 0, ] - torperf <- data.frame(date = as.Date(t$date, "%Y-%m-%d"), - timeouts = t$timeouts / t$requests, - failures = t$failures / t$requests) - torperf + t$server == server & t$requests > 0, ] %>% + transmute(date, timeouts = timeouts / requests, + failures = failures / requests) }
plot_torperf_failures <- function(start, end, source, server, filesize, path) { - torperf <- prepare_torperf_failures(start, end, source, server, filesize) - dates <- seq(from = as.Date(start, "%Y-%m-%d"), - to = as.Date(end, "%Y-%m-%d"), by="1 day") - missing <- setdiff(dates, torperf$date) - if (length(missing) > 0) - torperf <- rbind(torperf, - data.frame(date = as.Date(missing, origin = "1970-01-01"), - timeouts = NA, failures = NA)) - torperf <- gather(torperf, variable, value, -date) filesizes <- data.frame(filesizes = c("5mb", "1mb", "50kb"), label = c("5 MiB", "1 MiB", "50 KiB"), stringsAsFactors = FALSE) filesizeStr <- filesizes[filesizes$filesize == filesize, "label"] - ggplot(torperf, aes(x = as.Date(date, "%Y-%m-%d"), y = value, - colour = variable)) + + prepare_torperf_failures(start, end, source, server, filesize) %>% + gather(variable, value, -date) %>% + ggplot(aes(x = date, y = value, colour = variable)) + geom_point(size = 2) + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -726,21 +660,17 @@ write_torperf_failures <- function(start, end, source, server, filesize, path) { }
prepare_connbidirect <- function(start, end) { - c <- read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""), - stringsAsFactors = FALSE) - c <- c[c$date >= start & c$date <= end, ] - c <- data.frame(date = as.Date(c$date), - direction = factor(c$direction, - levels = c("both", "write", "read")), - quantile = paste("X", c$quantile, sep = ""), - fraction = c$fraction / 100) - c <- spread(c, quantile, fraction) - c + read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""), + colClasses = c("date" = "Date", "direction" = "factor")) %>% + filter(date >= as.Date(start), date <= as.Date(end)) %>% + mutate(quantile = paste("X", quantile, sep = ""), + fraction = fraction / 100) %>% + spread(quantile, fraction) }
plot_connbidirect <- function(start, end, path) { - c <- prepare_connbidirect(start, end) - ggplot(c, aes(x = date, y = X0.5, colour = direction)) + + prepare_connbidirect(start, end) %>% + ggplot(aes(x = date, y = X0.5, colour = direction)) + geom_line(size = 0.75) + geom_ribbon(aes(x = date, ymin = X0.25, ymax = X0.75, fill = direction), alpha = 0.5, show.legend = FALSE) + @@ -772,10 +702,10 @@ write_connbidirect <- function(start, end, path) {
prepare_bandwidth_flags <- function(start, end) { b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""), - stringsAsFactors = FALSE) + colClasses = c("date" = "Date")) b <- b[b$date >= start & b$date <= end & b$isexit != '' & b$isguard != '', ] - b <- data.frame(date = as.Date(b$date, "%Y-%m-%d"), + b <- data.frame(date = b$date, isexit = b$isexit == 't', isguard = b$isguard == 't', advbw = b$advbw * 8 / 1e9, bwhist = (b$bwread + b$bwwrite) * 8 / 2e9) @@ -789,21 +719,6 @@ prepare_bandwidth_flags <- function(start, end) { na.rm = TRUE, na.action = NULL) b <- gather(b, type, value, -c(date, flag)) bandwidth <- b[b$value > 0, ] - dates <- seq(from = as.Date(start, "%Y-%m-%d"), - to = as.Date(end, "%Y-%m-%d"), by = "1 day") - missing <- setdiff(dates, as.Date(bandwidth$date, - origin = "1970-01-01")) - if (length(missing) > 0) { - bandwidth <- rbind(bandwidth, - data.frame(date = as.Date(missing, origin = "1970-01-01"), - type = "advbw", flag = "exit", value = NA), - data.frame(date = as.Date(missing, origin = "1970-01-01"), - type = "bwhist", flag = "exit", value = NA), - data.frame(date = as.Date(missing, origin = "1970-01-01"), - type = "advbw", flag = "guard", value = NA), - data.frame(date = as.Date(missing, origin = "1970-01-01"), - type = "bwhist", flag = "guard", value = NA)) - } bandwidth <- data.frame(date = bandwidth$date, variable = as.factor(paste(bandwidth$flag, "_", bandwidth$type, sep = "")), value = bandwidth$value) @@ -813,9 +728,10 @@ prepare_bandwidth_flags <- function(start, end) { }
plot_bandwidth_flags <- function(start, end, path) { - bandwidth <- prepare_bandwidth_flags(start, end) - ggplot(bandwidth, aes(x = as.Date(date, "%Y-%m-%d"), - y = value, colour = variable)) + + prepare_bandwidth_flags(start, end) %>% + complete(date = full_seq(date, period = 1), + variable = unique(variable)) %>% + ggplot(aes(x = date, y = value, colour = variable)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -1107,29 +1023,27 @@ write_userstats_bridge_combined <- function(start, end, country, path) { }
prepare_advbwdist_perc <- function(start, end, p) { - t <- read.csv(paste(stats_dir, - "advbwdist.csv", sep = ""), stringsAsFactors = FALSE) - t <- t[t$date >= start & t$date <= end & - t$percentile %in% as.numeric(p), ] - t <- data.frame(date = t$date, percentile = as.factor(t$percentile), - variable = ifelse(t$isexit != "t", "all", "exits"), - advbw = t$advbw * 8 / 1e9) - t + read.csv(paste(stats_dir, "advbwdist.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), + percentile %in% as.numeric(p)) %>% + transmute(date, percentile = as.factor(percentile), + variable = ifelse(isexit != "t", "all", "exits"), + advbw = advbw * 8 / 1e9) }
plot_advbwdist_perc <- function(start, end, p, path) { - t <- prepare_advbwdist_perc(start, end, p) - t$variable <- ifelse(t$variable == "all", "All relays", - "Exits only") - ggplot(t, aes(x = as.Date(date), y = advbw, colour = percentile)) + + prepare_advbwdist_perc(start, end, p) %>% + mutate(variable = ifelse(variable == "all", "All relays", + "Exits only")) %>% + ggplot(aes(x = date, y = advbw, colour = percentile)) + facet_grid(variable ~ .) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + scale_y_continuous(name = "", labels = unit_format(unit = "Gbit/s"), limits = c(0, NA)) + - scale_colour_hue(name = "Percentile", - breaks = rev(levels(t$percentile))) + + scale_colour_hue(name = "Percentile") + ggtitle("Advertised bandwidth distribution") + labs(caption = copyright_notice) ggsave(filename = path, width = 8, height = 5, dpi = 150) @@ -1143,27 +1057,27 @@ write_advbwdist_perc <- function(start, end, p, path) { }
prepare_advbwdist_relay <- function(start, end, n) { - t <- read.csv(paste(stats_dir, "advbwdist.csv", sep = ""), - stringsAsFactors = FALSE) - t <- t[t$date >= start & t$date <= end & t$relay %in% as.numeric(n), ] - t <- data.frame(date = t$date, relay = as.factor(t$relay), - variable = ifelse(t$isexit != "t", "all", "exits"), - advbw = t$advbw * 8 / 1e9) - t + read.csv(paste(stats_dir, "advbwdist.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), + relay %in% as.numeric(n)) %>% + transmute(date, relay = as.factor(relay), + variable = ifelse(isexit != "t", "all", "exits"), + advbw = advbw * 8 / 1e9) }
plot_advbwdist_relay <- function(start, end, n, path) { - t <- prepare_advbwdist_relay(start, end, n) - t$variable <- ifelse(t$variable == "all", "All relays", - "Exits only") - ggplot(t, aes(x = as.Date(date), y = advbw, colour = relay)) + + prepare_advbwdist_relay(start, end, n) %>% + mutate(variable = ifelse(variable == "all", "All relays", + "Exits only")) %>% + ggplot(aes(x = date, y = advbw, colour = relay)) + facet_grid(variable ~ .) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + scale_y_continuous(name = "", labels = unit_format(unit = "Gbit/s"), limits = c(0, NA)) + - scale_colour_hue(name = "n", breaks = levels(t$relay)) + + scale_colour_hue(name = "n") + ggtitle("Advertised bandwidth of n-th fastest relays") + labs(caption = copyright_notice) ggsave(filename = path, width = 8, height = 5, dpi = 150) @@ -1177,17 +1091,16 @@ write_advbwdist_relay <- function(start, end, n, path) { }
prepare_hidserv_dir_onions_seen <- function(start, end) { - h <- read.csv(paste(stats_dir, "hidserv.csv", sep = ""), - stringsAsFactors = FALSE) - h <- h[h$date >= start & h$date <= end & h$type == "dir-onions-seen", ] - h <- data.frame(date = as.Date(h$date, "%Y-%m-%d"), - onions = ifelse(h$frac >= 0.01, h$wiqm, NA)) - h + read.csv(paste(stats_dir, "hidserv.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), + type == "dir-onions-seen") %>% + transmute(date = date, onions = ifelse(frac >= 0.01, wiqm, NA)) }
plot_hidserv_dir_onions_seen <- function(start, end, path) { - h <- prepare_hidserv_dir_onions_seen(start, end) - ggplot(h, aes(x = as.Date(date, origin = "1970-01-01"), y = onions)) + + prepare_hidserv_dir_onions_seen(start, end) %>% + ggplot(aes(x = date, y = onions)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -1203,19 +1116,17 @@ write_hidserv_dir_onions_seen <- function(start, end, path) { }
prepare_hidserv_rend_relayed_cells <- function(start, end) { - h <- read.csv(paste(stats_dir, "hidserv.csv", sep = ""), - stringsAsFactors = FALSE) - h <- h[h$date >= start & h$date <= end & - h$type == "rend-relayed-cells", ] - h <- data.frame(date = as.Date(h$date, "%Y-%m-%d"), - relayed = ifelse(h$frac >= 0.01, - h$wiqm * 8 * 512 / (86400 * 1e9), NA)) - h + read.csv(paste(stats_dir, "hidserv.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end), + type == "rend-relayed-cells") %>% + transmute(date, + relayed = ifelse(frac >= 0.01, wiqm * 8 * 512 / (86400 * 1e9), NA)) }
plot_hidserv_rend_relayed_cells <- function(start, end, path) { - h <- prepare_hidserv_rend_relayed_cells(start, end) - ggplot(h, aes(x = as.Date(date, origin = "1970-01-01"), y = relayed)) + + prepare_hidserv_rend_relayed_cells(start, end) %>% + ggplot(aes(x = date, y = relayed)) + geom_line() + scale_x_date(name = "", breaks = custom_breaks, labels = custom_labels, minor_breaks = custom_minor_breaks) + @@ -1232,18 +1143,15 @@ write_hidserv_rend_relayed_cells <- function(start, end, path) { }
prepare_hidserv_frac_reporting <- function(start, end) { - h <- read.csv(paste(stats_dir, "hidserv.csv", sep = ""), - stringsAsFactors = FALSE) - h <- h[h$date >= start & h$date <= end, ] - h <- data.frame(date = as.Date(h$date, "%Y-%m-%d"), - frac = h$frac, type = h$type) - h + read.csv(paste(stats_dir, "hidserv.csv", sep = ""), + colClasses = c("date" = "Date")) %>% + filter(date >= as.Date(start), date <= as.Date(end)) %>% + select(date, frac, type) }
plot_hidserv_frac_reporting <- function(start, end, path) { - h <- prepare_hidserv_frac_reporting(start, end) - ggplot(h, aes(x = as.Date(date, origin = "1970-01-01"), y = frac, - colour = type)) + + prepare_hidserv_frac_reporting(start, end) %>% + ggplot(aes(x = date, y = frac, colour = type)) + geom_line() + geom_hline(yintercept = 0.01, linetype = 2) + scale_x_date(name = "", breaks = custom_breaks, @@ -1269,10 +1177,9 @@ write_hidserv_frac_reporting <- function(start, end, path) {
prepare_webstats_tb <- function(start, end) { load(paste(rdata_dir, "webstats-tb.RData", sep = "")) - d <- data - d <- d[d$log_date >= start & d$log_date <= end, ] - d$request_type <- factor(d$request_type) - d + data %>% + filter(log_date >= as.Date(start), log_date <= as.Date(end)) %>% + mutate(request_type = factor(request_type)) }
plot_webstats_tb <- function(start, end, path) { @@ -1306,17 +1213,17 @@ write_webstats_tb <- function(start, end, path) { }
prepare_webstats_tb_platform <- function(start, end) { - d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""), - stringsAsFactors = FALSE) - d <- d[d$log_date >= start & d$log_date <= end & d$request_type == 'tbid', ] - d <- aggregate(list(count = d$count), by = list(log_date = as.Date(d$log_date), - platform = d$platform), FUN = sum) - d + read.csv(paste(stats_dir, "webstats.csv", sep = ""), + colClasses = c("log_date" = "Date")) %>% + filter(log_date >= as.Date(start), log_date <= as.Date(end), + request_type == 'tbid') %>% + group_by(log_date, platform) %>% + summarize(count = sum(count)) }
plot_webstats_tb_platform <- function(start, end, path) { - d <- prepare_webstats_tb_platform(start, end) - ggplot(d, aes(x = log_date, y = count, colour = platform)) + + prepare_webstats_tb_platform(start, end) %>% + ggplot(aes(x = log_date, y = count, colour = platform)) + geom_point() + geom_line() + scale_x_date(name = "", breaks = custom_breaks, @@ -1342,13 +1249,13 @@ write_webstats_tb_platform <- function(start, end, path) {
plot_webstats_tb_locale <- function(start, end, path) { d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""), - stringsAsFactors = FALSE) + colClasses = c("log_date" = "Date")) d <- d[d$log_date >= start & d$log_date <= end & d$request_type == 'tbid', ] e <- d e <- aggregate(list(count = e$count), by = list(locale = e$locale), FUN = sum) e <- e[order(e$count, decreasing = TRUE), ] e <- e[1:5, ] - d <- aggregate(list(count = d$count), by = list(log_date = as.Date(d$log_date), + d <- aggregate(list(count = d$count), by = list(log_date = d$log_date, locale = ifelse(d$locale %in% e$locale, d$locale, '(other)')), FUN = sum) ggplot(d, aes(x = log_date, y = count, colour = locale)) + geom_point() + @@ -1373,13 +1280,13 @@ plot_webstats_tb_locale <- function(start, end, path) { # breaks and labels. Left as future work. write_webstats_tb_locale <- function(start, end, path) { d <- read.csv(paste(stats_dir, "webstats.csv", sep = ""), - stringsAsFactors = FALSE) + colClasses = c("log_date" = "Date")) d <- d[d$log_date >= start & d$log_date <= end & d$request_type == 'tbid', ] e <- d e <- aggregate(list(count = e$count), by = list(locale = e$locale), FUN = sum) e <- e[order(e$count, decreasing = TRUE), ] e <- e[1:5, ] - d <- aggregate(list(count = d$count), by = list(log_date = as.Date(d$log_date), + d <- aggregate(list(count = d$count), by = list(log_date = d$log_date, locale = ifelse(d$locale %in% e$locale, d$locale, 'other')), FUN = sum) d %>% mutate(locale = tolower(locale)) %>% @@ -1390,10 +1297,9 @@ write_webstats_tb_locale <- function(start, end, path) {
prepare_webstats_tm <- function(start, end) { load(paste(rdata_dir, "webstats-tm.RData", sep = "")) - d <- data - d <- d[d$log_date >= start & d$log_date <= end, ] - d$request_type <- factor(d$request_type) - d + data %>% + filter(log_date >= as.Date(start), log_date <= as.Date(end)) %>% + mutate(request_type = factor(request_type)) }
plot_webstats_tm <- function(start, end, path) {