commit b605298c66c8c348fe589062dc1ddd3da293c8db
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Dec 10 15:17:46 2018 +0100
Rewrite advbwdist's aggregate.R in Java.
This is yet another step towards making the daily update Java-only.
Implements #28801.
---
build.xml | 17 --------
src/main/R/advbwdist/aggregate.R | 25 ------------
.../torproject/metrics/stats/advbwdist/Main.java | 46 ++++++++++++++++++++++
3 files changed, 46 insertions(+), 42 deletions(-)
diff --git a/build.xml b/build.xml
index 93eda7b..42965bf 100644
--- a/build.xml
+++ b/build.xml
@@ -347,9 +347,6 @@
<target name="advbwdist">
<property name="module.name" value="advbwdist" />
<antcall target="run-java" />
- <antcall target="run-R" >
- <param name="module.Rscript" value="aggregate.R" />
- </antcall>
</target>
<target name="hidserv" >
@@ -453,20 +450,6 @@
<echo message="Java module ${module.name} finished. " />
</target>
- <target name="run-R" >
- <echo message="Running R module ${module.name}, script ${module.Rscript} ... " />
- <property name="Rscript"
- value="${Rsources}/${module.name}/${module.Rscript}" />
- <exec executable="R"
- dir="${modulebase}/${module.name}"
- failonerror="true" >
- <arg value="--slave"/>
- <arg value="-f"/>
- <arg value="${Rscript}" />
- </exec>
- <echo message="R module ${module.name}, script ${module.Rscript} finished. " />
- </target>
-
<!-- The following line adds the common targets and properties
for Metrics' Java Projects.
-->
diff --git a/src/main/R/advbwdist/aggregate.R b/src/main/R/advbwdist/aggregate.R
deleted file mode 100644
index 1c67dff..0000000
--- a/src/main/R/advbwdist/aggregate.R
+++ /dev/null
@@ -1,25 +0,0 @@
-require(reshape)
-t <- read.csv("stats/advbwdist-validafter.csv",
- colClasses = c("character", "logical", "integer", "integer", "integer"),
- stringsAsFactors = FALSE)
-
-currSysDate <- paste(Sys.Date() - 1, "23:59:59")
-t <- t[t$valid_after < currSysDate, ]
-t$date <- as.factor(substr(t$valid_after, 1, 10))
-t$isexit <- !is.na(t$isexit)
-t$relay <- ifelse(is.na(t$relay), -1, t$relay)
-t$percentile <- ifelse(is.na(t$percentile), -1, t$percentile)
-
-t <- aggregate(list(advbw = t$advbw), by = list(date = t$date,
- isexit = t$isexit, relay = t$relay, percentile = t$percentile),
- FUN = median)
-
-t$isexit <- ifelse(t$isexit, "t", "")
-t$relay <- ifelse(t$relay < 0, NA, t$relay)
-t$percentile <- ifelse(t$percentile < 0, NA, t$percentile)
-t$advbw <- floor(t$advbw)
-
-t <- t[order(t$date, t$isexit, t$relay, t$percentile), ]
-
-write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE, na = "")
-
diff --git a/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java b/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java
index 7216581..6c4f4ac 100644
--- a/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java
@@ -10,15 +10,19 @@ import org.torproject.descriptor.NetworkStatusEntry;
import org.torproject.descriptor.RelayNetworkStatusConsensus;
import org.torproject.descriptor.ServerDescriptor;
+import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
+import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -138,6 +142,39 @@ public class Main {
}
descriptorReader.saveHistoryFile(historyFile);
bw.close();
+
+ /* Aggregate statistics. */
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ String today = dateFormat.format(new Date());
+ SortedMap<String, List<Long>> preAggregatedValues = new TreeMap<>();
+ try (BufferedReader br = new BufferedReader(new FileReader(resultsFile))) {
+ br.readLine(); /* Skip header. */
+ String line;
+ while (null != (line = br.readLine())) {
+ String[] parts = line.split(",");
+ String date = parts[0].substring(0, 10);
+ if (date.compareTo(today) >= 0) {
+ continue;
+ }
+ String isExit = parts[1].equals("TRUE") ? "t" : "";
+ String keyWithoutTime = String.format("%s,%s,%s,%s",
+ date, isExit, parts[2], parts[3]);
+ long value = Long.parseLong(parts[4]);
+ preAggregatedValues.putIfAbsent(keyWithoutTime, new ArrayList<>());
+ preAggregatedValues.get(keyWithoutTime).add(value);
+ }
+ }
+ File aggregateResultsFile = new File("stats/advbwdist.csv");
+ aggregateResultsFile.getParentFile().mkdirs();
+ try (BufferedWriter bw2 = new BufferedWriter(
+ new FileWriter(aggregateResultsFile))) {
+ bw2.write("date,isexit,relay,percentile,advbw\n");
+ for (Map.Entry<String, List<Long>> e : preAggregatedValues.entrySet()) {
+ bw2.write(String.format("%s,%.0f%n", e.getKey(),
+ computeMedian(e.getValue())));
+ }
+ }
}
/** Compute percentiles (between 0 and 100) for the given list of values, and
@@ -168,5 +205,14 @@ public class Main {
}
return computedPercentiles;
}
+
+ /** Return the median (R_7 estimation) of the given list of values, rounded
+ * down via Math.floor, or <code>Double.NaN</code> if the list is empty. */
+ static double computeMedian(List<Long> valueList) {
+ Median median = new Median()
+ .withEstimationType(Percentile.EstimationType.R_7);
+ median.setData(valueList.stream().mapToDouble(Long::doubleValue).toArray());
+ return Math.floor(median.evaluate());
+ }
}