commit b605298c66c8c348fe589062dc1ddd3da293c8db Author: Karsten Loesing karsten.loesing@gmx.net Date: Mon Dec 10 15:17:46 2018 +0100
Rewrite advbwdist's aggregate.R in Java.
This is yet another step towards making the daily update Java-only.
Implements #28801. --- build.xml | 17 -------- src/main/R/advbwdist/aggregate.R | 25 ------------ .../torproject/metrics/stats/advbwdist/Main.java | 46 ++++++++++++++++++++++ 3 files changed, 46 insertions(+), 42 deletions(-)
diff --git a/build.xml b/build.xml index 93eda7b..42965bf 100644 --- a/build.xml +++ b/build.xml @@ -347,9 +347,6 @@ <target name="advbwdist"> <property name="module.name" value="advbwdist" /> <antcall target="run-java" /> - <antcall target="run-R" > - <param name="module.Rscript" value="aggregate.R" /> - </antcall> </target>
<target name="hidserv" > @@ -453,20 +450,6 @@ <echo message="Java module ${module.name} finished. " /> </target>
- <target name="run-R" > - <echo message="Running R module ${module.name}, script ${module.Rscript} ... " /> - <property name="Rscript" - value="${Rsources}/${module.name}/${module.Rscript}" /> - <exec executable="R" - dir="${modulebase}/${module.name}" - failonerror="true" > - <arg value="--slave"/> - <arg value="-f"/> - <arg value="${Rscript}" /> - </exec> - <echo message="R module ${module.name}, script ${module.Rscript} finished. " /> - </target> - <!-- The following line adds the common targets and properties for Metrics' Java Projects. --> diff --git a/src/main/R/advbwdist/aggregate.R b/src/main/R/advbwdist/aggregate.R deleted file mode 100644 index 1c67dff..0000000 --- a/src/main/R/advbwdist/aggregate.R +++ /dev/null @@ -1,25 +0,0 @@ -require(reshape) -t <- read.csv("stats/advbwdist-validafter.csv", - colClasses = c("character", "logical", "integer", "integer", "integer"), - stringsAsFactors = FALSE) - -currSysDate <- paste(Sys.Date() - 1, "23:59:59") -t <- t[t$valid_after < currSysDate, ] -t$date <- as.factor(substr(t$valid_after, 1, 10)) -t$isexit <- !is.na(t$isexit) -t$relay <- ifelse(is.na(t$relay), -1, t$relay) -t$percentile <- ifelse(is.na(t$percentile), -1, t$percentile) - -t <- aggregate(list(advbw = t$advbw), by = list(date = t$date, - isexit = t$isexit, relay = t$relay, percentile = t$percentile), - FUN = median) - -t$isexit <- ifelse(t$isexit, "t", "") -t$relay <- ifelse(t$relay < 0, NA, t$relay) -t$percentile <- ifelse(t$percentile < 0, NA, t$percentile) -t$advbw <- floor(t$advbw) - -t <- t[order(t$date, t$isexit, t$relay, t$percentile), ] - -write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE, na = "") - diff --git a/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java b/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java index 7216581..6c4f4ac 100644 --- a/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java +++ b/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java @@ -10,15 +10,19 @@ 
import org.torproject.descriptor.NetworkStatusEntry; import org.torproject.descriptor.RelayNetworkStatusConsensus; import org.torproject.descriptor.ServerDescriptor;
+import org.apache.commons.math3.stat.descriptive.rank.Median; import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; +import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -138,6 +142,39 @@ public class Main { } descriptorReader.saveHistoryFile(historyFile); bw.close(); + + /* Aggregate statistics. */ + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + String today = dateFormat.format(new Date()); + SortedMap<String, List<Long>> preAggregatedValues = new TreeMap<>(); + try (BufferedReader br = new BufferedReader(new FileReader(resultsFile))) { + br.readLine(); /* Skip header. */ + String line; + while (null != (line = br.readLine())) { + String[] parts = line.split(","); + String date = parts[0].substring(0, 10); + if (date.compareTo(today) >= 0) { + continue; + } + String isExit = parts[1].equals("TRUE") ? "t" : ""; + String keyWithoutTime = String.format("%s,%s,%s,%s", + date, isExit, parts[2], parts[3]); + long value = Long.parseLong(parts[4]); + preAggregatedValues.putIfAbsent(keyWithoutTime, new ArrayList<>()); + preAggregatedValues.get(keyWithoutTime).add(value); + } + } + File aggregateResultsFile = new File("stats/advbwdist.csv"); + aggregateResultsFile.getParentFile().mkdirs(); + try (BufferedWriter bw2 = new BufferedWriter( + new FileWriter(aggregateResultsFile))) { + bw2.write("date,isexit,relay,percentile,advbw\n"); + for (Map.Entry<String, List<Long>> e : preAggregatedValues.entrySet()) { + bw2.write(String.format("%s,%.0f%n", e.getKey(), + computeMedian(e.getValue()))); + } + } }
/** Compute percentiles (between 0 and 100) for the given list of values, and @@ -168,5 +205,14 @@ public class Main { } return computedPercentiles; } + + /** Return the median for the given list of values, or <code>Double.NaN</code> + * if the given list is empty. */ + static double computeMedian(List<Long> valueList) { + Median median = new Median() + .withEstimationType(Percentile.EstimationType.R_7); + median.setData(valueList.stream().mapToDouble(Long::doubleValue).toArray()); + return Math.floor(median.evaluate()); + } }
tor-commits@lists.torproject.org