[tor-commits] [metrics-web/master] Rewrite advbwdist's aggregate.R in Java.

karsten at torproject.org karsten at torproject.org
Sun Dec 23 10:35:16 UTC 2018


commit b605298c66c8c348fe589062dc1ddd3da293c8db
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Dec 10 15:17:46 2018 +0100

    Rewrite advbwdist's aggregate.R in Java.
    
    This is yet another step towards making the daily update Java-only.
    
    Implements #28801.
---
 build.xml                                          | 17 --------
 src/main/R/advbwdist/aggregate.R                   | 25 ------------
 .../torproject/metrics/stats/advbwdist/Main.java   | 46 ++++++++++++++++++++++
 3 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/build.xml b/build.xml
index 93eda7b..42965bf 100644
--- a/build.xml
+++ b/build.xml
@@ -347,9 +347,6 @@
   <target name="advbwdist">
     <property name="module.name" value="advbwdist" />
     <antcall target="run-java" />
-    <antcall target="run-R" >
-      <param name="module.Rscript" value="aggregate.R" />
-    </antcall>
   </target>
 
   <target name="hidserv" >
@@ -453,20 +450,6 @@
     <echo message="Java module ${module.name} finished. " />
   </target>
 
-  <target name="run-R" >
-    <echo message="Running R module ${module.name}, script ${module.Rscript} ... " />
-    <property name="Rscript"
-              value="${Rsources}/${module.name}/${module.Rscript}" />
-    <exec executable="R"
-          dir="${modulebase}/${module.name}"
-          failonerror="true" >
-      <arg value="--slave"/>
-      <arg value="-f"/>
-      <arg value="${Rscript}" />
-    </exec>
-    <echo message="R module ${module.name}, script ${module.Rscript} finished. " />
-  </target>
-
   <!-- The following line adds the common targets and properties
        for Metrics' Java Projects.
   -->
diff --git a/src/main/R/advbwdist/aggregate.R b/src/main/R/advbwdist/aggregate.R
deleted file mode 100644
index 1c67dff..0000000
--- a/src/main/R/advbwdist/aggregate.R
+++ /dev/null
@@ -1,25 +0,0 @@
-require(reshape)
-t <- read.csv("stats/advbwdist-validafter.csv",
-  colClasses = c("character", "logical", "integer", "integer", "integer"),
-  stringsAsFactors = FALSE)
-
-currSysDate <- paste(Sys.Date() - 1, "23:59:59")
-t <- t[t$valid_after < currSysDate, ]
-t$date <- as.factor(substr(t$valid_after, 1, 10))
-t$isexit <- !is.na(t$isexit)
-t$relay <- ifelse(is.na(t$relay), -1, t$relay)
-t$percentile <- ifelse(is.na(t$percentile), -1, t$percentile)
-
-t <- aggregate(list(advbw = t$advbw), by = list(date = t$date,
-    isexit = t$isexit, relay = t$relay, percentile = t$percentile),
-    FUN = median)
-
-t$isexit <- ifelse(t$isexit, "t", "")
-t$relay <- ifelse(t$relay < 0, NA, t$relay)
-t$percentile <- ifelse(t$percentile < 0, NA, t$percentile)
-t$advbw <- floor(t$advbw)
-
-t <- t[order(t$date, t$isexit, t$relay, t$percentile), ]
-
-write.csv(t, "stats/advbwdist.csv", quote = FALSE, row.names = FALSE, na = "")
-
diff --git a/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java b/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java
index 7216581..6c4f4ac 100644
--- a/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/advbwdist/Main.java
@@ -10,15 +10,19 @@ import org.torproject.descriptor.NetworkStatusEntry;
 import org.torproject.descriptor.RelayNetworkStatusConsensus;
 import org.torproject.descriptor.ServerDescriptor;
 
+import org.apache.commons.math3.stat.descriptive.rank.Median;
 import org.apache.commons.math3.stat.descriptive.rank.Percentile;
 
+import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -138,6 +142,39 @@ public class Main {
     }
     descriptorReader.saveHistoryFile(historyFile);
     bw.close();
+
+    /* Aggregate statistics. */
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    String today = dateFormat.format(new Date());
+    SortedMap<String, List<Long>> preAggregatedValues = new TreeMap<>();
+    try (BufferedReader br = new BufferedReader(new FileReader(resultsFile))) {
+      br.readLine(); /* Skip header. */
+      String line;
+      while (null != (line = br.readLine())) {
+        String[] parts = line.split(",");
+        String date = parts[0].substring(0, 10);
+        if (date.compareTo(today) >= 0) {
+          continue;
+        }
+        String isExit = parts[1].equals("TRUE") ? "t" : "";
+        String keyWithoutTime = String.format("%s,%s,%s,%s",
+            date, isExit, parts[2], parts[3]);
+        long value = Long.parseLong(parts[4]);
+        preAggregatedValues.putIfAbsent(keyWithoutTime, new ArrayList<>());
+        preAggregatedValues.get(keyWithoutTime).add(value);
+      }
+    }
+    File aggregateResultsFile = new File("stats/advbwdist.csv");
+    aggregateResultsFile.getParentFile().mkdirs();
+    try (BufferedWriter bw2 = new BufferedWriter(
+        new FileWriter(aggregateResultsFile))) {
+      bw2.write("date,isexit,relay,percentile,advbw\n");
+      for (Map.Entry<String, List<Long>> e : preAggregatedValues.entrySet()) {
+        bw2.write(String.format("%s,%.0f%n", e.getKey(),
+            computeMedian(e.getValue())));
+      }
+    }
   }
 
   /** Compute percentiles (between 0 and 100) for the given list of values, and
@@ -168,5 +205,14 @@ public class Main {
     }
     return computedPercentiles;
   }
+
+  /** Return the median for the given list of values, rounded down to a whole
+   * number, or <code>Double.NaN</code> if the given list is empty. */
+  static double computeMedian(List<Long> valueList) {
+    Median median = new Median()
+        .withEstimationType(Percentile.EstimationType.R_7);
+    median.setData(valueList.stream().mapToDouble(Long::doubleValue).toArray());
+    return Math.floor(median.evaluate());
+  }
 }
 



More information about the tor-commits mailing list