tor-commits
November 2019
[metrics-web/release] Rewrite censorship detector in Java.
by karsten@torproject.org 09 Nov '19
commit a367168a782e864bdacb610857b1dc5d58fd192d
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Sun Dec 9 12:02:42 2018 +0100
Rewrite censorship detector in Java.
This allows us to remove the last remaining Python parts from the daily
updater.
Implements #21588.
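For illustration, the statistical model that the new Detector class
implements can be condensed to a few lines: week-over-week ratios of
client numbers in the largest countries are summarized by a normal
distribution, the client count itself is modeled as a Poisson
distribution around the estimate from one week earlier, and the
expected range is the product of extreme quantiles of both. The ratio
values and reference count in this sketch are made up for
illustration:

import org.apache.commons.math3.distribution.NormalDistribution;
import org.apache.commons.math3.distribution.PoissonDistribution;
import org.apache.commons.math3.stat.descriptive.moment.Mean;
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;

public class DetectorSketch {
  public static void main(String[] args) {
    /* Hypothetical week-over-week ratios clients(t) / clients(t - 7)
     * for the largest countries on one date, after outlier removal. */
    double[] ratios = {0.95, 0.98, 1.00, 1.01, 1.02, 1.03, 1.05, 1.10};
    /* Hypothetical client estimate for one country one week earlier. */
    int referenceClients = 10000;
    /* Summarize the ratios by a normal distribution... */
    double mu = new Mean().evaluate(ratios);
    double sigma = new StandardDeviation(false).evaluate(ratios);
    NormalDistribution normal = new NormalDistribution(mu, sigma);
    /* ...and model the client count as Poisson around the reference. */
    PoissonDistribution poisson = new PoissonDistribution(referenceClients);
    /* The expected range multiplies the extreme quantiles of both. */
    int lower = Math.max(0,
        (int) (normal.inverseCumulativeProbability(0.0001)
            * poisson.inverseCumulativeProbability(0.0001)));
    int upper = (int) (normal.inverseCumulativeProbability(0.9999)
        * poisson.inverseCumulativeProbability(0.9999));
    System.out.printf("expected range: [%d, %d]%n", lower, upper);
  }
}

Observed client numbers falling outside that range are the anomalies
the detector reports.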
---
build.xml | 26 --
.../torproject/metrics/stats/clients/Detector.java | 433 +++++++++++++++++++++
.../org/torproject/metrics/stats/clients/Main.java | 5 +
src/main/python/clients/country_info.py | 255 ------------
src/main/python/clients/detector.py | 242 ------------
5 files changed, 438 insertions(+), 523 deletions(-)
diff --git a/build.xml b/build.xml
index 6736e19..93eda7b 100644
--- a/build.xml
+++ b/build.xml
@@ -23,7 +23,6 @@
<property name="tardepends" value="war" />
<property name="Rsources" value="${basedir}/src/main/R" />
- <property name="pysources" value="${basedir}/src/main/python" />
<property name="specdir" value="${basedir}/generated/spec" />
@@ -360,32 +359,7 @@
<target name="clients" >
<property name="module.name" value="clients" />
- <property name="localmoddir" value="${modulebase}/${module.name}" />
-
- <property name="statsdir"
- value="${localmoddir}/stats" />
- <mkdir dir="${statsdir}" />
-
<antcall target="run-java" />
-
- <antcall target="run-R" >
- <param name="module.Rscript" value="userstats-detector.R" />
- </antcall>
-
- <exec executable="python"
- dir="${localmoddir}"
- failonerror="true" >
- <arg value="${pysources}/${module.name}/detector.py" />
- <arg value="userstats-detector.csv" />
- <arg value="userstats-ranges.csv" />
- </exec>
-
- <antcall target="run-R" >
- <param name="module.Rscript" value="merge-clients.R" />
- </antcall>
-
- <copy file="${localmoddir}/clients.csv" todir="${statsdir}" />
- <copy file="${localmoddir}/userstats-combined.csv" todir="${statsdir}" />
</target>
<target name="servers" >
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Detector.java b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
new file mode 100644
index 0000000..1a523c2
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/clients/Detector.java
@@ -0,0 +1,433 @@
+/* Copyright 2011 George Danezis <gdane(a)microsoft.com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted (subject to the limitations in the
+ * disclaimer below) provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of <Owner Organization> nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+ * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * (Clear BSD license:
+ * http://labs.metacarta.com/license-explanation.html#license)
+ *
+ * Copyright 2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.clients;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.distribution.PoissonDistribution;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.LocalDate;
+import java.time.format.DateTimeParseException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.stream.Collectors;
+
+/** Censorship detector that reads a .csv file of the number of Tor clients and
+ * finds anomalies that might be indicative of censorship. */
+public class Detector {
+
+ /** Input file. */
+ private static final Path INPUT_PATH = Paths.get("stats", "userstats.csv");
+
+ /** Output file. */
+ private static final Path OUTPUT_PATH = Paths.get("stats", "clients.csv");
+
+ /** Number of largest locations to be included in the detection algorithm. */
+ private static final int NUM_LARGEST_LOCATIONS = 50;
+
+ /** Time interval in days to model connection rates. */
+ private static final int INTERV = 7;
+
+ /** Compound key under which client estimates are stored in both input and
+ * output files. */
+ private static class ClientsKey implements Comparable<ClientsKey> {
+
+ /** Date when clients connected to the Tor network. */
+ private LocalDate date;
+
+ /** Whether clients connected via relays (true) or bridges (false). */
+ private boolean nodeIsRelay;
+
+ /** Two-letter lower-case country code of the country from which clients
+ * connected, "??" if the country could not be resolved, or left empty for
+ * all countries together. */
+ private String country;
+
+ /** Name of the transport used by clients to connect using bridges, or left
+ * empty for all transports together. */
+ private String transport = "";
+
+ /** IP version used by clients to connect using bridges, or left empty for
+ * all IP versions together. */
+ private String version = "";
+
+ ClientsKey(LocalDate date, boolean nodeIsRelay, String country) {
+ this.date = date;
+ this.nodeIsRelay = nodeIsRelay;
+ this.country = country;
+ }
+
+ ClientsKey(LocalDate date, boolean nodeIsRelay, String country,
+ String transport, String version) {
+ this(date, nodeIsRelay, country);
+ this.transport = transport;
+ this.version = version;
+ }
+
+ @Override
+ public int compareTo(ClientsKey other) {
+ if (!this.date.equals(other.date)) {
+ return this.date.compareTo(other.date);
+ } else if (!this.nodeIsRelay && other.nodeIsRelay) {
+ return -1;
+ } else if (this.nodeIsRelay && !other.nodeIsRelay) {
+ return 1;
+ } else if (!this.country.equals(other.country)) {
+ return this.country.compareTo(other.country);
+ } else if (!this.transport.equals(other.transport)) {
+ return this.transport.compareTo(other.transport);
+ } else if (!this.version.equals(other.version)) {
+ return this.version.compareTo(other.version);
+ } else {
+ return 0;
+ }
+ }
+
+ @Override
+ public boolean equals(Object otherObject) {
+ if (!(otherObject instanceof ClientsKey)) {
+ return false;
+ } else {
+ ClientsKey other = (ClientsKey) otherObject;
+ return this.date.equals(other.date)
+ && this.nodeIsRelay == other.nodeIsRelay
+ && this.country.equals(other.country)
+ && this.transport.equals(other.transport)
+ && this.version.equals(other.version);
+ }
+ }
+
+ @Override
+ public int hashCode() {
+ return 3 * this.date.hashCode() + (this.nodeIsRelay ? 5 : 0)
+ + 7 * this.country.hashCode() + 11 * this.transport.hashCode()
+ + 13 * this.version.hashCode();
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%s,%s,%s,%s,%s",
+ this.date.toString(), this.nodeIsRelay ? "relay" : "bridge",
+ this.country, this.transport, this.version);
+ }
+ }
+
+ /** Value class that stores everything we already knew about a specific
+ * subset of clients from the input file. */
+ private static class ClientsEstimates {
+
+ /** Estimated number of clients. */
+ private int clients;
+
+ /** Fraction of relays or bridges in percent that the estimate is based on,
+ * between 0 and 100. */
+ private int frac;
+
+ ClientsEstimates(int clients, int frac) {
+ this.clients = clients;
+ this.frac = frac;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%d,%d", this.clients, this.frac);
+ }
+ }
+
+ /** Value class that stores everything we're computing here about a specific
+ * subset of clients from the input file. */
+ private static class ClientsRanges {
+
+ /** Lower number of expected clients under the assumption that there has
+ * been no censorship event, as computed here. */
+ private int lower;
+
+ /** Upper number of expected clients under the assumption that there has
+ * been no release of censorship, as computed here. */
+ private int upper;
+
+ ClientsRanges(int lower, int upper) {
+ this.lower = lower;
+ this.upper = upper;
+ }
+
+ @Override
+ public String toString() {
+ return String.format("%d,%d", this.lower, this.upper);
+ }
+ }
+
+ /** Run censorship detection. */
+ public void detect() throws IOException {
+ SortedMap<ClientsKey, ClientsEstimates> estimates = readInputFile();
+ Set<String> largestLocations = findLargestLocations(estimates);
+ Map<LocalDate, List<Double>> ratios = computeRatiosOfLargestLocations(
+ estimates, largestLocations);
+ Map<LocalDate, List<Double>> ratiosWithoutOutliers = removeOutliers(ratios);
+ SortedMap<ClientsKey, ClientsRanges> ranges = computeRanges(estimates,
+ ratiosWithoutOutliers);
+ writeOutputFile(estimates, ranges);
+ }
+
+ /** Read and return the parsed input file containing comma-separated estimates
+ * of client numbers. */
+ private static SortedMap<ClientsKey, ClientsEstimates> readInputFile()
+ throws IOException {
+ SortedMap<ClientsKey, ClientsEstimates> estimates = new TreeMap<>();
+ File inputFile = INPUT_PATH.toFile();
+ if (!inputFile.exists()) {
+ throw new IOException(String.format("Input file %s does not exist.",
+ inputFile));
+ }
+ try (LineNumberReader lnr = new LineNumberReader(
+ new FileReader(inputFile))) {
+ String line = lnr.readLine();
+ if (!"date,node,country,transport,version,frac,users".equals(line)) {
+ throw new IOException(String.format("Unable to read input file %s with "
+ + "unrecognized header line '%s'. Not running detector.", inputFile,
+ line));
+ }
+ while ((line = lnr.readLine()) != null) {
+ ClientsKey key = null;
+ ClientsEstimates value = null;
+ boolean invalidLine = false;
+ String[] lineParts = line.split(",");
+ if (lineParts.length == 7) {
+ try {
+ LocalDate date = LocalDate.parse(lineParts[0]);
+ boolean nodeIsRelay = false;
+ if ("relay".equals(lineParts[1])) {
+ nodeIsRelay = true;
+ } else if (!"bridge".equals(lineParts[1])) {
+ invalidLine = true;
+ }
+ String country = lineParts[2].replaceAll("\"", "");
+ String transport = lineParts[3].replaceAll("\"", "");
+ String version = lineParts[4].replaceAll("\"", "");
+ key = new ClientsKey(date, nodeIsRelay, country, transport,
+ version);
+ } catch (DateTimeParseException e) {
+ invalidLine = true;
+ }
+ try {
+ int frac = Integer.parseInt(lineParts[5]);
+ int clients = Integer.parseInt(lineParts[6]);
+ value = new ClientsEstimates(clients, frac);
+ } catch (NumberFormatException e) {
+ invalidLine = true;
+ }
+ } else {
+ invalidLine = true;
+ }
+ if (invalidLine) {
+ throw new IOException(String.format(
+ "Invalid line %d '%s' in input file %s.", lnr.getLineNumber(),
+ line, inputFile));
+ } else {
+ estimates.put(key, value);
+ }
+ }
+ }
+ return estimates;
+ }
+
+ /** Return the NUM_LARGEST_LOCATIONS countries (except for "??") with the
+ * largest number of estimated clients on the last known date in the input
+ * data set.
+ *
+ * <p>Note that this implies that lower/upper values are going to change,
+ * depending on which countries had most clients on the last known date in the
+ * input data set.</p> */
+ private static Set<String> findLargestLocations(
+ SortedMap<ClientsKey, ClientsEstimates> clients) throws IOException {
+ LocalDate lastKnownDate = clients.keySet().stream()
+ .filter(c -> c.nodeIsRelay)
+ .map(c -> c.date)
+ .max(LocalDate::compareTo)
+ .orElseThrow(() -> new IOException("Unable to find maximum date. Was "
+ + "the input file empty or otherwise corrupt?"));
+ return clients.entrySet().stream()
+ .filter(c -> lastKnownDate.equals(c.getKey().date))
+ .filter(c -> c.getKey().nodeIsRelay)
+ .filter(c -> !"".equals(c.getKey().country))
+ .filter(c -> !"??".equals(c.getKey().country))
+ .sorted((c1, c2) -> Integer.compare(c2.getValue().clients,
+ c1.getValue().clients))
+ .map(c -> c.getKey().country)
+ .limit(NUM_LARGEST_LOCATIONS)
+ .collect(Collectors.toSet());
+ }
+
+ /** Compute the ratio of the client number estimate for a given date and
+ * country as compared to 1 week before, for all dates, for relay users, and
+ * for the largest locations. */
+ private static Map<LocalDate, List<Double>> computeRatiosOfLargestLocations(
+ SortedMap<ClientsKey, ClientsEstimates> estimates,
+ Set<String> largestLocations) {
+ Map<LocalDate, List<Double>> ratios = new HashMap<>();
+ for (Map.Entry<ClientsKey, ClientsEstimates> numerator
+ : estimates.entrySet()) {
+ if (!numerator.getKey().nodeIsRelay
+ || !largestLocations.contains(numerator.getKey().country)) {
+ continue;
+ }
+ ClientsEstimates denominator = estimates.get(new ClientsKey(
+ numerator.getKey().date.minusDays(INTERV), true,
+ numerator.getKey().country));
+ if (null == denominator || denominator.clients == 0) {
+ continue;
+ }
+ if (!ratios.containsKey(numerator.getKey().date)) {
+ ratios.put(numerator.getKey().date, new ArrayList<>());
+ }
+ ratios.get(numerator.getKey().date).add(
+ ((double) numerator.getValue().clients)
+ / (double) denominator.clients);
+ }
+ return ratios;
+ }
+
+ /** Exclude outliers from the given ratios by date that fall outside four
+ * inter-quartile ranges of the median and make sure that at least 8 ratio
+ * values remain. */
+ private static SortedMap<LocalDate, List<Double>> removeOutliers(
+ Map<LocalDate, List<Double>> ratios) {
+ SortedMap<LocalDate, List<Double>> ratiosWithoutOutliers = new TreeMap<>();
+ for (Map.Entry<LocalDate, List<Double>> e : ratios.entrySet()) {
+ double[] values = e.getValue().stream().mapToDouble(Double::doubleValue)
+ .toArray();
+ Percentile percentile = new Percentile()
+ .withEstimationType(Percentile.EstimationType.R_7);
+ percentile.setData(values);
+ double median = percentile.evaluate(50.0);
+ double firstQuarter = percentile.evaluate(25.0);
+ double thirdQuarter = percentile.evaluate(75.0);
+ double interQuartileRange = thirdQuarter - firstQuarter;
+ List<Double> valuesWithoutOutliers = new ArrayList<>();
+ for (double value : values) {
+ if (value > median - 4 * interQuartileRange
+ && value < median + 4 * interQuartileRange) {
+ valuesWithoutOutliers.add(value);
+ }
+ }
+ if (valuesWithoutOutliers.size() < 8) {
+ continue;
+ }
+ LocalDate date = e.getKey();
+ ratiosWithoutOutliers.put(date, valuesWithoutOutliers);
+ }
+ return ratiosWithoutOutliers;
+ }
+
+ /** Compute ranges as the expected minimum and maximum number of users. */
+ private static SortedMap<ClientsKey, ClientsRanges> computeRanges(
+ SortedMap<ClientsKey, ClientsEstimates> estimates,
+ Map<LocalDate, List<Double>> ratiosWithoutOutliers) {
+ SortedMap<ClientsKey, ClientsRanges> ranges = new TreeMap<>();
+ for (Map.Entry<ClientsKey, ClientsEstimates> estimatesEntry
+ : estimates.entrySet()) {
+ LocalDate date = estimatesEntry.getKey().date;
+ if (!estimatesEntry.getKey().nodeIsRelay
+ || "".equals(estimatesEntry.getKey().country)
+ || "??".equals(estimatesEntry.getKey().country)
+ || !ratiosWithoutOutliers.containsKey(date)) {
+ continue;
+ }
+ ClientsEstimates referenceEstimate = estimates.get(
+ new ClientsKey(date.minusDays(INTERV),
+ true, estimatesEntry.getKey().country));
+ if (null == referenceEstimate || referenceEstimate.clients == 0) {
+ continue;
+ }
+ double[] values = ratiosWithoutOutliers.get(date).stream()
+ .mapToDouble(Double::doubleValue).toArray();
+ double mean = new Mean().evaluate(values);
+ double std = new StandardDeviation(false).evaluate(values);
+ NormalDistribution normalDistribution = new NormalDistribution(mean, std);
+ PoissonDistribution poissonDistribution
+ = new PoissonDistribution(referenceEstimate.clients);
+ int lower = Math.max(0,
+ (int) (normalDistribution.inverseCumulativeProbability(0.0001)
+ * poissonDistribution.inverseCumulativeProbability(0.0001)));
+ int upper =
+ (int) (normalDistribution.inverseCumulativeProbability(0.9999)
+ * poissonDistribution.inverseCumulativeProbability(0.9999));
+ ranges.put(estimatesEntry.getKey(), new ClientsRanges(lower, upper));
+ }
+ return ranges;
+ }
+
+ /** Write client number estimates together with lower and upper bounds as
+ * comma-separated values to the output file. */
+ private static void writeOutputFile(
+ SortedMap<ClientsKey, ClientsEstimates> estimates,
+ SortedMap<ClientsKey, ClientsRanges> ranges) throws IOException {
+ try (BufferedWriter bw = new BufferedWriter(
+ new FileWriter(OUTPUT_PATH.toFile()))) {
+ bw.write(
+ "date,node,country,transport,version,lower,upper,clients,frac\n");
+ for (Map.Entry<ClientsKey, ClientsEstimates> e : estimates.entrySet()) {
+ String rangesString = ",";
+ if (ranges.containsKey(e.getKey())) {
+ rangesString = ranges.get(e.getKey()).toString();
+ }
+ bw.write(String.format("%s,%s,%s%n", e.getKey().toString(),
+ rangesString, e.getValue().toString()));
+ }
+ }
+ }
+}
+
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Main.java b/src/main/java/org/torproject/metrics/stats/clients/Main.java
index 48d8d8d..0f1087b 100644
--- a/src/main/java/org/torproject/metrics/stats/clients/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/clients/Main.java
@@ -59,6 +59,11 @@ public class Main {
log.info("Disconnecting from database.");
database.close();
+
+ log.info("Running detector.");
+ new Detector().detect();
+
+ log.info("Terminating clients module.");
}
private static final long ONE_HOUR_MILLIS = 60L * 60L * 1000L;
diff --git a/src/main/python/clients/country_info.py b/src/main/python/clients/country_info.py
deleted file mode 100644
index 1a505d0..0000000
--- a/src/main/python/clients/country_info.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# -*- coding: utf-8 -*-
-
-countries = {
- "ad" : "Andorra",
- "ae" : "the United Arab Emirates",
- "af" : "Afghanistan",
- "ag" : "Antigua and Barbuda",
- "ai" : "Anguilla",
- "al" : "Albania",
- "am" : "Armenia",
- "an" : "the Netherlands Antilles",
- "ao" : "Angola",
- "aq" : "Antarctica",
- "ar" : "Argentina",
- "as" : "American Samoa",
- "at" : "Austria",
- "au" : "Australia",
- "aw" : "Aruba",
- "ax" : "the Aland Islands",
- "az" : "Azerbaijan",
- "ba" : "Bosnia and Herzegovina",
- "bb" : "Barbados",
- "bd" : "Bangladesh",
- "be" : "Belgium",
- "bf" : "Burkina Faso",
- "bg" : "Bulgaria",
- "bh" : "Bahrain",
- "bi" : "Burundi",
- "bj" : "Benin",
- "bl" : "Saint Bartelemey",
- "bm" : "Bermuda",
- "bn" : "Brunei",
- "bo" : "Bolivia",
- "bq" : "Bonaire, Sint Eustatius and Saba",
- "br" : "Brazil",
- "bs" : "the Bahamas",
- "bt" : "Bhutan",
- "bv" : "the Bouvet Island",
- "bw" : "Botswana",
- "by" : "Belarus",
- "bz" : "Belize",
- "ca" : "Canada",
- "cc" : "the Cocos (Keeling) Islands",
- "cd" : "the Democratic Republic of the Congo",
- "cf" : "Central African Republic",
- "cg" : "Congo",
- "ch" : "Switzerland",
- "ci" : u"Côte d'Ivoire",
- "ck" : "the Cook Islands",
- "cl" : "Chile",
- "cm" : "Cameroon",
- "cn" : "China",
- "co" : "Colombia",
- "cr" : "Costa Rica",
- "cu" : "Cuba",
- "cv" : "Cape Verde",
- "cw" : u"Curaçao",
- "cx" : "the Christmas Island",
- "cy" : "Cyprus",
- "cz" : "the Czech Republic",
- "de" : "Germany",
- "dj" : "Djibouti",
- "dk" : "Denmark",
- "dm" : "Dominica",
- "do" : "the Dominican Republic",
- "dz" : "Algeria",
- "ec" : "Ecuador",
- "ee" : "Estonia",
- "eg" : "Egypt",
- "eh" : "the Western Sahara",
- "er" : "Eritrea",
- "es" : "Spain",
- "et" : "Ethiopia",
- "fi" : "Finland",
- "fj" : "Fiji",
- "fk" : "the Falkland Islands (Malvinas)",
- "fm" : "the Federated States of Micronesia",
- "fo" : "the Faroe Islands",
- "fr" : "France",
- "ga" : "Gabon",
- "gb" : "the United Kingdom",
- "gd" : "Grenada",
- "ge" : "Georgia",
- "gf" : "French Guiana",
- "gg" : "Guernsey",
- "gh" : "Ghana",
- "gi" : "Gibraltar",
- "gl" : "Greenland",
- "gm" : "Gambia",
- "gn" : "Guinea",
- "gp" : "Guadeloupe",
- "gq" : "Equatorial Guinea",
- "gr" : "Greece",
- "gs" : "South Georgia and the South Sandwich Islands",
- "gt" : "Guatemala",
- "gu" : "Guam",
- "gw" : "Guinea-Bissau",
- "gy" : "Guyana",
- "hk" : "Hong Kong",
- "hm" : "Heard Island and McDonald Islands",
- "hn" : "Honduras",
- "hr" : "Croatia",
- "ht" : "Haiti",
- "hu" : "Hungary",
- "id" : "Indonesia",
- "ie" : "Ireland",
- "il" : "Israel",
- "im" : "the Isle of Man",
- "in" : "India",
- "io" : "the British Indian Ocean Territory",
- "iq" : "Iraq",
- "ir" : "Iran",
- "is" : "Iceland",
- "it" : "Italy",
- "je" : "Jersey",
- "jm" : "Jamaica",
- "jo" : "Jordan",
- "jp" : "Japan",
- "ke" : "Kenya",
- "kg" : "Kyrgyzstan",
- "kh" : "Cambodia",
- "ki" : "Kiribati",
- "km" : "Comoros",
- "kn" : "Saint Kitts and Nevis",
- "kp" : "North Korea",
- "kr" : "the Republic of Korea",
- "kw" : "Kuwait",
- "ky" : "the Cayman Islands",
- "kz" : "Kazakhstan",
- "la" : "Laos",
- "lb" : "Lebanon",
- "lc" : "Saint Lucia",
- "li" : "Liechtenstein",
- "lk" : "Sri Lanka",
- "lr" : "Liberia",
- "ls" : "Lesotho",
- "lt" : "Lithuania",
- "lu" : "Luxembourg",
- "lv" : "Latvia",
- "ly" : "Libya",
- "ma" : "Morocco",
- "mc" : "Monaco",
- "md" : "the Republic of Moldova",
- "me" : "Montenegro",
- "mf" : "Saint Martin",
- "mg" : "Madagascar",
- "mh" : "the Marshall Islands",
- "mk" : "Macedonia",
- "ml" : "Mali",
- "mm" : "Burma",
- "mn" : "Mongolia",
- "mo" : "Macau",
- "mp" : "the Northern Mariana Islands",
- "mq" : "Martinique",
- "mr" : "Mauritania",
- "ms" : "Montserrat",
- "mt" : "Malta",
- "mu" : "Mauritius",
- "mv" : "the Maldives",
- "mw" : "Malawi",
- "mx" : "Mexico",
- "my" : "Malaysia",
- "mz" : "Mozambique",
- "na" : "Namibia",
- "nc" : "New Caledonia",
- "ne" : "Niger",
- "nf" : "Norfolk Island",
- "ng" : "Nigeria",
- "ni" : "Nicaragua",
- "nl" : "the Netherlands",
- "no" : "Norway",
- "np" : "Nepal",
- "nr" : "Nauru",
- "nu" : "Niue",
- "nz" : "New Zealand",
- "om" : "Oman",
- "pa" : "Panama",
- "pe" : "Peru",
- "pf" : "French Polynesia",
- "pg" : "Papua New Guinea",
- "ph" : "the Philippines",
- "pk" : "Pakistan",
- "pl" : "Poland",
- "pm" : "Saint Pierre and Miquelon",
- "pn" : "the Pitcairn Islands",
- "pr" : "Puerto Rico",
- "ps" : "the Palestinian Territory",
- "pt" : "Portugal",
- "pw" : "Palau",
- "py" : "Paraguay",
- "qa" : "Qatar",
- "re" : "Reunion",
- "ro" : "Romania",
- "rs" : "Serbia",
- "ru" : "Russia",
- "rw" : "Rwanda",
- "sa" : "Saudi Arabia",
- "sb" : "the Solomon Islands",
- "sc" : "the Seychelles",
- "sd" : "Sudan",
- "se" : "Sweden",
- "sg" : "Singapore",
- "sh" : "Saint Helena",
- "si" : "Slovenia",
- "sj" : "Svalbard and Jan Mayen",
- "sk" : "Slovakia",
- "sl" : "Sierra Leone",
- "sm" : "San Marino",
- "sn" : "Senegal",
- "so" : "Somalia",
- "sr" : "Suriname",
- "ss" : "South Sudan",
- "st" : u"São Tomé and Príncipe",
- "sv" : "El Salvador",
- "sx" : "Sint Maarten",
- "sy" : "the Syrian Arab Republic",
- "sz" : "Swaziland",
- "tc" : "Turks and Caicos Islands",
- "td" : "Chad",
- "tf" : "the French Southern Territories",
- "tg" : "Togo",
- "th" : "Thailand",
- "tj" : "Tajikistan",
- "tk" : "Tokelau",
- "tl" : "East Timor",
- "tm" : "Turkmenistan",
- "tn" : "Tunisia",
- "to" : "Tonga",
- "tr" : "Turkey",
- "tt" : "Trinidad and Tobago",
- "tv" : "Tuvalu",
- "tw" : "Taiwan",
- "tz" : "the United Republic of Tanzania",
- "ua" : "Ukraine",
- "ug" : "Uganda",
- "um" : "the United States Minor Outlying Islands",
- "us" : "the United States",
- "uy" : "Uruguay",
- "uz" : "Uzbekistan",
- "va" : "Vatican City",
- "vc" : "Saint Vincent and the Grenadines",
- "ve" : "Venezuela",
- "vg" : "the British Virgin Islands",
- "vi" : "the United States Virgin Islands",
- "vn" : "Vietnam",
- "vu" : "Vanuatu",
- "wf" : "Wallis and Futuna",
- "ws" : "Samoa",
- "xk" : "Kosovo",
- "ye" : "Yemen",
- "yt" : "Mayotte",
- "za" : "South Africa",
- "zm" : "Zambia",
- "zw" : "Zimbabwe"
- }
diff --git a/src/main/python/clients/detector.py b/src/main/python/clients/detector.py
deleted file mode 100644
index b0a98af..0000000
--- a/src/main/python/clients/detector.py
+++ /dev/null
@@ -1,242 +0,0 @@
-## Copyright (c) 2011 George Danezis <gdane(a)microsoft.com>
-##
-## All rights reserved.
-##
-## Redistribution and use in source and binary forms, with or without
-## modification, are permitted (subject to the limitations in the
-## disclaimer below) provided that the following conditions are met:
-##
-## * Redistributions of source code must retain the above copyright
-## notice, this list of conditions and the following disclaimer.
-##
-## * Redistributions in binary form must reproduce the above copyright
-## notice, this list of conditions and the following disclaimer in the
-## documentation and/or other materials provided with the
-## distribution.
-##
-## * Neither the name of <Owner Organization> nor the names of its
-## contributors may be used to endorse or promote products derived
-## from this software without specific prior written permission.
-##
-## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
-## HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-## DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-## LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
-## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
-## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-##
-## (Clear BSD license: http://labs.metacarta.com/license-explanation.html#license)
-
-## This script reads a .csv file of the number of Tor users and finds
-## anomalies that might be indicative of censorship.
-
-# Dep: numpy
-import numpy
-from numpy import mean, std, percentile
-
-# Dep: scipy
-import scipy.stats
-from scipy.stats.distributions import norm
-from scipy.stats.distributions import poisson
-
-# Std lib
-from datetime import date
-from datetime import timedelta
-import os.path
-
-# Country code -> Country names
-import country_info
-
-# write utf8 to file
-import codecs
-
-def get_country_name_from_cc(country_code):
- if (country_code.lower() in country_info.countries):
- return country_info.countries[country_code.lower()]
- return country_code # if we didn't find the cc in our map
-
-"""
-Represents a .csv file containing information on the number of
-connecting Tor users per country.
-
-'store': Dictionary with (<country code>, <counter>) as key, and the number of users as value.
- <country code> can also be "date"...
-'all_dates': List of the data intervals (with default timedelta: 1 day).
-'country_codes': List of all relevant country codes.
-'MAX_INDEX': Length of store, number of country codes etc.
-'date_min': The oldest date found in the .csv.
-'date_min': The latest date found in the .csv.
-"""
-class torstatstore:
- def __init__(self, file_name):
- f = file(file_name)
- country_codes = f.readline()
- country_codes = country_codes.strip().split(",")
-
- store = {}
- MAX_INDEX = 0
- for i, line in enumerate(f):
- MAX_INDEX += 1
- line_parsed = line.strip().split(",")
- for j, (ccode, val) in enumerate(zip(country_codes,line_parsed)):
- processed_val = None
- if ccode == "date":
- try:
- year, month, day = int(val[:4]), int(val[5:7]), int(val[8:10])
- processed_val = date(year, month, day)
- except Exception, e:
- print "Parsing error (ignoring line %s):" % j
- print "%s" % val,e
- break
-
- elif val != "NA":
- processed_val = int(val)
- store[(ccode, i)] = processed_val
-
- # min and max
- date_min = store[("date", 0)]
- date_max = store[("date", i)]
-
- all_dates = []
- d = date_min
- dt = timedelta(days=1)
- while d <= date_max:
- all_dates += [d]
- d = d + dt
-
- # Save for later
- self.store = store
- self.all_dates = all_dates
- self.country_codes = country_codes
- self.MAX_INDEX = MAX_INDEX
- self.date_min = date_min
- self.date_max = date_max
-
- """Return a list representing a time series of 'ccode' with respect
- to the number of connected users.
- """
- def get_country_series(self, ccode):
- assert ccode in self.country_codes
- series = {}
- for d in self.all_dates:
- series[d] = None
- for i in range(self.MAX_INDEX):
- series[self.store[("date", i)]] = self.store[(ccode, i)]
- sx = []
- for d in self.all_dates:
- sx += [series[d]]
- return sx
-
- """Return an ordered list containing tuples of the form (<number of
- users>, <country code>). The list is ordered with respect to the
- number of users for each country.
- """
- def get_largest(self, number):
- exclude = set(["all", "??", "date"])
- l = [(self.store[(c, self.MAX_INDEX-1)], c) for c in self.country_codes if c not in exclude]
- l.sort()
- l.reverse()
- return l[:number]
-
- """Return a dictionary, with <country code> as key, and the time
- series of the country code as the value.
- """
- def get_largest_locations(self, number):
- l = self.get_largest(number)
- res = {}
- for _, ccode in l[:number]:
- res[ccode] = self.get_country_series(ccode)
- return res
-
-"""Return a list containing lists (?) where each such list contains
-the difference in users for a time delta of 'days'
-"""
-def n_day_rel(series, days):
- rel = []
- for i, v in enumerate(series):
- if series[i] is None:
- rel += [None]
- continue
-
- if i - days < 0 or series[i-days] is None or series[i-days] == 0:
- rel += [None]
- else:
- rel += [ float(series[i]) / series[i-days]]
- return rel
-
-# Main model: computes the expected min / max range of number of users
-def make_tendencies_minmax(l, INTERVAL = 1):
- lminus1 = dict([(ccode, n_day_rel(l[ccode], INTERVAL)) for ccode in l])
- c = lminus1[lminus1.keys()[0]]
- dists = []
- minx = []
- maxx = []
- for i in range(len(c)):
- vals = [lminus1[ccode][i] for ccode in lminus1.keys() if lminus1[ccode][i] != None]
- if len(vals) < 8:
- dists += [None]
- minx += [None]
- maxx += [None]
- else:
- vals.sort()
- median = percentile(vals, 50)
- q1 = percentile(vals, 25)
- q2 = percentile(vals, 75)
- qd = q2 - q1
- vals = [v for v in vals if median - qd*4 < v and v < median + qd*4]
- if len(vals) < 8:
- dists += [None]
- minx += [None]
- maxx += [None]
- continue
- mu = mean(vals)
- signma = std(vals)
- dists += [(mu, signma)]
- maxx += [norm.ppf(0.9999, mu, signma)]
- minx += [norm.ppf(1 - 0.9999, mu, signma)]
- ## print minx[-1], maxx[-1]
- return minx, maxx
-
-"""Write a CSV report on the minimum/maximum users of each country per date."""
-def write_all(tss, minc, maxc, RANGES_FILE, INTERVAL=7):
- ranges_file = file(RANGES_FILE, "w")
- ranges_file.write("date,country,minusers,maxusers\n")
- exclude = set(["all", "??", "date"])
- for c in tss.country_codes:
- if c in exclude:
- continue
- series = tss.get_country_series(c)
- for i, v in enumerate(series):
- if i > 0 and i - INTERVAL >= 0 and series[i] != None and series[i-INTERVAL] != None and series[i-INTERVAL] != 0 and minc[i]!= None and maxc[i]!= None:
- minv = minc[i] * poisson.ppf(1-0.9999, series[i-INTERVAL])
- maxv = maxc[i] * poisson.ppf(0.9999, series[i-INTERVAL])
- if not minv < maxv:
- print minv, maxv, series[i-INTERVAL], minc[i], maxc[i]
- assert minv < maxv
- if minv < 0.0:
- minv = 0.0
- ranges_file.write("%s,%s,%s,%s\n" % (tss.all_dates[i], c, minv, maxv))
- ranges_file.close()
-
-# INTERV is the time interval to model connection rates;
-# consider maximum DAYS days back.
-def detect(CSV_FILE = "userstats-detector.csv",
- RANGES_FILE = "userstats-ranges.csv",
- INTERV = 7, DAYS = 6 * 31):
- tss = torstatstore(CSV_FILE)
- l = tss.get_largest_locations(50)
- minx, maxx = make_tendencies_minmax(l, INTERV)
- write_all(tss, minx, maxx, RANGES_FILE, INTERV)
-
-def main():
- detect()
-
-if __name__ == "__main__":
- main()
[metrics-web/release] Re-add missing COMMIT commands to bwhist module.
by karsten@torproject.org 09 Nov '19
commit 9dd35e29084ed9380cb374c80a4f9bfb0d9a91e2
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Dec 20 11:28:46 2018 +0100
Re-add missing COMMIT commands to bwhist module.
Last month, in commit f8fa108, where we modernized the legacy module
and renamed it to bwhist, we split the old closeConnection() method
into commit(), which commits changes, and closeConnection(), which
closes the connection. However, we forgot to invoke the new commit()
method.
This had two effects:
1. Newly added data was not made persistent in the database. This
led to a moving window of roughly one week for new data and an
increasing gap between the last committed data and this 1-week
window.
2. The result of aggregating newly added data was not made
persistent. So, even after fixing the first issue above, we
accumulated newly added data, rather than only keeping the most
recent two weeks. This made the database slower over time.
This change adds two commit() calls at the right places.
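The underlying JDBC contract is worth spelling out: once auto-commit
is disabled on a connection, inserts and stored-procedure calls become
persistent only when commit() is invoked on the connection; if the
connection is closed first, pending changes are typically rolled back.
A minimal sketch of the corrected pattern follows, with a hypothetical
connection string and table name standing in for bwhist's own schema:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

public class CommitSketch {
  public static void main(String[] args) throws SQLException {
    /* Hypothetical connection string and table name. */
    try (Connection conn
        = DriverManager.getConnection("jdbc:postgresql:tordir")) {
      conn.setAutoCommit(false);
      try (Statement st = conn.createStatement()) {
        /* Import phase: without the commit() below, these rows are
         * discarded when the connection is closed. */
        st.executeUpdate("INSERT INTO example_history VALUES (1)");
        conn.commit();
        /* Aggregation phase: refresh_all() also writes to tables, so
         * its results need a commit of their own. */
        st.executeQuery("SELECT refresh_all()");
        conn.commit();
      }
    }
  }
}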
---
.../metrics/stats/bwhist/RelayDescriptorDatabaseImporter.java | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/main/java/org/torproject/metrics/stats/bwhist/RelayDescriptorDatabaseImporter.java b/src/main/java/org/torproject/metrics/stats/bwhist/RelayDescriptorDatabaseImporter.java
index a6cf0cc..9f9ecff 100644
--- a/src/main/java/org/torproject/metrics/stats/bwhist/RelayDescriptorDatabaseImporter.java
+++ b/src/main/java/org/torproject/metrics/stats/bwhist/RelayDescriptorDatabaseImporter.java
@@ -532,6 +532,7 @@ public final class RelayDescriptorDatabaseImporter {
this.addExtraInfoDescriptor((ExtraInfoDescriptor) descriptor);
}
}
+ this.commit();
reader.saveHistoryFile(this.historyFile);
}
@@ -615,6 +616,7 @@ public final class RelayDescriptorDatabaseImporter {
void aggregate() throws SQLException {
Statement st = this.conn.createStatement();
st.executeQuery("SELECT refresh_all()");
+ this.commit();
}
/** Query the servers_platforms view. */
[metrics-web/release] Fix by-country CSV files.
by karsten@torproject.org 09 Nov '19
commit 28f567c1ae0458362ac24d491484b81e7bc37afd
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Sat Dec 29 10:07:45 2018 +0100
Fix by-country CSV files.
Read empty country, transport, and version fields as empty strings
rather than NA values, and filter on empty strings accordingly, so
that the by-country CSV files are written correctly again.
Fixes #28945.
---
src/main/R/rserver/graphs.R | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 1ca9357..03b5b93 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -922,14 +922,15 @@ write_userstats_relay_country <- function(start_p = NULL, end_p = NULL,
lower = col_double(),
upper = col_double(),
clients = col_double(),
- frac = col_double())) %>%
+ frac = col_double()),
+ na = character()) %>%
filter(node == "relay") %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(country_p))
country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
- filter(is.na(transport)) %>%
- filter(is.na(version)) %>%
+ filter(transport == "") %>%
+ filter(version == "") %>%
select(date, country, clients, lower, upper, frac) %>%
rename(users = clients) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
@@ -947,14 +948,15 @@ write_userstats_bridge_country <- function(start_p = NULL, end_p = NULL,
lower = col_double(),
upper = col_double(),
clients = col_double(),
- frac = col_double())) %>%
+ frac = col_double()),
+ na = character()) %>%
filter(node == "bridge") %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(country_p))
country == ifelse(country_p == "all", "", country_p) else TRUE) %>%
- filter(is.na(transport)) %>%
- filter(is.na(version)) %>%
+ filter(transport == "") %>%
+ filter(version == "") %>%
select(date, country, clients, frac) %>%
rename(users = clients) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
@@ -1031,7 +1033,8 @@ prepare_userstats_bridge_combined <- function(start_p, end_p, country_p) {
version = col_skip(),
frac = col_double(),
low = col_double(),
- high = col_double())) %>%
+ high = col_double()),
+ na = character()) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(country_p)) country == country_p else TRUE)
[metrics-web/release] Access userstats database from Java only.
by karsten@torproject.org 09 Nov '19
commit f5ef5fb0d8f46e28dc7e8536a11d95d43ee61c08
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Sat Dec 8 11:06:17 2018 +0100
Access userstats database from Java only.
Previously, we used Java to write .sql files, imported them using
psql, and afterwards made queries via psql. Now we're using Java to
interact with the database directly. This is another step towards
making the daily updater Java-only.
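For illustration, the move replaces generated .sql scripts and psql
subprocesses with direct JDBC calls roughly like the following sketch.
The INSERT statement matches the new Database class below; the
simplified query and the omitted parameter binding are illustrative
only:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

public class DirectDatabaseSketch {
  public static void main(String[] args) throws SQLException {
    try (Connection conn
        = DriverManager.getConnection("jdbc:postgresql:userstats")) {
      conn.setAutoCommit(false);
      /* Parameterized insert replaces writing COPY ... FROM stdin
       * scripts that previously had to be piped through psql. */
      try (PreparedStatement ps = conn.prepareStatement(
          "INSERT INTO imported (fingerprint, node, metric, country, "
          + "transport, version, stats_start, stats_end, val) "
          + "VALUES (?, CAST(? AS node), CAST(? AS metric), "
          + "?, ?, ?, ?, ?, ?)")) {
        /* ... set one parameter per column and execute once for each
         * parsed descriptor ... */
      }
      conn.commit();
      /* Querying the aggregated view replaces COPY ... TO STDOUT. */
      try (ResultSet rs = conn.createStatement()
          .executeQuery("SELECT date, users FROM estimated")) {
        while (rs.next()) {
          System.out.println(rs.getDate(1) + "," + rs.getInt(2));
        }
      }
    }
  }
}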
---
build.xml | 22 ---
.../torproject/metrics/stats/clients/Database.java | 156 ++++++++++++++++
.../org/torproject/metrics/stats/clients/Main.java | 206 +++++++--------------
.../torproject/metrics/stats/clients/Writer.java | 42 +++++
4 files changed, 263 insertions(+), 163 deletions(-)
diff --git a/build.xml b/build.xml
index 250417e..6736e19 100644
--- a/build.xml
+++ b/build.xml
@@ -367,28 +367,6 @@
<mkdir dir="${statsdir}" />
<antcall target="run-java" />
- <apply executable="psql" failonerror="true" >
- <arg value="--dbname=userstats"/>
- <arg value="-f"/>
- <fileset dir="${localmoddir}/out"
- includes="*.sql" />
- </apply>
-
- <exec executable="psql"
- dir="${localmoddir}"
- failonerror="true" >
- <arg value="-c COPY (SELECT * FROM estimated) TO STDOUT WITH CSV HEADER;" />
- <arg value="--dbname=userstats"/>
- <arg value="--output=userstats.csv" />
- </exec>
-
- <exec executable="psql"
- dir="${localmoddir}"
- failonerror="true" >
- <arg value="-c COPY (SELECT * FROM combined) TO STDOUT WITH CSV HEADER;" />
- <arg value="--dbname=userstats"/>
- <arg value="--output=userstats-combined.csv" />
- </exec>
<antcall target="run-R" >
<param name="module.Rscript" value="userstats-detector.R" />
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Database.java b/src/main/java/org/torproject/metrics/stats/clients/Database.java
new file mode 100644
index 0000000..7e783dc
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/clients/Database.java
@@ -0,0 +1,156 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.clients;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Timestamp;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.List;
+import java.util.Locale;
+import java.util.TimeZone;
+
+/** Database wrapper to connect to the database, insert data, run the stored
+ * procedure for aggregating data, and query aggregated data as output. */
+class Database implements AutoCloseable {
+
+ /** Database connection string. */
+ private String jdbcString;
+
+ /** Connection object for all interactions with the database. */
+ private Connection connection;
+
+ /** Prepared statement for inserting a platform string into the imported
+ * table. */
+ private PreparedStatement psImportedInsert;
+
+ /** Create a new Database instance and prepare for inserting or querying
+ * data. */
+ Database(String jdbcString) throws SQLException {
+ this.jdbcString = jdbcString;
+ this.connect();
+ this.prepareStatements();
+ }
+
+ private void connect() throws SQLException {
+ this.connection = DriverManager.getConnection(this.jdbcString);
+ this.connection.setAutoCommit(false);
+ }
+
+ private void prepareStatements() throws SQLException {
+ this.psImportedInsert = this.connection.prepareStatement(
+ "INSERT INTO imported (fingerprint, node, metric, country, transport, "
+ + "version, stats_start, stats_end, val) "
+ + "VALUES (?, CAST(? AS node), CAST(? AS metric), ?, ?, ?, ?, ?, ?)");
+ }
+
+ /** Insert into the imported table. */
+ void insertIntoImported(String fingerprint, String node, String metric,
+ String country, String transport, String version, long fromMillis,
+ long toMillis, double val) throws SQLException {
+ if (fromMillis > toMillis) {
+ return;
+ }
+ psImportedInsert.clearParameters();
+ psImportedInsert.setString(1, fingerprint);
+ psImportedInsert.setString(2, node);
+ psImportedInsert.setString(3, metric);
+ psImportedInsert.setString(4, country);
+ psImportedInsert.setString(5, transport);
+ psImportedInsert.setString(6, version);
+ psImportedInsert.setTimestamp(7,
+ Timestamp.from(Instant.ofEpochMilli(fromMillis)));
+ psImportedInsert.setTimestamp(8,
+ Timestamp.from(Instant.ofEpochMilli(toMillis)));
+ psImportedInsert.setDouble(9, Math.round(val * 10.0) / 10.0);
+ psImportedInsert.execute();
+ }
+
+ /** Process the newly imported data by calling the various stored procedures
+ * and then truncating the imported table. */
+ void processImported() throws SQLException {
+ this.connection.createStatement().execute("SELECT merge()");
+ this.connection.createStatement().execute("SELECT aggregate()");
+ this.connection.createStatement().execute("SELECT combine()");
+ this.connection.createStatement().execute("TRUNCATE imported");
+ }
+
+ /** Commit all changes made in this execution. */
+ void commit() throws SQLException {
+ this.connection.commit();
+ }
+
+ /** Query the estimated view. */
+ List<String[]> queryEstimated() throws SQLException {
+ List<String[]> statistics = new ArrayList<>();
+ String columns = "date, node, country, transport, version, frac, users";
+ statistics.add(columns.split(", "));
+ Statement st = this.connection.createStatement();
+ Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"),
+ Locale.US);
+ String queryString = "SELECT " + columns + " FROM estimated";
+ try (ResultSet rs = st.executeQuery(queryString)) {
+ while (rs.next()) {
+ String[] outputLine = new String[7];
+ outputLine[0] = rs.getDate("date", calendar).toLocalDate().toString();
+ outputLine[1] = rs.getString("node");
+ outputLine[2] = rs.getString("country");
+ outputLine[3] = rs.getString("transport");
+ outputLine[4] = rs.getString("version");
+ outputLine[5] = getIntFromResultSet(rs, "frac");
+ outputLine[6] = getIntFromResultSet(rs, "users");
+ statistics.add(outputLine);
+ }
+ }
+ return statistics;
+ }
+
+ /** Query the combined view. */
+ List<String[]> queryCombined() throws SQLException {
+ List<String[]> statistics = new ArrayList<>();
+ String columns = "date, node, country, transport, version, frac, low, high";
+ statistics.add(columns.split(", "));
+ Statement st = this.connection.createStatement();
+ Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"),
+ Locale.US);
+ String queryString = "SELECT " + columns + " FROM combined";
+ try (ResultSet rs = st.executeQuery(queryString)) {
+ while (rs.next()) {
+ String[] outputLine = new String[8];
+ outputLine[0] = rs.getDate("date", calendar).toLocalDate().toString();
+ outputLine[1] = rs.getString("node");
+ outputLine[2] = rs.getString("country");
+ outputLine[3] = rs.getString("transport");
+ outputLine[4] = rs.getString("version");
+ outputLine[5] = getIntFromResultSet(rs, "frac");
+ outputLine[6] = getIntFromResultSet(rs, "low");
+ outputLine[7] = getIntFromResultSet(rs, "high");
+ statistics.add(outputLine);
+ }
+ }
+ return statistics;
+ }
+
+ /** Retrieve the <code>int</code> value of the designated column in the
+ * current row of the given <code>ResultSet</code> object and format it as a
+ * <code>String</code> object, or return <code>null</code> if the retrieved
+ * value was <code>NULL</code>. */
+ private static String getIntFromResultSet(ResultSet rs, String columnLabel)
+ throws SQLException {
+ int result = rs.getInt(columnLabel);
+ return rs.wasNull() ? null : String.valueOf(result);
+ }
+
+ /** Release database connection. */
+ public void close() throws SQLException {
+ this.connection.close();
+ }
+}
+
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Main.java b/src/main/java/org/torproject/metrics/stats/clients/Main.java
index 3ccfe96..48d8d8d 100644
--- a/src/main/java/org/torproject/metrics/stats/clients/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/clients/Main.java
@@ -15,46 +15,50 @@ import org.torproject.descriptor.RelayNetworkStatusConsensus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.BufferedWriter;
import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.text.SimpleDateFormat;
-import java.util.HashMap;
+import java.nio.file.Paths;
+import java.sql.SQLException;
import java.util.Map;
import java.util.SortedMap;
-import java.util.TimeZone;
import java.util.TreeMap;
public class Main {
private static Logger log = LoggerFactory.getLogger(Main.class);
+ private static final String jdbcString
+ = System.getProperty("clients.database", "jdbc:postgresql:userstats");
+
+ private static Database database;
+
/** Executes this data-processing module. */
public static void main(String[] args) throws Exception {
- parseArgs(args);
+
+ log.info("Starting clients module.");
+
+ log.info("Connecting to database.");
+ database = new Database(jdbcString);
+
+ log.info("Reading relay descriptors and importing relevant parts into the "
+ + "database.");
parseRelayDescriptors();
+
+ log.info("Reading bridge descriptors and importing relevant parts into the "
+ + "database.");
parseBridgeDescriptors();
- closeOutputFiles();
- }
- private static boolean writeToSingleFile = true;
- private static boolean byStatsDateNotByDescHour = false;
-
- private static void parseArgs(String[] args) {
- if (args.length == 0) {
- writeToSingleFile = true;
- } else if (args.length == 1 && args[0].equals("--stats-date")) {
- writeToSingleFile = false;
- byStatsDateNotByDescHour = true;
- } else if (args.length == 1 && args[0].equals("--desc-hour")) {
- writeToSingleFile = false;
- byStatsDateNotByDescHour = false;
- } else {
- log.warn("Usage: java {} [ --stats-date | --desc-hour ]",
- Main.class.getName());
- System.exit(1);
- }
+ log.info("Processing newly imported data.");
+ database.processImported();
+ database.commit();
+
+ log.info("Querying aggregated statistics from the database.");
+ new Writer().write(Paths.get("stats", "userstats.csv"),
+ database.queryEstimated());
+ new Writer().write(Paths.get("stats", "userstats-combined.csv"),
+ database.queryCombined());
+
+ log.info("Disconnecting from database.");
+ database.close();
}
private static final long ONE_HOUR_MILLIS = 60L * 60L * 1000L;
@@ -80,11 +84,12 @@ public class Main {
(RelayNetworkStatusConsensus) descriptor);
}
}
+ database.commit();
descriptorReader.saveHistoryFile(historyFile);
}
private static void parseRelayExtraInfoDescriptor(
- ExtraInfoDescriptor descriptor) throws IOException {
+ ExtraInfoDescriptor descriptor) throws SQLException {
long publishedMillis = descriptor.getPublishedMillis();
String fingerprint = descriptor.getFingerprint()
.toUpperCase();
@@ -103,7 +108,7 @@ public class Main {
private static void parseRelayDirreqV3Reqs(String fingerprint,
long publishedMillis, long dirreqStatsEndMillis,
long dirreqStatsIntervalLengthMillis,
- SortedMap<String, Integer> requests) throws IOException {
+ SortedMap<String, Integer> requests) throws SQLException {
if (requests == null
|| publishedMillis - dirreqStatsEndMillis > ONE_WEEK_MILLIS
|| dirreqStatsIntervalLengthMillis != ONE_DAY_MILLIS) {
@@ -130,19 +135,17 @@ public class Main {
String country = e.getKey();
double reqs = ((double) e.getValue()) - 4.0;
sum += reqs;
- writeOutputLine(fingerprint, "relay", "responses", country,
- "", "", fromMillis, toMillis, reqs * intervalFraction,
- publishedMillis);
+ database.insertIntoImported(fingerprint, "relay", "responses", country,
+ "", "", fromMillis, toMillis, reqs * intervalFraction);
}
- writeOutputLine(fingerprint, "relay", "responses", "", "",
- "", fromMillis, toMillis, sum * intervalFraction,
- publishedMillis);
+ database.insertIntoImported(fingerprint, "relay", "responses", "", "",
+ "", fromMillis, toMillis, sum * intervalFraction);
}
}
private static void parseRelayDirreqWriteHistory(String fingerprint,
long publishedMillis, BandwidthHistory dirreqWriteHistory)
- throws IOException {
+ throws SQLException {
if (dirreqWriteHistory == null
|| publishedMillis - dirreqWriteHistory.getHistoryEndMillis()
> ONE_WEEK_MILLIS) {
@@ -177,14 +180,14 @@ public class Main {
} else if (i == 1) {
break;
}
- writeOutputLine(fingerprint, "relay", "bytes", "", "", "",
- fromMillis, toMillis, writtenBytes, publishedMillis);
+ database.insertIntoImported(fingerprint, "relay", "bytes", "", "", "",
+ fromMillis, toMillis, writtenBytes);
}
}
}
private static void parseRelayNetworkStatusConsensus(
- RelayNetworkStatusConsensus consensus) throws IOException {
+ RelayNetworkStatusConsensus consensus) throws SQLException {
long fromMillis = consensus.getValidAfterMillis();
long toMillis = consensus.getFreshUntilMillis();
for (NetworkStatusEntry statusEntry
@@ -192,8 +195,8 @@ public class Main {
String fingerprint = statusEntry.getFingerprint()
.toUpperCase();
if (statusEntry.getFlags().contains("Running")) {
- writeOutputLine(fingerprint, "relay", "status", "", "", "",
- fromMillis, toMillis, 0.0, fromMillis);
+ database.insertIntoImported(fingerprint, "relay", "status", "", "", "",
+ fromMillis, toMillis, 0.0);
}
}
}
@@ -213,11 +216,12 @@ public class Main {
parseBridgeNetworkStatus((BridgeNetworkStatus) descriptor);
}
}
+ database.commit();
descriptorReader.saveHistoryFile(historyFile);
}
private static void parseBridgeExtraInfoDescriptor(
- ExtraInfoDescriptor descriptor) throws IOException {
+ ExtraInfoDescriptor descriptor) throws SQLException {
String fingerprint = descriptor.getFingerprint().toUpperCase();
long publishedMillis = descriptor.getPublishedMillis();
long dirreqStatsEndMillis = descriptor.getDirreqStatsEndMillis();
@@ -240,7 +244,7 @@ public class Main {
SortedMap<String, Integer> responses,
SortedMap<String, Integer> bridgeIps,
SortedMap<String, Integer> bridgeIpTransports,
- SortedMap<String, Integer> bridgeIpVersions) throws IOException {
+ SortedMap<String, Integer> bridgeIpVersions) throws SQLException {
if (responses == null
|| publishedMillis - dirreqStatsEndMillis > ONE_WEEK_MILLIS
|| dirreqStatsIntervalLengthMillis != ONE_DAY_MILLIS) {
@@ -264,18 +268,15 @@ public class Main {
}
double intervalFraction = ((double) (toMillis - fromMillis))
/ ((double) dirreqStatsIntervalLengthMillis);
- writeOutputLine(fingerprint, "bridge", "responses", "", "",
- "", fromMillis, toMillis, resp * intervalFraction,
- publishedMillis);
+ database.insertIntoImported(fingerprint, "bridge", "responses", "", "",
+ "", fromMillis, toMillis, resp * intervalFraction);
parseBridgeRespByCategory(fingerprint, fromMillis, toMillis, resp,
- dirreqStatsIntervalLengthMillis, "country", bridgeIps,
- publishedMillis);
+ dirreqStatsIntervalLengthMillis, "country", bridgeIps);
parseBridgeRespByCategory(fingerprint, fromMillis, toMillis, resp,
dirreqStatsIntervalLengthMillis, "transport",
- bridgeIpTransports, publishedMillis);
+ bridgeIpTransports);
parseBridgeRespByCategory(fingerprint, fromMillis, toMillis, resp,
- dirreqStatsIntervalLengthMillis, "version", bridgeIpVersions,
- publishedMillis);
+ dirreqStatsIntervalLengthMillis, "version", bridgeIpVersions);
}
}
}
@@ -283,8 +284,8 @@ public class Main {
private static void parseBridgeRespByCategory(String fingerprint,
long fromMillis, long toMillis, double resp,
long dirreqStatsIntervalLengthMillis, String category,
- SortedMap<String, Integer> frequencies, long publishedMillis)
- throws IOException {
+ SortedMap<String, Integer> frequencies)
+ throws SQLException {
double total = 0.0;
SortedMap<String, Double> frequenciesCopy = new TreeMap<>();
if (frequencies != null) {
@@ -322,16 +323,16 @@ public class Main {
double val = resp * intervalFraction * e.getValue() / total;
switch (category) {
case "country":
- writeOutputLine(fingerprint, "bridge", "responses", e.getKey(),
- "", "", fromMillis, toMillis, val, publishedMillis);
+ database.insertIntoImported(fingerprint, "bridge", "responses",
+ e.getKey(), "", "", fromMillis, toMillis, val);
break;
case "transport":
- writeOutputLine(fingerprint, "bridge", "responses", "",
- e.getKey(), "", fromMillis, toMillis, val, publishedMillis);
+ database.insertIntoImported(fingerprint, "bridge", "responses", "",
+ e.getKey(), "", fromMillis, toMillis, val);
break;
case "version":
- writeOutputLine(fingerprint, "bridge", "responses", "", "",
- e.getKey(), fromMillis, toMillis, val, publishedMillis);
+ database.insertIntoImported(fingerprint, "bridge", "responses", "",
+ "", e.getKey(), fromMillis, toMillis, val);
break;
default:
/* Ignore any other categories. */
@@ -341,7 +342,7 @@ public class Main {
private static void parseBridgeDirreqWriteHistory(String fingerprint,
long publishedMillis, BandwidthHistory dirreqWriteHistory)
- throws IOException {
+ throws SQLException {
if (dirreqWriteHistory == null
|| publishedMillis - dirreqWriteHistory.getHistoryEndMillis()
> ONE_WEEK_MILLIS) {
@@ -376,14 +377,14 @@ public class Main {
} else if (i == 1) {
break;
}
- writeOutputLine(fingerprint, "bridge", "bytes", "",
- "", "", fromMillis, toMillis, writtenBytes, publishedMillis);
+ database.insertIntoImported(fingerprint, "bridge", "bytes", "",
+ "", "", fromMillis, toMillis, writtenBytes);
}
}
}
private static void parseBridgeNetworkStatus(BridgeNetworkStatus status)
- throws IOException {
+ throws SQLException {
long publishedMillis = status.getPublishedMillis();
long fromMillis = (publishedMillis / ONE_HOUR_MILLIS)
* ONE_HOUR_MILLIS;
@@ -393,87 +394,10 @@ public class Main {
String fingerprint = statusEntry.getFingerprint()
.toUpperCase();
if (statusEntry.getFlags().contains("Running")) {
- writeOutputLine(fingerprint, "bridge", "status", "", "", "",
- fromMillis, toMillis, 0.0, publishedMillis);
+ database.insertIntoImported(fingerprint, "bridge", "status", "", "", "",
+ fromMillis, toMillis, 0.0);
}
}
}
-
- private static Map<String, BufferedWriter> openOutputFiles = new HashMap<>();
-
- private static void writeOutputLine(String fingerprint, String node,
- String metric, String country, String transport, String version,
- long fromMillis, long toMillis, double val, long publishedMillis)
- throws IOException {
- if (fromMillis > toMillis) {
- return;
- }
- String fromDateTime = formatDateTimeMillis(fromMillis);
- String toDateTime = formatDateTimeMillis(toMillis);
- BufferedWriter bw = getOutputFile(fromDateTime, publishedMillis);
- bw.write(String.format("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.1f\n",
- fingerprint, node, metric, country, transport, version,
- fromDateTime, toDateTime, val));
- }
-
- private static SimpleDateFormat dateTimeFormat = null;
-
- private static String formatDateTimeMillis(long millis) {
- if (dateTimeFormat == null) {
- dateTimeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- dateTimeFormat.setLenient(false);
- dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
- }
- return dateTimeFormat.format(millis);
- }
-
- private static BufferedWriter getOutputFile(String fromDateTime,
- long publishedMillis) throws IOException {
- String outputFileName;
- if (writeToSingleFile) {
- outputFileName = "out/userstats.sql";
- } else if (byStatsDateNotByDescHour) {
- outputFileName = "out/userstats-" + fromDateTime.substring(0, 10)
- + ".sql";
- } else {
- String publishedHourDateTime = formatDateTimeMillis(
- (publishedMillis / ONE_HOUR_MILLIS) * ONE_HOUR_MILLIS);
- outputFileName = "out/userstats-"
- + publishedHourDateTime.substring(0, 10) + "-"
- + publishedHourDateTime.substring(11, 13) + ".sql";
- }
- BufferedWriter bw = openOutputFiles.get(outputFileName);
- if (bw == null) {
- bw = openOutputFile(outputFileName);
- openOutputFiles.put(outputFileName, bw);
- }
- return bw;
- }
-
- private static BufferedWriter openOutputFile(String outputFileName)
- throws IOException {
- File outputFile = new File(outputFileName);
- outputFile.getParentFile().mkdirs();
- BufferedWriter bw = new BufferedWriter(new FileWriter(
- outputFileName));
- bw.write("BEGIN;\n");
- bw.write("LOCK TABLE imported NOWAIT;\n");
- bw.write("COPY imported (fingerprint, node, metric, country, "
- + "transport, version, stats_start, stats_end, val) FROM "
- + "stdin;\n");
- return bw;
- }
-
- private static void closeOutputFiles() throws IOException {
- for (BufferedWriter bw : openOutputFiles.values()) {
- bw.write("\\.\n");
- bw.write("SELECT merge();\n");
- bw.write("SELECT aggregate();\n");
- bw.write("SELECT combine();\n");
- bw.write("TRUNCATE imported;\n");
- bw.write("COMMIT;\n");
- bw.close();
- }
- }
}
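The Database.insertIntoImported() method that replaces writeOutputLine() above is not part of this excerpt. As a rough sketch only, assuming a plain JDBC connection and an imported table with the same columns as the removed COPY statement, such a method could wrap a prepared INSERT:

    /* Sketch only; the actual Database class is not shown in this diff.
     * Assumes a JDBC connection and the imported table from the removed
     * COPY statement. */
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.SQLException;
    import java.sql.Timestamp;

    class Database implements AutoCloseable {

      private final Connection connection;

      private final PreparedStatement psImported;

      Database(Connection connection) throws SQLException {
        this.connection = connection;
        this.psImported = connection.prepareStatement(
            "INSERT INTO imported (fingerprint, node, metric, country, "
            + "transport, version, stats_start, stats_end, val) "
            + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)");
      }

      void insertIntoImported(String fingerprint, String node, String metric,
          String country, String transport, String version, long fromMillis,
          long toMillis, double val) throws SQLException {
        /* Preserve the guard from the removed writeOutputLine() method. */
        if (fromMillis > toMillis) {
          return;
        }
        this.psImported.setString(1, fingerprint);
        this.psImported.setString(2, node);
        this.psImported.setString(3, metric);
        this.psImported.setString(4, country);
        this.psImported.setString(5, transport);
        this.psImported.setString(6, version);
        this.psImported.setTimestamp(7, new Timestamp(fromMillis));
        this.psImported.setTimestamp(8, new Timestamp(toMillis));
        this.psImported.setDouble(9, val);
        this.psImported.executeUpdate();
      }

      @Override
      public void close() throws SQLException {
        this.connection.close();
      }
    }

With inserts going straight over JDBC, the per-day out/userstats-*.sql files and the trailing merge()/aggregate()/combine() statements no longer need to be produced by this class.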
diff --git a/src/main/java/org/torproject/metrics/stats/clients/Writer.java b/src/main/java/org/torproject/metrics/stats/clients/Writer.java
new file mode 100644
index 0000000..ed10bf1
--- /dev/null
+++ b/src/main/java/org/torproject/metrics/stats/clients/Writer.java
@@ -0,0 +1,42 @@
+/* Copyright 2017--2018 The Tor Project
+ * See LICENSE for licensing information */
+
+package org.torproject.metrics.stats.clients;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+/** Writer that takes output line objects and writes them to a file, preceded
+ * by a column header line. */
+class Writer {
+
+ /** Write output lines to the given file. */
+ void write(Path filePath, Iterable<String[]> outputLines)
+ throws IOException {
+ File parentFile = filePath.toFile().getParentFile();
+ if (null != parentFile && !parentFile.exists()) {
+ if (!parentFile.mkdirs()) {
+ throw new IOException("Unable to create parent directory of output "
+ + "file. Not writing this file.");
+ }
+ }
+ List<String> formattedOutputLines = new ArrayList<>();
+ for (String[] outputLine : outputLines) {
+ StringBuilder formattedOutputLine = new StringBuilder();
+ for (String outputLinePart : outputLine) {
+ formattedOutputLine.append(',');
+ if (null != outputLinePart) {
+ formattedOutputLine.append(outputLinePart);
+ }
+ }
+ formattedOutputLines.add(formattedOutputLine.substring(1));
+ }
+ Files.write(filePath, formattedOutputLines, StandardCharsets.UTF_8);
+ }
+}
+
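For illustration, a hypothetical caller of this class could look like the following; the file name and rows are invented, the class is package-private so a real caller would live in the same package, and the first String[] serves as the column header line mentioned in the class comment:

    package org.torproject.metrics.stats.clients;

    import java.io.IOException;
    import java.nio.file.Paths;
    import java.util.Arrays;
    import java.util.List;

    public class WriterExample {
      public static void main(String[] args) throws IOException {
        /* Each String[] becomes one comma-separated line; null parts are
         * written as empty fields. */
        List<String[]> lines = Arrays.asList(
            new String[] {"date", "country", "users"},
            new String[] {"2019-01-01", "us", "421"},
            new String[] {"2019-01-01", null, "98432"});
        new Writer().write(Paths.get("out/example.csv"), lines);
      }
    }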
09 Nov '19
commit ffaab885748c5340b01ad87ddb88819cd779c2b0
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Nov 29 11:53:41 2018 +0100
Document changes to the totalcw graph.
Still part of #28137, #28328, and #28352.
---
.../resources/web/jsps/reproducible-metrics.jsp | 22 ++++++++++++++++++----
src/main/resources/web/jsps/stats.jsp | 6 +++---
2 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/src/main/resources/web/jsps/reproducible-metrics.jsp b/src/main/resources/web/jsps/reproducible-metrics.jsp
index b6df6c3..24bdba0 100644
--- a/src/main/resources/web/jsps/reproducible-metrics.jsp
+++ b/src/main/resources/web/jsps/reproducible-metrics.jsp
@@ -380,7 +380,18 @@ The goal is to avoid over-representing a few statuses during periods when the br
<li>Total consensus weights across bandwidth authorities <a href="/totalcw.html" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> graph</a></li>
</ul>
-<h4>Step 1: Parse votes.</h4>
+<h4>Step 1: Parse consensuses.</h4>
+
+<p>Obtain consensuses from <a href="/collector.html#type-network-status-consensus-3">CollecTor</a>.
+Refer to the <a href="https://gitweb.torproject.org/torspec.git/tree/dir-spec.txt">Tor directory protocol, version 3</a> for details on the descriptor format.</p>
+
+<p>Parse and memorize the <code>"valid-after"</code> time from the consensus header. We use this UTC timestamp to aggregate by the UTC date.</p>
+
+<p>Parse the <code>"s"</code> lines of all status entries and skip entries without the <code>"Running"</code> flag. Optionally distinguish relays by assigned <code>"Guard"</code> and <code>"Exit"</code> flags.</p>
+
+<p>Parse the (optional) <code>"w"</code> lines of all status entries and compute the total of all bandwidth values denoted by the <code>"Bandwidth="</code> keyword. If an entry does not contain such a value, skip the entry. If a consensus does not contain a single bandwidth value, skip the consensus.</p>
+
+<h4>Step 2: Parse votes.</h4>
<p>Obtain votes from <a href="/collector.html#type-network-status-vote-3">CollecTor</a>.
Refer to the <a href="https://gitweb.torproject.org/torspec.git/tree/dir-spec.txt">Tor directory protocol, version 3</a> for details on the descriptor format.</p>
@@ -389,11 +400,14 @@ Refer to the <a href="https://gitweb.torproject.org/torspec.git/tree/dir-spec.tx
<p>Also parse the <code>"nickname"</code> and <code>"identity"</code> fields from the <code>"dir-source"</code> line. We use the identity to aggregate by authority and the nickname for display purposes.</p>
-<p>Parse the (optional) <code>"w"</code> lines of all status entries and compute the total of all measured bandwidth values denoted by the <code>"Measured="</code> keyword. If an entry does not contain such a value, skip the entry. If a vote does not contain a single measured bandwidth value, skip the vote.</code>
+<p>Parse the <code>"s"</code> lines of all status entries and skip entries without the <code>"Running"</code> flag. Optionally distinguish relays by assigned <code>"Guard"</code> and <code>"Exit"</code> flags.</p>
-<h4>Step 2: Compute daily averages</h4>
+<p>Parse the (optional) <code>"w"</code> lines of all status entries and compute the total of all measured bandwidth values denoted by the <code>"Measured="</code> keyword. If an entry does not contain such a value, skip the entry. If a vote does not contain a single measured bandwidth value, skip the vote.</p>
+
+<h4>Step 3: Compute daily averages</h4>
-<p>Go through all previously processed votes by valid-after UTC date and authority.
+<p>Go through all previously processed consensuses and votes by valid-after UTC date and authority.
+If there are less than 12 consensuses known for a given UTC date, skip consensuses from this date.
If an authority published less than 12 votes on a given UTC date, skip this date and authority.
Also skip the last date of the results, because those averages may still change throughout the day.
For all remaining combinations of date and authority, compute the arithmetic mean of total measured bandwidth, rounded down to the next-smaller integer number.</p>
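As an illustration of step 3, and not code taken from the module, the averaging rule could be sketched as follows. Here totalsByDate is assumed to map a UTC date to the bandwidth totals computed per consensus or per vote in the earlier steps, keyed only by date for brevity; the vote case additionally groups by authority:

    import java.util.List;
    import java.util.Map;
    import java.util.SortedMap;
    import java.util.TreeMap;

    class DailyAverages {

      /* Sketch with invented names: compute one average per UTC date. */
      static SortedMap<String, Long> dailyMeans(
          SortedMap<String, List<Long>> totalsByDate) {
        SortedMap<String, Long> means = new TreeMap<>();
        for (Map.Entry<String, List<Long>> e : totalsByDate.entrySet()) {
          /* Skip dates with fewer than 12 statuses. */
          if (e.getValue().size() < 12) {
            continue;
          }
          long sum = 0L;
          for (long total : e.getValue()) {
            sum += total;
          }
          /* Arithmetic mean, rounded down to the next-smaller integer. */
          means.put(e.getKey(), sum / e.getValue().size());
        }
        /* Skip the last date, whose average may still change today. */
        if (!means.isEmpty()) {
          means.remove(means.lastKey());
        }
        return means;
      }
    }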
diff --git a/src/main/resources/web/jsps/stats.jsp b/src/main/resources/web/jsps/stats.jsp
index 002a3af..e5f9c6a 100644
--- a/src/main/resources/web/jsps/stats.jsp
+++ b/src/main/resources/web/jsps/stats.jsp
@@ -321,9 +321,9 @@ Servers <a href="#servers" name="servers" class="anchor">#</a></h2>
<h4>Columns</h4>
<ul>
-<li><b>date:</b> UTC date (YYYY-MM-DD) when bridges have been listed as running.</li>
-<li><b>nickname:</b> Bandwidth authority nickname.</li>
-<li><b>totalcw:</b> Total consensus weight of all relays measured by the bandwidth authority.</li>
+<li><b>date:</b> UTC date (YYYY-MM-DD) when relays have been listed as running.</li>
+<li><b>nickname:</b> Bandwidth authority nickname, or the empty string in case of the consensus.</li>
+<li><b>totalcw:</b> Total consensus weight of all running relays measured by the bandwidth authority or contained in the consensus.</li>
</ul>
</div>
09 Nov '19
commit 8c24f8e174c277e1c25222bbd598bb2278289e00
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Jan 7 12:20:35 2019 +0100
Stop calling censorship detector BETA.
---
src/main/resources/web/jsps/graph.jsp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/src/main/resources/web/jsps/graph.jsp b/src/main/resources/web/jsps/graph.jsp
index e710d2c..baa7edb 100644
--- a/src/main/resources/web/jsps/graph.jsp
+++ b/src/main/resources/web/jsps/graph.jsp
@@ -93,8 +93,7 @@
</c:if>
<c:if test="${fn:length(events) > 0}">
<p>
- <label for="events"><b>Show possible censorship events if available (<a
- href="http://research.torproject.org/techreports/detector-2011-09-09.pdf">BETA</a>):</b></label>
+ <label for="events"><b>Show possible <a href="http://research.torproject.org/techreports/detector-2011-09-09.pdf">censorship events</a> if available:</b></label>
<select name="events" id="events">
<c:forEach var="row" items="${events}">
<option value="${row[0]}"${row[1]}>${row[2]}</option>
[metrics-web/release] Simplify plot_webstats_tb_locale function.
by karsten@torproject.org 09 Nov '19
09 Nov '19
commit 2b34cd2023a3e59057f4274afb0d7b8163282a18
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Jan 10 10:41:48 2019 +0100
Simplify plot_webstats_tb_locale function.
---
src/main/R/rserver/graphs.R | 61 ++++++++++++++++++++-------------------------
1 file changed, 27 insertions(+), 34 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index ba8862c..27f399d 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -1265,8 +1265,8 @@ write_webstats_tb_platform <- function(start_p = NULL, end_p = NULL, path_p) {
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
-plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
- d <- read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
+prepare_webstats_tb_locale <- function(start_p, end_p) {
+ read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
col_types = cols(
log_date = col_date(format = ""),
request_type = col_factor(),
@@ -1274,20 +1274,35 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
channel = col_skip(),
locale = col_factor(),
incremental = col_skip(),
- count = col_double()))
- d <- d[d$log_date >= start_p & d$log_date <= end_p &
- d$request_type %in% c("tbid", "tbup"), ]
- levels(d$request_type) <- list(
- "Initial downloads" = "tbid",
- "Update pings" = "tbup")
+ count = col_double())) %>%
+ filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
+ filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
+ filter(request_type %in% c("tbid", "tbup")) %>%
+ rename(date = log_date) %>%
+ group_by(date, locale, request_type) %>%
+ summarize(count = sum(count)) %>%
+ mutate(request_type = factor(request_type, levels = c("tbid", "tbup"))) %>%
+ spread(request_type, count, fill = 0) %>%
+ rename(initial_downloads = tbid, update_pings = tbup)
+}
+
+plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
+ d <- prepare_webstats_tb_locale(start_p, end_p) %>%
+ gather(request_type, count, -c(date, locale)) %>%
+ mutate(request_type = factor(request_type,
+ levels = c("initial_downloads", "update_pings"),
+ labels = c("Initial downloads", "Update pings")))
e <- d
e <- aggregate(list(count = e$count), by = list(locale = e$locale), FUN = sum)
e <- e[order(e$count, decreasing = TRUE), ]
e <- e[1:5, ]
- d <- aggregate(list(count = d$count), by = list(log_date = d$log_date,
+ d <- aggregate(list(count = d$count), by = list(date = d$date,
request_type = d$request_type,
locale = ifelse(d$locale %in% e$locale, d$locale, "(other)")), FUN = sum)
- ggplot(d, aes(x = log_date, y = count, colour = locale)) +
+ d %>%
+ complete(date = full_seq(date, period = 1),
+ nesting(locale, request_type)) %>%
+ ggplot(aes(x = date, y = count, colour = locale)) +
geom_point() +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
@@ -1295,7 +1310,7 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
scale_y_continuous(name = "", labels = formatter, limits = c(0, NA)) +
scale_colour_hue(name = "Locale",
breaks = c(e$locale, "(other)"),
- labels = c(e$locale, "Other")) +
+ labels = c(as.character(e$locale), "Other")) +
facet_grid(request_type ~ ., scales = "free_y") +
theme(strip.text.y = element_text(angle = 0, hjust = 0, size = rel(1.5)),
strip.background = element_rect(fill = NA),
@@ -1305,30 +1320,8 @@ plot_webstats_tb_locale <- function(start_p, end_p, path_p) {
ggsave(filename = path_p, width = 8, height = 5, dpi = 150)
}
-# Ideally, this function would share code with plot_webstats_tb_locale
-# by using a common prepare_webstats_tb_locale function. This just
-# turned out to be a bit harder than for other functions, because
-# plot_webstats_tb_locale needs the preliminary data frame e for its
-# breaks and labels. Left as future work.
write_webstats_tb_locale <- function(start_p = NULL, end_p = NULL, path_p) {
- read_csv(file = paste(stats_dir, "webstats.csv", sep = ""),
- col_types = cols(
- log_date = col_date(format = ""),
- request_type = col_factor(),
- platform = col_skip(),
- channel = col_skip(),
- locale = col_factor(),
- incremental = col_skip(),
- count = col_double())) %>%
- filter(if (!is.null(start_p)) log_date >= as.Date(start_p) else TRUE) %>%
- filter(if (!is.null(end_p)) log_date <= as.Date(end_p) else TRUE) %>%
- filter(request_type %in% c("tbid", "tbup")) %>%
- rename(date = log_date) %>%
- group_by(date, locale, request_type) %>%
- summarize(count = sum(count)) %>%
- mutate(request_type = factor(request_type, levels = c("tbid", "tbup"))) %>%
- spread(request_type, count, fill = 0) %>%
- rename(initial_downloads = tbid, update_pings = tbup) %>%
+ prepare_webstats_tb_locale(start_p, end_p) %>%
write.csv(path_p, quote = FALSE, row.names = FALSE, na = "")
}
09 Nov '19
commit a94a3844644041f7c1f6e0a4451e19ce12cae9e8
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Thu Jan 10 22:32:28 2019 +0100
Switch to readr's read_csv() everywhere.
---
src/main/R/rserver/graphs.R | 230 +++++++++++++++++++++++++++++++++-----------
1 file changed, 175 insertions(+), 55 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 82a51e7..205afbe 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -359,8 +359,11 @@ write_data <- function(FUN, ..., path_p) {
options(readr.show_progress = FALSE)
prepare_networksize <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "networksize.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "networksize.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ relays = col_double(),
+ bridges = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
}
@@ -416,8 +419,11 @@ plot_versions <- function(start_p, end_p, path_p) {
}
prepare_platforms <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "platforms.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "platforms.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ platform = col_factor(levels = NULL),
+ relays = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
mutate(platform = tolower(platform)) %>%
@@ -443,12 +449,19 @@ plot_platforms <- function(start_p, end_p, path_p) {
}
prepare_dirbytes <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "bandwidth.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ isexit = col_logical(),
+ isguard = col_logical(),
+ bwread = col_skip(),
+ bwwrite = col_skip(),
+ dirread = col_double(),
+ dirwrite = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(isexit == "") %>%
- filter(isguard == "") %>%
+ filter(is.na(isexit)) %>%
+ filter(is.na(isguard)) %>%
mutate(dirread = dirread * 8 / 1e9,
dirwrite = dirwrite * 8 / 1e9) %>%
select(date, dirread, dirwrite)
@@ -473,8 +486,11 @@ plot_dirbytes <- function(start_p, end_p, path_p) {
}
prepare_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL) {
- read.csv(paste(stats_dir, "relayflags.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "relayflags.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ flag = col_factor(levels = NULL),
+ relays = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(flag_p)) flag %in% flag_p else TRUE)
@@ -483,7 +499,7 @@ prepare_relayflags <- function(start_p = NULL, end_p = NULL, flag_p = NULL) {
plot_relayflags <- function(start_p, end_p, flag_p, path_p) {
prepare_relayflags(start_p, end_p, flag_p) %>%
complete(date = full_seq(date, period = 1), flag = unique(flag)) %>%
- ggplot(aes(x = date, y = relays, colour = as.factor(flag))) +
+ ggplot(aes(x = date, y = relays, colour = flag)) +
geom_line() +
scale_x_date(name = "", breaks = custom_breaks,
labels = custom_labels, minor_breaks = custom_minor_breaks) +
@@ -498,8 +514,18 @@ plot_relayflags <- function(start_p, end_p, flag_p, path_p) {
prepare_torperf <- function(start_p = NULL, end_p = NULL, server_p = NULL,
filesize_p = NULL) {
- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
- colClasses = c("date" = "Date", "source" = "character")) %>%
+ read_csv(file = paste(stats_dir, "torperf-1.1.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ filesize = col_double(),
+ source = col_character(),
+ server = col_character(),
+ q1 = col_double(),
+ md = col_double(),
+ q3 = col_double(),
+ timeouts = col_skip(),
+ failures = col_skip(),
+ requests = col_skip())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(server_p)) server == server_p else TRUE) %>%
@@ -535,8 +561,18 @@ plot_torperf <- function(start_p, end_p, server_p, filesize_p, path_p) {
prepare_torperf_failures <- function(start_p = NULL, end_p = NULL,
server_p = NULL, filesize_p = NULL) {
- read.csv(paste(stats_dir, "torperf-1.1.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "torperf-1.1.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ filesize = col_double(),
+ source = col_character(),
+ server = col_character(),
+ q1 = col_skip(),
+ md = col_skip(),
+ q3 = col_skip(),
+ timeouts = col_double(),
+ failures = col_double(),
+ requests = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(filesize_p))
@@ -573,8 +609,14 @@ plot_torperf_failures <- function(start_p, end_p, server_p, filesize_p,
}
prepare_onionperf_buildtimes <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "buildtimes.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "buildtimes.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ source = col_character(),
+ position = col_double(),
+ q1 = col_double(),
+ md = col_double(),
+ q3 = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE)
}
@@ -604,8 +646,14 @@ plot_onionperf_buildtimes <- function(start_p, end_p, path_p) {
prepare_onionperf_latencies <- function(start_p = NULL, end_p = NULL,
server_p = NULL) {
- read.csv(paste(stats_dir, "latencies.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "latencies.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ source = col_character(),
+ server = col_character(),
+ q1 = col_double(),
+ md = col_double(),
+ q3 = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(server_p)) server == server_p else TRUE)
@@ -631,8 +679,12 @@ plot_onionperf_latencies <- function(start_p, end_p, server_p, path_p) {
}
prepare_connbidirect <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "connbidirect2.csv", sep = ""),
- colClasses = c("date" = "Date", "direction" = "factor")) %>%
+ read_csv(file = paste(stats_dir, "connbidirect2.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ direction = col_factor(),
+ quantile = col_double(),
+ fraction = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
mutate(quantile = paste("X", quantile, sep = ""),
@@ -665,19 +717,30 @@ plot_connbidirect <- function(start_p, end_p, path_p) {
}
prepare_bandwidth_flags <- function(start_p = NULL, end_p = NULL) {
- advbw <- read.csv(paste(stats_dir, "advbw.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ advbw <- read_csv(file = paste(stats_dir, "advbw.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ isexit = col_logical(),
+ isguard = col_logical(),
+ advbw = col_double())) %>%
transmute(date, have_guard_flag = isguard, have_exit_flag = isexit,
variable = "advbw", value = advbw * 8 / 1e9)
- bwhist <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ bwhist <- read_csv(file = paste(stats_dir, "bandwidth.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ isexit = col_logical(),
+ isguard = col_logical(),
+ bwread = col_double(),
+ bwwrite = col_double(),
+ dirread = col_double(),
+ dirwrite = col_double())) %>%
transmute(date, have_guard_flag = isguard, have_exit_flag = isexit,
variable = "bwhist", value = (bwread + bwwrite) * 8 / 2e9)
rbind(advbw, bwhist) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(have_exit_flag != "") %>%
- filter(have_guard_flag != "") %>%
+ filter(!is.na(have_exit_flag)) %>%
+ filter(!is.na(have_guard_flag)) %>%
spread(variable, value)
}
@@ -685,7 +748,8 @@ plot_bandwidth_flags <- function(start_p, end_p, path_p) {
prepare_bandwidth_flags(start_p, end_p) %>%
gather(variable, value, c(advbw, bwhist)) %>%
unite(flags, have_guard_flag, have_exit_flag) %>%
- mutate(flags = factor(flags, levels = c("f_t", "t_t", "t_f", "f_f"),
+ mutate(flags = factor(flags,
+ levels = c("FALSE_TRUE", "TRUE_TRUE", "TRUE_FALSE", "FALSE_FALSE"),
labels = c("Exit only", "Guard and Exit", "Guard only",
"Neither Guard nor Exit"))) %>%
mutate(variable = ifelse(variable == "advbw",
@@ -968,14 +1032,19 @@ plot_userstats_bridge_combined <- function(start_p, end_p, country_p, path_p) {
}
prepare_advbwdist_perc <- function(start_p = NULL, end_p = NULL, p_p = NULL) {
- read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "advbwdist.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ isexit = col_logical(),
+ relay = col_skip(),
+ percentile = col_integer(),
+ advbw = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(p_p)) percentile %in% as.numeric(p_p) else
percentile != "") %>%
transmute(date, percentile = as.factor(percentile),
- variable = ifelse(isexit == "t", "exits", "all"),
+ variable = ifelse(is.na(isexit), "all", "exits"),
advbw = advbw * 8 / 1e9) %>%
spread(variable, advbw) %>%
rename(p = percentile)
@@ -1000,14 +1069,19 @@ plot_advbwdist_perc <- function(start_p, end_p, p_p, path_p) {
}
prepare_advbwdist_relay <- function(start_p = NULL, end_p = NULL, n_p = NULL) {
- read.csv(paste(stats_dir, "advbwdist.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "advbwdist.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ isexit = col_logical(),
+ relay = col_integer(),
+ percentile = col_skip(),
+ advbw = col_double())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(if (!is.null(n_p)) relay %in% as.numeric(n_p) else
relay != "") %>%
transmute(date, relay = as.factor(relay),
- variable = ifelse(isexit != "t", "all", "exits"),
+ variable = ifelse(is.na(isexit), "all", "exits"),
advbw = advbw * 8 / 1e9) %>%
spread(variable, advbw) %>%
rename(n = relay)
@@ -1032,8 +1106,15 @@ plot_advbwdist_relay <- function(start_p, end_p, n_p, path_p) {
}
prepare_hidserv_dir_onions_seen <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "hidserv.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ type = col_factor(),
+ wmean = col_skip(),
+ wmedian = col_skip(),
+ wiqm = col_double(),
+ frac = col_double(),
+ stats = col_skip())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(type == "dir-onions-seen") %>%
@@ -1053,8 +1134,15 @@ plot_hidserv_dir_onions_seen <- function(start_p, end_p, path_p) {
}
prepare_hidserv_rend_relayed_cells <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "hidserv.csv", sep = ""),
- colClasses = c("date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "hidserv.csv", sep = ""),
+ col_types = cols(
+ date = col_date(format = ""),
+ type = col_factor(),
+ wmean = col_skip(),
+ wmedian = col_skip(),
+ wiqm = col_double(),
+ frac = col_double(),
+ stats = col_skip())) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
filter(type == "rend-relayed-cells") %>%
@@ -1257,8 +1345,17 @@ plot_webstats_tm <- function(start_p, end_p, path_p) {
}
prepare_relays_ipv6 <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
- colClasses = c("valid_after_date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "ipv6servers.csv", sep = ""),
+ col_types = cols(
+ valid_after_date = col_date(format = ""),
+ server = col_factor(),
+ guard_relay = col_skip(),
+ exit_relay = col_skip(),
+ announced_ipv6 = col_logical(),
+ exiting_ipv6_relay = col_logical(),
+ reachable_ipv6_relay = col_logical(),
+ server_count_sum_avg = col_double(),
+ advertised_bandwidth_bytes_sum_avg = col_skip())) %>%
filter(if (!is.null(start_p))
valid_after_date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p))
@@ -1266,9 +1363,9 @@ prepare_relays_ipv6 <- function(start_p = NULL, end_p = NULL) {
filter(server == "relay") %>%
group_by(valid_after_date) %>%
summarize(total = sum(server_count_sum_avg),
- announced = sum(server_count_sum_avg[announced_ipv6 == "t"]),
- reachable = sum(server_count_sum_avg[reachable_ipv6_relay == "t"]),
- exiting = sum(server_count_sum_avg[exiting_ipv6_relay == "t"])) %>%
+ announced = sum(server_count_sum_avg[announced_ipv6]),
+ reachable = sum(server_count_sum_avg[reachable_ipv6_relay]),
+ exiting = sum(server_count_sum_avg[exiting_ipv6_relay])) %>%
complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
gather(total, announced, reachable, exiting, key = "category",
value = "count") %>%
@@ -1295,8 +1392,17 @@ plot_relays_ipv6 <- function(start_p, end_p, path_p) {
}
prepare_bridges_ipv6 <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
- colClasses = c("valid_after_date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "ipv6servers.csv", sep = ""),
+ col_types = cols(
+ valid_after_date = col_date(format = ""),
+ server = col_factor(),
+ guard_relay = col_skip(),
+ exit_relay = col_skip(),
+ announced_ipv6 = col_logical(),
+ exiting_ipv6_relay = col_skip(),
+ reachable_ipv6_relay = col_skip(),
+ server_count_sum_avg = col_double(),
+ advertised_bandwidth_bytes_sum_avg = col_skip())) %>%
filter(if (!is.null(start_p))
valid_after_date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p))
@@ -1304,7 +1410,7 @@ prepare_bridges_ipv6 <- function(start_p = NULL, end_p = NULL) {
filter(server == "bridge") %>%
group_by(valid_after_date) %>%
summarize(total = sum(server_count_sum_avg),
- announced = sum(server_count_sum_avg[announced_ipv6 == "t"])) %>%
+ announced = sum(server_count_sum_avg[announced_ipv6])) %>%
complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
rename(date = valid_after_date)
}
@@ -1327,8 +1433,17 @@ plot_bridges_ipv6 <- function(start_p, end_p, path_p) {
}
prepare_advbw_ipv6 <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "ipv6servers.csv", sep = ""),
- colClasses = c("valid_after_date" = "Date")) %>%
+ read_csv(file = paste(stats_dir, "ipv6servers.csv", sep = ""),
+ col_types = cols(
+ valid_after_date = col_date(format = ""),
+ server = col_factor(),
+ guard_relay = col_logical(),
+ exit_relay = col_logical(),
+ announced_ipv6 = col_logical(),
+ exiting_ipv6_relay = col_logical(),
+ reachable_ipv6_relay = col_logical(),
+ server_count_sum_avg = col_skip(),
+ advertised_bandwidth_bytes_sum_avg = col_double())) %>%
filter(if (!is.null(start_p))
valid_after_date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p))
@@ -1338,14 +1453,14 @@ prepare_advbw_ipv6 <- function(start_p = NULL, end_p = NULL) {
advertised_bandwidth_bytes_sum_avg * 8 / 1e9) %>%
group_by(valid_after_date) %>%
summarize(total = sum(advertised_bandwidth_bytes_sum_avg),
- total_guard = sum(advertised_bandwidth_bytes_sum_avg[guard_relay != "f"]),
- total_exit = sum(advertised_bandwidth_bytes_sum_avg[exit_relay != "f"]),
+ total_guard = sum(advertised_bandwidth_bytes_sum_avg[guard_relay]),
+ total_exit = sum(advertised_bandwidth_bytes_sum_avg[exit_relay]),
reachable_guard = sum(advertised_bandwidth_bytes_sum_avg[
- reachable_ipv6_relay != "f" & guard_relay != "f"]),
+ reachable_ipv6_relay & guard_relay]),
reachable_exit = sum(advertised_bandwidth_bytes_sum_avg[
- reachable_ipv6_relay != "f" & exit_relay != "f"]),
+ reachable_ipv6_relay & exit_relay]),
exiting = sum(advertised_bandwidth_bytes_sum_avg[
- exiting_ipv6_relay != "f"])) %>%
+ exiting_ipv6_relay])) %>%
complete(valid_after_date = full_seq(valid_after_date, period = 1)) %>%
rename(date = valid_after_date)
}
@@ -1372,8 +1487,13 @@ plot_advbw_ipv6 <- function(start_p, end_p, path_p) {
}
prepare_totalcw <- function(start_p = NULL, end_p = NULL) {
- read.csv(paste(stats_dir, "totalcw.csv", sep = ""),
- colClasses = c("valid_after_date" = "Date", "nickname" = "character")) %>%
+ read_csv(file = paste(stats_dir, "totalcw.csv", sep = ""),
+ col_types = cols(
+ valid_after_date = col_date(format = ""),
+ nickname = col_character(),
+ have_guard_flag = col_logical(),
+ have_exit_flag = col_logical(),
+ measured_sum_avg = col_double())) %>%
filter(if (!is.null(start_p))
valid_after_date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p))
commit 46059cf2a7bd58c651f44b89aaa009c911fe3084
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Wed Nov 28 10:14:21 2018 +0100
Stop signing jars.
Implements #28584.
---
CERT | 21 ---------------------
src/build | 2 +-
src/submods/metrics-lib | 2 +-
3 files changed, 2 insertions(+), 23 deletions(-)
diff --git a/CERT b/CERT
deleted file mode 100644
index b90b397..0000000
--- a/CERT
+++ /dev/null
@@ -1,21 +0,0 @@
------BEGIN CERTIFICATE-----
-MIIDaTCCAlGgAwIBAgIELle0dTANBgkqhkiG9w0BAQsFADBlMQswCQYDVQQGEwJV
-UzELMAkGA1UECBMCV0ExEDAOBgNVBAcTB1NlYXR0bGUxHTAbBgNVBAoTFFRoZSBU
-b3IgUHJvamVjdCwgSW5jMRgwFgYDVQQDEw9LYXJzdGVuIExvZXNpbmcwHhcNMTgw
-ODI4MDcwNjM2WhcNMTgxMTI2MDcwNjM2WjBlMQswCQYDVQQGEwJVUzELMAkGA1UE
-CBMCV0ExEDAOBgNVBAcTB1NlYXR0bGUxHTAbBgNVBAoTFFRoZSBUb3IgUHJvamVj
-dCwgSW5jMRgwFgYDVQQDEw9LYXJzdGVuIExvZXNpbmcwggEiMA0GCSqGSIb3DQEB
-AQUAA4IBDwAwggEKAoIBAQChXn+IUp+o6G+k4ffxk3TkxZb3iXfiG7byNsG63olU
-6aTpAjDMeaT4ctUwxH4+56Sbcf/wB0vEFBbX8MyRd1eY02PKwMVJ6VBhjOQcIlrd
-Qw+VAhKTcEIv4yiR0BWapQyR07pgmKirYVjN6s6ef8NJzUptpxLlaYJ3ZfQfc4aE
-MXzScgaccwDFIWQ661lzLGCfeSxxa3Xy4wWsGwzNzLITYrrABcbg7yogLo2btNvD
-oEwGL3/baQdhl0dra6biVCZr9ydn3Hg57S55pUU0rBY25id78zUO8xrfNHw54wwX
-lOblGt75OOkahP/ZZSBxxoiknJ6y5VQV8y+noA4vigXFAgMBAAGjITAfMB0GA1Ud
-DgQWBBSeh60M+/wMYyYhlxtuff2Hk9n7bzANBgkqhkiG9w0BAQsFAAOCAQEAkXZs
-3T3GTkZ+EGvZG5puzKdgZiSsLgIy25xdWsIx147AIZEJFKjEAtbu0osMpkTa96B6
-a+BHf7PTjQUuH3YOEmeW9ab8pwu5SRijCq2qkuvjjSLBcJzWnalcKDYYvoQte1//
-Di8JqpRXCw20WY2bldTiafyG80E0RGfiX2I8vbDiPIhjwz9Wox8Q1rw1c9T/vRn9
-pI8FrHgTnDO6R54yD25QSpsj+hC+IDkFKO17vGCIaJrPG5o6th438ijEwJsG+LRB
-4zKKKsFTby7UJI3Ag8xolIhsBkRZO2j4Na35i15SZ7QJNj9J5g171z8RyOmyIQbg
-q7OXN2iiRIxiIJwoQw==
------END CERTIFICATE-----
diff --git a/src/build b/src/build
index 08514a3..e639c69 160000
--- a/src/build
+++ b/src/build
@@ -1 +1 @@
-Subproject commit 08514a32afefbeef848b80f9a338ee840c282604
+Subproject commit e639c697e9e94c6dbb26e946e5247c20a62c0661
diff --git a/src/submods/metrics-lib b/src/submods/metrics-lib
index 603a439..23927c2 160000
--- a/src/submods/metrics-lib
+++ b/src/submods/metrics-lib
@@ -1 +1 @@
-Subproject commit 603a439f802c6d4a8b29367ce13b345ae8cf02bc
+Subproject commit 23927c2777f273c42ad3e75fc0a2940ed8eb4bf6
09 Nov '19
commit 09cfdfdff4efc1aa1cc60f53f7f1353a6193e6ad
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Nov 12 19:50:46 2018 +0100
Remove advbw column from bandwidth.csv.
Instead use advbw data from ipv6servers module.
As a result, we can stop aggregating advertised bandwidths in the
legacy module.
Required schema changes to live tordir databases:
DROP VIEW stats_bandwidth;
CREATE VIEW stats_bandwidth [...]
CREATE OR REPLACE FUNCTION refresh_all() [...]
DROP FUNCTION refresh_bandwidth_flags();
DROP FUNCTION refresh_relay_statuses_per_day();
DROP TABLE relay_statuses_per_day;
DROP TABLE bandwidth_flags;
DROP TABLE consensus;
DROP FUNCTION delete_old_descriptor();
DROP TABLE descriptor;
Part of #28116.
---
src/main/R/rserver/graphs.R | 58 +++---
.../metrics/stats/ipv6servers/Database.java | 22 ++
.../torproject/metrics/stats/ipv6servers/Main.java | 2 +
.../metrics/stats/servers/Configuration.java | 1 -
.../servers/RelayDescriptorDatabaseImporter.java | 232 +--------------------
src/main/sql/ipv6servers/init-ipv6servers.sql | 11 +
src/main/sql/legacy/tordir.sql | 135 +-----------
7 files changed, 73 insertions(+), 388 deletions(-)
diff --git a/src/main/R/rserver/graphs.R b/src/main/R/rserver/graphs.R
index 9dc8c2d..df108e2 100644
--- a/src/main/R/rserver/graphs.R
+++ b/src/main/R/rserver/graphs.R
@@ -446,16 +446,19 @@ write_platforms <- function(start_p = NULL, end_p = NULL, path_p) {
}
prepare_bandwidth <- function(start_p, end_p) {
- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
+ advbw <- read.csv(paste(stats_dir, "advbw.csv", sep = ""),
+ colClasses = c("date" = "Date")) %>%
+ transmute(date, variable = "advbw", value = advbw * 8 / 1e9)
+ bwhist <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
colClasses = c("date" = "Date")) %>%
+ transmute(date, variable = "bwhist", value = (bwread + bwwrite) * 8 / 2e9)
+ rbind(advbw, bwhist) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(isexit != "") %>%
- filter(isguard != "") %>%
- group_by(date) %>%
- summarize(advbw = sum(advbw) * 8 / 1e9,
- bwhist = sum(bwread + bwwrite) * 8 / 2e9) %>%
- select(date, advbw, bwhist)
+ filter(!is.na(value)) %>%
+ group_by(date, variable) %>%
+ summarize(value = sum(value)) %>%
+ spread(variable, value)
}
plot_bandwidth <- function(start_p, end_p, path_p) {
@@ -810,33 +813,24 @@ write_connbidirect <- function(start_p = NULL, end_p = NULL, path_p) {
}
prepare_bandwidth_flags <- function(start_p, end_p) {
- b <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
- colClasses = c("date" = "Date"))
- b <- b %>%
+ advbw <- read.csv(paste(stats_dir, "advbw.csv", sep = ""),
+ colClasses = c("date" = "Date")) %>%
+ transmute(date, isguard, isexit, variable = "advbw",
+ value = advbw * 8 / 1e9)
+ bwhist <- read.csv(paste(stats_dir, "bandwidth.csv", sep = ""),
+ colClasses = c("date" = "Date")) %>%
+ transmute(date, isguard, isexit, variable = "bwhist",
+ value = (bwread + bwwrite) * 8 / 2e9)
+ rbind(advbw, bwhist) %>%
filter(if (!is.null(start_p)) date >= as.Date(start_p) else TRUE) %>%
filter(if (!is.null(end_p)) date <= as.Date(end_p) else TRUE) %>%
- filter(isexit != "") %>%
- filter(isguard != "")
- b <- data.frame(date = b$date,
- isexit = b$isexit == "t", isguard = b$isguard == "t",
- advbw = b$advbw * 8 / 1e9,
- bwhist = (b$bwread + b$bwwrite) * 8 / 2e9)
- b <- rbind(
- data.frame(b[b$isguard == TRUE, ], flag = "guard"),
- data.frame(b[b$isexit == TRUE, ], flag = "exit"))
- b <- data.frame(date = b$date, advbw = b$advbw, bwhist = b$bwhist,
- flag = b$flag)
- b <- aggregate(list(advbw = b$advbw, bwhist = b$bwhist),
- by = list(date = b$date, flag = b$flag), FUN = sum,
- na.rm = TRUE, na.action = NULL)
- b <- gather(b, type, value, -c(date, flag))
- bandwidth <- b[b$value > 0, ]
- bandwidth <- data.frame(date = bandwidth$date,
- variable = as.factor(paste(bandwidth$flag, "_", bandwidth$type,
- sep = "")), value = bandwidth$value)
- bandwidth$variable <- factor(bandwidth$variable,
- levels = levels(bandwidth$variable)[c(3, 4, 1, 2)])
- bandwidth
+ group_by(date, variable) %>%
+ summarize(exit = sum(value[isexit == "t"]),
+ guard = sum(value[isguard == "t"])) %>%
+ gather(flag, value, -date, -variable) %>%
+ unite(variable, flag, variable) %>%
+ mutate(variable = factor(variable,
+ levels = c("guard_advbw", "guard_bwhist", "exit_advbw", "exit_bwhist")))
}
plot_bandwidth_flags <- function(start_p, end_p, path_p) {
diff --git a/src/main/java/org/torproject/metrics/stats/ipv6servers/Database.java b/src/main/java/org/torproject/metrics/stats/ipv6servers/Database.java
index c3a1fec..b5efe3e 100644
--- a/src/main/java/org/torproject/metrics/stats/ipv6servers/Database.java
+++ b/src/main/java/org/torproject/metrics/stats/ipv6servers/Database.java
@@ -435,6 +435,28 @@ class Database implements AutoCloseable {
return statistics;
}
+ /** Query the bandwidth_advbw view. */
+ List<String[]> queryAdvbw() throws SQLException {
+ List<String[]> statistics = new ArrayList<>();
+ String columns = "date, isexit, isguard, advbw";
+ statistics.add(columns.split(", "));
+ Statement st = this.connection.createStatement();
+ Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"),
+ Locale.US);
+ String queryString = "SELECT " + columns + " FROM bandwidth_advbw";
+ try (ResultSet rs = st.executeQuery(queryString)) {
+ while (rs.next()) {
+ String[] outputLine = new String[4];
+ outputLine[0] = rs.getDate("date", calendar).toLocalDate().toString();
+ outputLine[1] = rs.getString("isexit");
+ outputLine[2] = rs.getString("isguard");
+ outputLine[3] = getLongFromResultSet(rs, "advbw");
+ statistics.add(outputLine);
+ }
+ }
+ return statistics;
+ }
+
/** Query the servers_networksize view. */
List<String[]> queryNetworksize() throws SQLException {
List<String[]> statistics = new ArrayList<>();
diff --git a/src/main/java/org/torproject/metrics/stats/ipv6servers/Main.java b/src/main/java/org/torproject/metrics/stats/ipv6servers/Main.java
index a91a74f..d322a2e 100644
--- a/src/main/java/org/torproject/metrics/stats/ipv6servers/Main.java
+++ b/src/main/java/org/torproject/metrics/stats/ipv6servers/Main.java
@@ -88,6 +88,8 @@ public class Main {
log.info("Querying aggregated statistics from the database.");
new Writer().write(Paths.get(Configuration.output, "ipv6servers.csv"),
database.queryServersIpv6());
+ new Writer().write(Paths.get(Configuration.output, "advbw.csv"),
+ database.queryAdvbw());
new Writer().write(Paths.get(Configuration.output, "networksize.csv"),
database.queryNetworksize());
new Writer().write(Paths.get(Configuration.output, "relayflags.csv"),
diff --git a/src/main/java/org/torproject/metrics/stats/servers/Configuration.java b/src/main/java/org/torproject/metrics/stats/servers/Configuration.java
index c4597bc..76788df 100644
--- a/src/main/java/org/torproject/metrics/stats/servers/Configuration.java
+++ b/src/main/java/org/torproject/metrics/stats/servers/Configuration.java
@@ -102,7 +102,6 @@ public class Configuration {
if (this.directoryArchivesDirectories.isEmpty()) {
String prefix = "../../shared/in/recent/relay-descriptors/";
return Arrays.asList(new File(prefix + "consensuses/"),
- new File(prefix + "server-descriptors/"),
new File(prefix + "extra-infos/"));
} else {
return this.directoryArchivesDirectories;
diff --git a/src/main/java/org/torproject/metrics/stats/servers/RelayDescriptorDatabaseImporter.java b/src/main/java/org/torproject/metrics/stats/servers/RelayDescriptorDatabaseImporter.java
index c9a6fa7..2d1ae47 100644
--- a/src/main/java/org/torproject/metrics/stats/servers/RelayDescriptorDatabaseImporter.java
+++ b/src/main/java/org/torproject/metrics/stats/servers/RelayDescriptorDatabaseImporter.java
@@ -9,7 +9,6 @@ import org.torproject.descriptor.DescriptorSourceFactory;
import org.torproject.descriptor.ExtraInfoDescriptor;
import org.torproject.descriptor.NetworkStatusEntry;
import org.torproject.descriptor.RelayNetworkStatusConsensus;
-import org.torproject.descriptor.ServerDescriptor;
import org.postgresql.util.PGbytea;
@@ -20,7 +19,6 @@ import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
-import java.nio.charset.StandardCharsets;
import java.sql.CallableStatement;
import java.sql.Connection;
import java.sql.DriverManager;
@@ -28,7 +26,6 @@ import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
-import java.sql.Types;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
@@ -44,10 +41,6 @@ import java.util.TreeSet;
/**
* Parse directory data.
*/
-
-/* TODO Split up this class and move its parts to cron.network,
- * cron.users, and status.relaysearch packages. Requires extensive
- * changes to the database schema though. */
public final class RelayDescriptorDatabaseImporter {
/**
@@ -58,20 +51,10 @@ public final class RelayDescriptorDatabaseImporter {
/* Counters to keep track of the number of records committed before
* each transaction. */
- private int rdsCount = 0;
-
- private int resCount = 0;
-
private int rhsCount = 0;
private int rrsCount = 0;
- private int rcsCount = 0;
-
- private int rvsCount = 0;
-
- private int rqsCount = 0;
-
/**
* Relay descriptor database connection.
*/
@@ -85,18 +68,6 @@ public final class RelayDescriptorDatabaseImporter {
private PreparedStatement psSs;
/**
- * Prepared statement to check whether a given server descriptor has
- * been imported into the database before.
- */
- private PreparedStatement psDs;
-
- /**
- * Prepared statement to check whether a given network status consensus
- * has been imported into the database before.
- */
- private PreparedStatement psCs;
-
- /**
* Set of dates that have been inserted into the database for being
* included in the next refresh run.
*/
@@ -115,22 +86,11 @@ public final class RelayDescriptorDatabaseImporter {
private PreparedStatement psR;
/**
- * Prepared statement to insert a server descriptor into the database.
- */
- private PreparedStatement psD;
-
- /**
* Callable statement to insert the bandwidth history of an extra-info
* descriptor into the database.
*/
private CallableStatement csH;
- /**
- * Prepared statement to insert a network status consensus into the
- * database.
- */
- private PreparedStatement psC;
-
private static Logger log
= LoggerFactory.getLogger(RelayDescriptorDatabaseImporter.class);
@@ -145,21 +105,11 @@ public final class RelayDescriptorDatabaseImporter {
private BufferedWriter statusentryOut;
/**
- * Raw import file containing server descriptors.
- */
- private BufferedWriter descriptorOut;
-
- /**
* Raw import file containing bandwidth histories.
*/
private BufferedWriter bwhistOut;
/**
- * Raw import file containing consensuses.
- */
- private BufferedWriter consensusOut;
-
- /**
* Date format to parse timestamps.
*/
private SimpleDateFormat dateTimeFormat;
@@ -212,10 +162,6 @@ public final class RelayDescriptorDatabaseImporter {
/* Prepare statements. */
this.psSs = conn.prepareStatement("SELECT fingerprint "
+ "FROM statusentry WHERE validafter = ?");
- this.psDs = conn.prepareStatement("SELECT COUNT(*) "
- + "FROM descriptor WHERE descriptor = ?");
- this.psCs = conn.prepareStatement("SELECT COUNT(*) "
- + "FROM consensus WHERE validafter = ?");
this.psR = conn.prepareStatement("INSERT INTO statusentry "
+ "(validafter, nickname, fingerprint, descriptor, "
+ "published, address, orport, dirport, isauthority, "
@@ -224,16 +170,8 @@ public final class RelayDescriptorDatabaseImporter {
+ "isvalid, isv2dir, isv3dir, version, bandwidth, ports, "
+ "rawdesc) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, "
+ "?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
- this.psD = conn.prepareStatement("INSERT INTO descriptor "
- + "(descriptor, nickname, address, orport, dirport, "
- + "fingerprint, bandwidthavg, bandwidthburst, "
- + "bandwidthobserved, platform, published, uptime, "
- + "extrainfo) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, "
- + "?)");
this.csH = conn.prepareCall("{call insert_bwhist(?, ?, ?, ?, ?, "
+ "?)}");
- this.psC = conn.prepareStatement("INSERT INTO consensus "
- + "(validafter) VALUES (?)");
this.psU = conn.prepareStatement("INSERT INTO scheduled_updates "
+ "(date) VALUES (?)");
this.scheduledUpdates = new HashSet<>();
@@ -390,95 +328,9 @@ public final class RelayDescriptorDatabaseImporter {
}
/**
- * Insert server descriptor into database.
- */
- public void addServerDescriptorContents(String descriptor,
- String nickname, String address, int orPort, int dirPort,
- String relayIdentifier, long bandwidthAvg, long bandwidthBurst,
- long bandwidthObserved, String platform, long published,
- Long uptime, String extraInfoDigest) {
- if (this.importIntoDatabase) {
- try {
- this.addDateToScheduledUpdates(published);
- this.addDateToScheduledUpdates(
- published + 24L * 60L * 60L * 1000L);
- Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
- this.psDs.setString(1, descriptor);
- ResultSet rs = psDs.executeQuery();
- rs.next();
- if (rs.getInt(1) == 0) {
- this.psD.clearParameters();
- this.psD.setString(1, descriptor);
- this.psD.setString(2, nickname);
- this.psD.setString(3, address);
- this.psD.setInt(4, orPort);
- this.psD.setInt(5, dirPort);
- this.psD.setString(6, relayIdentifier);
- this.psD.setLong(7, bandwidthAvg);
- this.psD.setLong(8, bandwidthBurst);
- this.psD.setLong(9, bandwidthObserved);
- /* Remove all non-ASCII characters from the platform string, or
- * we'll make Postgres unhappy. Sun's JDK and OpenJDK behave
- * differently when creating a new String with a given encoding.
- * That's what the regexp below is for. */
- this.psD.setString(10, new String(platform.getBytes(),
- StandardCharsets.US_ASCII).replaceAll("[^\\p{ASCII}]",""));
- this.psD.setTimestamp(11, new Timestamp(published), cal);
- if (null != uptime) {
- this.psD.setLong(12, uptime);
- } else {
- this.psD.setNull(12, Types.BIGINT);
- }
- this.psD.setString(13, extraInfoDigest);
- this.psD.executeUpdate();
- rdsCount++;
- if (rdsCount % autoCommitCount == 0) {
- this.conn.commit();
- }
- }
- } catch (SQLException e) {
- log.warn("Could not add server "
- + "descriptor. We won't make any further SQL requests in "
- + "this execution.", e);
- this.importIntoDatabase = false;
- }
- }
- if (this.writeRawImportFiles) {
- try {
- if (this.descriptorOut == null) {
- new File(rawFilesDirectory).mkdirs();
- this.descriptorOut = new BufferedWriter(new FileWriter(
- rawFilesDirectory + "/descriptor.sql"));
- this.descriptorOut.write(" COPY descriptor (descriptor, "
- + "nickname, address, orport, dirport, fingerprint, "
- + "bandwidthavg, bandwidthburst, bandwidthobserved, "
- + "platform, published, uptime, extrainfo) FROM stdin;\n");
- }
- this.descriptorOut.write(descriptor.toLowerCase() + "\t"
- + nickname + "\t" + address + "\t" + orPort + "\t" + dirPort
- + "\t" + relayIdentifier + "\t" + bandwidthAvg + "\t"
- + bandwidthBurst + "\t" + bandwidthObserved + "\t"
- + (platform != null && platform.length() > 0
- ? new String(platform.getBytes(), StandardCharsets.US_ASCII)
- : "\\N") + "\t" + this.dateTimeFormat.format(published) + "\t"
- + (uptime >= 0 ? uptime : "\\N") + "\t"
- + (extraInfoDigest != null ? extraInfoDigest : "\\N")
- + "\n");
- } catch (IOException e) {
- log.warn("Could not write server "
- + "descriptor to raw database import file. We won't make "
- + "any further attempts to write raw import files in this "
- + "execution.", e);
- this.writeRawImportFiles = false;
- }
- }
- }
-
- /**
* Insert extra-info descriptor into database.
*/
- public void addExtraInfoDescriptorContents(String extraInfoDigest,
- String nickname, String fingerprint, long published,
+ public void addExtraInfoDescriptorContents(String fingerprint, long published,
List<String> bandwidthHistoryLines) {
if (!bandwidthHistoryLines.isEmpty()) {
this.addBandwidthHistory(fingerprint.toLowerCase(), published,
@@ -766,55 +618,6 @@ public final class RelayDescriptorDatabaseImporter {
}
}
- /**
- * Insert network status consensus into database.
- */
- public void addConsensus(long validAfter) {
- if (this.importIntoDatabase) {
- try {
- this.addDateToScheduledUpdates(validAfter);
- Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
- Timestamp validAfterTimestamp = new Timestamp(validAfter);
- this.psCs.setTimestamp(1, validAfterTimestamp, cal);
- ResultSet rs = psCs.executeQuery();
- rs.next();
- if (rs.getInt(1) == 0) {
- this.psC.clearParameters();
- this.psC.setTimestamp(1, validAfterTimestamp, cal);
- this.psC.executeUpdate();
- rcsCount++;
- if (rcsCount % autoCommitCount == 0) {
- this.conn.commit();
- }
- }
- } catch (SQLException e) {
- log.warn("Could not add network status "
- + "consensus. We won't make any further SQL requests in "
- + "this execution.", e);
- this.importIntoDatabase = false;
- }
- }
- if (this.writeRawImportFiles) {
- try {
- if (this.consensusOut == null) {
- new File(rawFilesDirectory).mkdirs();
- this.consensusOut = new BufferedWriter(new FileWriter(
- rawFilesDirectory + "/consensus.sql"));
- this.consensusOut.write(" COPY consensus (validafter) "
- + "FROM stdin;\n");
- }
- String validAfterString = this.dateTimeFormat.format(validAfter);
- this.consensusOut.write(validAfterString + "\n");
- } catch (IOException e) {
- log.warn("Could not write network status "
- + "consensus to raw database import file. We won't make "
- + "any further attempts to write raw import files in this "
- + "execution.", e);
- this.writeRawImportFiles = false;
- }
- }
- }
-
/** Imports relay descriptors into the database. */
public void importRelayDescriptors() {
log.info("Importing files in directories " + archivesDirectories
@@ -834,8 +637,6 @@ public final class RelayDescriptorDatabaseImporter {
if (descriptor instanceof RelayNetworkStatusConsensus) {
this.addRelayNetworkStatusConsensus(
(RelayNetworkStatusConsensus) descriptor);
- } else if (descriptor instanceof ServerDescriptor) {
- this.addServerDescriptor((ServerDescriptor) descriptor);
} else if (descriptor instanceof ExtraInfoDescriptor) {
this.addExtraInfoDescriptor((ExtraInfoDescriptor) descriptor);
}
@@ -862,18 +663,6 @@ public final class RelayDescriptorDatabaseImporter {
statusEntry.getBandwidth(), statusEntry.getPortList(),
statusEntry.getStatusEntryBytes());
}
- this.addConsensus(consensus.getValidAfterMillis());
- }
-
- private void addServerDescriptor(ServerDescriptor descriptor) {
- this.addServerDescriptorContents(
- descriptor.getDigestSha1Hex(), descriptor.getNickname(),
- descriptor.getAddress(), descriptor.getOrPort(),
- descriptor.getDirPort(), descriptor.getFingerprint(),
- descriptor.getBandwidthRate(), descriptor.getBandwidthBurst(),
- descriptor.getBandwidthObserved(), descriptor.getPlatform(),
- descriptor.getPublishedMillis(), descriptor.getUptime(),
- descriptor.getExtraInfoDigestSha1Hex());
}
private void addExtraInfoDescriptor(ExtraInfoDescriptor descriptor) {
@@ -892,8 +681,7 @@ public final class RelayDescriptorDatabaseImporter {
bandwidthHistoryLines.add(
descriptor.getDirreqReadHistory().getLine());
}
- this.addExtraInfoDescriptorContents(descriptor.getDigestSha1Hex(),
- descriptor.getNickname(),
+ this.addExtraInfoDescriptorContents(
descriptor.getFingerprint().toLowerCase(),
descriptor.getPublishedMillis(), bandwidthHistoryLines);
}
@@ -904,12 +692,8 @@ public final class RelayDescriptorDatabaseImporter {
public void closeConnection() {
/* Log stats about imported descriptors. */
- log.info("Finished importing relay "
- + "descriptors: {} consensuses, {} network status entries, {} "
- + "votes, {} server descriptors, {} extra-info descriptors, {} "
- + "bandwidth history elements, and {} dirreq stats elements",
- rcsCount, rrsCount, rvsCount, rdsCount, resCount, rhsCount,
- rqsCount);
+ log.info("Finished importing relay descriptors: {} network status entries "
+ + "and {} bandwidth history elements", rrsCount, rhsCount);
/* Insert scheduled updates a second time, just in case the refresh
* run has started since inserting them the first time in which case
@@ -951,18 +735,10 @@ public final class RelayDescriptorDatabaseImporter {
this.statusentryOut.write("\\.\n");
this.statusentryOut.close();
}
- if (this.descriptorOut != null) {
- this.descriptorOut.write("\\.\n");
- this.descriptorOut.close();
- }
if (this.bwhistOut != null) {
this.bwhistOut.write("\\.\n");
this.bwhistOut.close();
}
- if (this.consensusOut != null) {
- this.consensusOut.write("\\.\n");
- this.consensusOut.close();
- }
} catch (IOException e) {
log.warn("Could not close one or more raw database import files.", e);
}
diff --git a/src/main/sql/ipv6servers/init-ipv6servers.sql b/src/main/sql/ipv6servers/init-ipv6servers.sql
index b478a49..c94a19d 100644
--- a/src/main/sql/ipv6servers/init-ipv6servers.sql
+++ b/src/main/sql/ipv6servers/init-ipv6servers.sql
@@ -312,6 +312,17 @@ GROUP BY DATE(valid_after), server, guard_relay, exit_relay, announced_ipv6,
ORDER BY valid_after_date, server, guard_relay, exit_relay, announced_ipv6,
exiting_ipv6_relay, reachable_ipv6_relay;
+-- View on advertised bandwidth by Exit/Guard flag combination.
+CREATE OR REPLACE VIEW bandwidth_advbw AS
+SELECT valid_after_date AS date,
+ exit_relay AS isexit,
+ guard_relay AS isguard,
+ FLOOR(SUM(advertised_bandwidth_bytes_sum_avg)) AS advbw
+FROM ipv6servers
+WHERE server = 'relay'
+GROUP BY date, isexit, isguard
+ORDER BY date, isexit, isguard;
+
-- View on the number of running servers by relay flag.
CREATE OR REPLACE VIEW servers_flags_complete AS
WITH included_statuses AS (
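The new bandwidth_advbw view takes over the role of the legacy bandwidth_flags table: it aggregates advertised bandwidth per day and Exit/Guard flag combination directly from the ipv6servers table, restricted to relays. A hedged sketch of reading the view over JDBC; the JDBC URL, database name, and class name are assumptions for illustration only:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class AdvbwQuerySketch {
  public static void main(String[] args) throws SQLException {
    // Placeholder connection string; adapt to the actual database.
    String url = "jdbc:postgresql://localhost/metrics";
    try (Connection conn = DriverManager.getConnection(url);
        Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery(
            "SELECT date, isexit, isguard, advbw FROM bandwidth_advbw")) {
      while (rs.next()) {
        System.out.printf("%s exit=%b guard=%b advbw=%d%n",
            rs.getDate("date"), rs.getBoolean("isexit"),
            rs.getBoolean("isguard"), rs.getLong("advbw"));
      }
    }
  }
}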
diff --git a/src/main/sql/legacy/tordir.sql b/src/main/sql/legacy/tordir.sql
index f1d6767..dfe7b5d 100644
--- a/src/main/sql/legacy/tordir.sql
+++ b/src/main/sql/legacy/tordir.sql
@@ -3,33 +3,6 @@
CREATE LANGUAGE plpgsql;
--- TABLE descriptor
--- Contains all of the descriptors published by routers.
-CREATE TABLE descriptor (
- descriptor CHARACTER(40) NOT NULL,
- nickname CHARACTER VARYING(19) NOT NULL,
- address CHARACTER VARYING(15) NOT NULL,
- orport INTEGER NOT NULL,
- dirport INTEGER NOT NULL,
- fingerprint CHARACTER(40) NOT NULL,
- bandwidthavg BIGINT NOT NULL,
- bandwidthburst BIGINT NOT NULL,
- bandwidthobserved BIGINT NOT NULL,
- platform CHARACTER VARYING(256),
- published TIMESTAMP WITHOUT TIME ZONE NOT NULL,
- uptime BIGINT,
- extrainfo CHARACTER(40),
- CONSTRAINT descriptor_pkey PRIMARY KEY (descriptor)
-);
-
-CREATE OR REPLACE FUNCTION delete_old_descriptor()
-RETURNS INTEGER AS $$
- BEGIN
- DELETE FROM descriptor WHERE DATE(published) < current_date - 14;
- RETURN 1;
- END;
-$$ LANGUAGE plpgsql;
-
-- Contains bandwidth histories reported by relays in extra-info
-- descriptors. Each row contains the reported bandwidth in 15-minute
-- intervals for each relay and date.
@@ -97,22 +70,6 @@ RETURNS INTEGER AS $$
END;
$$ LANGUAGE plpgsql;
--- TABLE consensus
--- Contains all of the consensuses published by the directories.
-CREATE TABLE consensus (
- validafter TIMESTAMP WITHOUT TIME ZONE NOT NULL,
- CONSTRAINT consensus_pkey PRIMARY KEY (validafter)
-);
-
--- TABLE bandwidth_flags
-CREATE TABLE bandwidth_flags (
- date DATE NOT NULL,
- isexit BOOLEAN NOT NULL,
- isguard BOOLEAN NOT NULL,
- bwadvertised BIGINT NOT NULL,
- CONSTRAINT bandwidth_flags_pkey PRIMARY KEY(date, isexit, isguard)
-);
-
-- TABLE bwhist_flags
CREATE TABLE bwhist_flags (
date DATE NOT NULL,
@@ -149,15 +106,6 @@ CREATE TABLE user_stats (
CONSTRAINT user_stats_pkey PRIMARY KEY(date, country)
);
--- TABLE relay_statuses_per_day
--- A helper table which is commonly used to update the tables above in the
--- refresh_* functions.
-CREATE TABLE relay_statuses_per_day (
- date DATE NOT NULL,
- count INTEGER NOT NULL,
- CONSTRAINT relay_statuses_per_day_pkey PRIMARY KEY(date)
-);
-
-- Dates to be included in the next refresh run.
CREATE TABLE scheduled_updates (
id SERIAL,
@@ -174,24 +122,6 @@ CREATE TABLE updates (
date DATE
);
--- FUNCTION refresh_relay_statuses_per_day()
--- Updates helper table which is used to refresh the aggregate tables.
-CREATE OR REPLACE FUNCTION refresh_relay_statuses_per_day()
-RETURNS INTEGER AS $$
- BEGIN
- DELETE FROM relay_statuses_per_day
- WHERE date IN (SELECT date FROM updates);
- INSERT INTO relay_statuses_per_day (date, count)
- SELECT DATE(validafter) AS date, COUNT(*) AS count
- FROM consensus
- WHERE DATE(validafter) >= (SELECT MIN(date) FROM updates)
- AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
- AND DATE(validafter) IN (SELECT date FROM updates)
- GROUP BY DATE(validafter);
- RETURN 1;
- END;
-$$ LANGUAGE plpgsql;
-
CREATE OR REPLACE FUNCTION array_sum (BIGINT[]) RETURNS BIGINT AS $$
SELECT SUM($1[i])::bigint
FROM generate_series(array_lower($1, 1), array_upper($1, 1)) index(i);
@@ -247,45 +177,11 @@ $$ LANGUAGE plpgsql;
-- refresh_* functions
-- The following functions keep their corresponding aggregate tables
--- up-to-date. They should be called every time ERNIE is run, or when new
--- data is finished being added to the descriptor or statusentry tables.
+-- up-to-date. They should be called every time this module is run, or when new
+-- data is finished being added to the statusentry tables.
-- They find what new data has been entered or updated based on the
-- updates table.
-CREATE OR REPLACE FUNCTION refresh_bandwidth_flags() RETURNS INTEGER AS $$
- DECLARE
- min_date TIMESTAMP WITHOUT TIME ZONE;
- max_date TIMESTAMP WITHOUT TIME ZONE;
- BEGIN
-
- min_date := (SELECT MIN(date) FROM updates);
- max_date := (SELECT MAX(date) + 1 FROM updates);
-
- DELETE FROM bandwidth_flags WHERE date IN (SELECT date FROM updates);
- EXECUTE '
- INSERT INTO bandwidth_flags (date, isexit, isguard, bwadvertised)
- SELECT DATE(validafter) AS date,
- BOOL_OR(isexit) AS isexit,
- BOOL_OR(isguard) AS isguard,
- (SUM(LEAST(bandwidthavg, bandwidthobserved))
- / relay_statuses_per_day.count)::BIGINT AS bwadvertised
- FROM descriptor RIGHT JOIN statusentry
- ON descriptor.descriptor = statusentry.descriptor
- JOIN relay_statuses_per_day
- ON DATE(validafter) = relay_statuses_per_day.date
- WHERE isrunning = TRUE
- AND validafter >= ''' || min_date || '''
- AND validafter < ''' || max_date || '''
- AND DATE(validafter) IN (SELECT date FROM updates)
- AND relay_statuses_per_day.date >= ''' || min_date || '''
- AND relay_statuses_per_day.date < ''' || max_date || '''
- AND DATE(relay_statuses_per_day.date) IN
- (SELECT date FROM updates)
- GROUP BY DATE(validafter), isexit, isguard, relay_statuses_per_day.count';
- RETURN 1;
- END;
-$$ LANGUAGE plpgsql;
-
CREATE OR REPLACE FUNCTION refresh_bwhist_flags() RETURNS INTEGER AS $$
DECLARE
min_date TIMESTAMP WITHOUT TIME ZONE;
@@ -391,18 +287,12 @@ CREATE OR REPLACE FUNCTION refresh_all() RETURNS INTEGER AS $$
DELETE FROM updates;
RAISE NOTICE '% Copying scheduled dates.', timeofday();
INSERT INTO updates SELECT * FROM scheduled_updates;
- RAISE NOTICE '% Refreshing relay statuses per day.', timeofday();
- PERFORM refresh_relay_statuses_per_day();
- RAISE NOTICE '% Refreshing total relay bandwidth.', timeofday();
- PERFORM refresh_bandwidth_flags();
RAISE NOTICE '% Refreshing bandwidth history.', timeofday();
PERFORM refresh_bwhist_flags();
RAISE NOTICE '% Refreshing user statistics.', timeofday();
PERFORM refresh_user_stats();
RAISE NOTICE '% Deleting processed dates.', timeofday();
DELETE FROM scheduled_updates WHERE id IN (SELECT id FROM updates);
- RAISE NOTICE '% Deleting old descriptors.', timeofday();
- PERFORM delete_old_descriptor();
RAISE NOTICE '% Deleting old bandwidth histories.', timeofday();
PERFORM delete_old_bwhist();
RAISE NOTICE '% Deleting old status entries.', timeofday();
@@ -414,23 +304,14 @@ $$ LANGUAGE plpgsql;
-- View for exporting bandwidth statistics.
CREATE VIEW stats_bandwidth AS
- (SELECT COALESCE(bandwidth_flags.date, bwhist_flags.date) AS date,
- COALESCE(bandwidth_flags.isexit, bwhist_flags.isexit) AS isexit,
- COALESCE(bandwidth_flags.isguard, bwhist_flags.isguard) AS isguard,
- bandwidth_flags.bwadvertised AS advbw,
- CASE WHEN bwhist_flags.read IS NOT NULL
- THEN bwhist_flags.read / 86400 END AS bwread,
- CASE WHEN bwhist_flags.written IS NOT NULL
- THEN bwhist_flags.written / 86400 END AS bwwrite,
+ (SELECT date, isexit, isguard,
+ read / 86400 AS bwread,
+ written / 86400 AS bwwrite,
NULL AS dirread, NULL AS dirwrite
- FROM bandwidth_flags FULL OUTER JOIN bwhist_flags
- ON bandwidth_flags.date = bwhist_flags.date
- AND bandwidth_flags.isexit = bwhist_flags.isexit
- AND bandwidth_flags.isguard = bwhist_flags.isguard
- WHERE COALESCE(bandwidth_flags.date, bwhist_flags.date) <
- current_date - 2)
+ FROM bwhist_flags
+ WHERE date < current_date - 2)
UNION ALL
- (SELECT date, NULL AS isexit, NULL AS isguard, NULL AS advbw,
+ (SELECT date, NULL AS isexit, NULL AS isguard,
NULL AS bwread, NULL AS bwwrite,
FLOOR(CAST(dr AS NUMERIC) / CAST(86400 AS NUMERIC)) AS dirread,
FLOOR(CAST(dw AS NUMERIC) / CAST(86400 AS NUMERIC)) AS dirwrite
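With bandwidth_flags, relay_statuses_per_day, and their refresh functions gone, refresh_all() now only refreshes bandwidth histories and user statistics, and the simplified stats_bandwidth view reads its bandwidth columns straight from bwhist_flags. A hedged sketch of driving a refresh run and reading the export view over JDBC; the connection string and class name are placeholders, not part of this change:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class LegacyRefreshSketch {
  public static void main(String[] args) throws SQLException {
    String url = "jdbc:postgresql://localhost/tordir";
    try (Connection conn = DriverManager.getConnection(url);
        Statement st = conn.createStatement()) {
      // refresh_all() walks the scheduled_updates table and refreshes
      // bwhist_flags and user_stats for the affected dates.
      st.execute("SELECT refresh_all()");
      // Rows with NULL isexit carry directory bandwidth only, so they
      // are filtered out here.
      try (ResultSet rs = st.executeQuery(
          "SELECT date, isexit, isguard, bwread, bwwrite "
              + "FROM stats_bandwidth WHERE isexit IS NOT NULL")) {
        while (rs.next()) {
          System.out.printf("%s exit=%b guard=%b read=%d write=%d%n",
              rs.getDate("date"), rs.getBoolean("isexit"),
              rs.getBoolean("isguard"), rs.getLong("bwread"),
              rs.getLong("bwwrite"));
        }
      }
    }
  }
}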