[or-cvs] [metrics/master] Update parsing scripts for client requests to directories.

karsten at seul.org karsten at seul.org
Wed Jul 1 18:15:44 UTC 2009


Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Wed, 1 Jul 2009 19:20:07 +0200
Subject: Update parsing scripts for client requests to directories.
Commit: 88f7c49cd3ded6b50ef58493fd8f5122efe9f769

---
 HOWTO                                              |    6 +-
 scripts/dirreq/dirreq-censored.R                   |   39 ---
 scripts/dirreq/dirreq.R                            |   33 --
 .../metrics/dirreq/ParseDirectoryRequests.java     |  353 --------------------
 .../torproject/metrics/dirreq/ParseGeoipStats.java |  234 +++++++++++++
 5 files changed, 237 insertions(+), 428 deletions(-)
 delete mode 100644 scripts/dirreq/dirreq-censored.R
 delete mode 100644 scripts/dirreq/dirreq.R
 delete mode 100644 src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java
 create mode 100644 src/org/torproject/metrics/dirreq/ParseGeoipStats.java

diff --git a/HOWTO b/HOWTO
index b49f228..94fd802 100644
--- a/HOWTO
+++ b/HOWTO
@@ -181,12 +181,12 @@ $ javac -d bin/ -cp src/:lib/* src/org/torproject/metrics/dirreq/*.java
 
 Run the parsing script:
 
-$ java -cp bin/:lib/* org.torproject.metrics.dirreq.ParseDirectoryRequests
-  data/dirreq/ out/dirreq/ 168 0
+$ java -cp bin/:lib/* org.torproject.metrics.dirreq.ParseGeoipStats
+  data/geoipstats/ out/geoipstats/
 
 $ mkdir report/
 $ mkdir report/dirreq/
-$ R -q --no-save < scripts/dirreq/dirreq.R
+$ R -q --no-save < scripts/dirreq/geoipstats.R
 
 
 3  Bridge archives
diff --git a/scripts/dirreq/dirreq-censored.R b/scripts/dirreq/dirreq-censored.R
deleted file mode 100644
index dcbc755..0000000
--- a/scripts/dirreq/dirreq-censored.R
+++ /dev/null
@@ -1,39 +0,0 @@
-a <- read.csv("out/dirreq/moria-dir64.log-req.csv")
-b <- read.csv("out/dirreq/moria-dir128.log-req.csv")
-c <- read.csv("out/dirreq/gabelmoo-dir512.log-req.csv")
-a <- a/1e3
-b <- b/1e3
-c <- c/1e3
-a19 <- c(a$cnt, a$rut, a$egt, a$vnt, a$sat, a$irt, a$mat, a$jot, a$pkt, a$byt, a$kzt, a$syt, a$aet, a$sdt, a$uzt, a$azt, a$yet)
-b19 <- c(b$cnt, b$rut, b$egt, b$vnt, b$sat, b$irt, b$mat, b$jot, b$pkt, b$byt, b$kzt, b$syt, b$aet, b$sdt, b$uzt, b$azt, b$yet)
-c19 <- c(c$cnt, c$rut, c$egt, c$vnt, c$sat, c$irt, c$mat, c$jot, c$pkt, c$byt, c$kzt, c$syt, c$aet, c$sdt, c$uzt, c$azt, c$yet)
-print(a19)
-print(b19)
-print(c19)
-m <- matrix(rev(c(c19, b19, a19)), nrow=17, ncol=3, byrow=FALSE)
-pdf("report/dirreq/dirreq-censored.pdf", width=8, height=6)
-oldpar <- par(mar=c(2.1, 3.9, 1.4, 4.9))
-barplot(m, col = c("orange", "red", "purple", "darkgreen", "red", "yellow", "blue"), ylab = "Requests for network statuses seen in 1 week [in K]", main = "Requests to directory caches by country", border = "white", names.arg = c("Directory with 64 KB/s", "Directory with 128 KB/s", "Directory with 512 KB/s"))
-mtext("China", side=4, las=1, at=12.6) #(cn)
-mtext("Russia", side=4, las=1, at=4.65) #(ru)
-mtext("Egypt", side=4, las=1, at=2.78) #(eg)
-mtext("Viet Nam", side=4, las=1, at=2.13) #(vn)
-mtext("Saudi Arabia", side=4, las=1, at=1.55) #(sa)
-mtext("Iran", side=4, las=1, at=1.0) #(ir)
-#mtext("Kazakhstan", side=4, las=1, at=1000) #(kz)
-#mtext("Belarus", side=4, las=1, at=1659) #(by)
-#mtext("Pakistan", side=4, las=1, at=1459) #(pk)
-#mtext("Jordan", side=4, las=1, at=1259) #(jo)
-#mtext("Syria", side=4, las=1, at=1059) #(sy)
-#mtext("U.A.E.", side=4, las=1, at=859) #(ae)
-#mtext("Uzbekistan", side=4, las=1, at=659) #(uz)
-#mtext("Yemen", side=4, las=1, at=459) #(ye)
-#mtext("Azerbaijan", side=4, las=1, at=259) #(az)
-#mtext("Egypt", side=4, las=1, at=59) #(eg)
-##mtext("Myanmar", side=4, las=1, at=59) #(mm)
-##mtext("Morocco", side=4, las=1, at=59) #(ma)
-##mtext("Sudan", side=4, las=1, at=59) #(sd)
-##mtext("Tunisia", side=4, las=1, at=59) #(tn)
-par(oldpar)
-dev.off();
-
diff --git a/scripts/dirreq/dirreq.R b/scripts/dirreq/dirreq.R
deleted file mode 100644
index a450af6..0000000
--- a/scripts/dirreq/dirreq.R
+++ /dev/null
@@ -1,33 +0,0 @@
-a <- read.csv("out/dirreq/moria-dir64.log-req.csv")
-b <- read.csv("out/dirreq/moria-dir128.log-req.csv")
-c <- read.csv("out/dirreq/gabelmoo-dir512.log-req.csv")
-a <- a/1e3
-b <- b/1e3
-c <- c/1e3
-sort(apply(a[,seq(7,466,3)], 2, mean), decreasing = TRUE)[1:15]
-sort(apply(b[,seq(7,478,3)], 2, mean), decreasing = TRUE)[1:15]
-sort(apply(c[,seq(7,523,3)], 2, mean), decreasing = TRUE)[1:15]
-asum <- sum(sort(apply(a[,seq(7,466,3)], 2, mean), decreasing = TRUE))
-bsum <- sum(sort(apply(b[,seq(7,478,3)], 2, mean), decreasing = TRUE))
-csum <- sum(sort(apply(c[,seq(7,523,3)], 2, mean), decreasing = TRUE))
-atop10 <- c(a$ust, a$det, a$cnt, a$itt, a$krt, a$gbt, a$frt, a$rut, a$cat, a$jpt)
-btop10 <- c(b$ust, b$det, b$cnt, b$itt, b$krt, b$gbt, b$frt, b$rut, b$cat, b$jpt)
-ctop10 <- c(c$ust, c$det, c$cnt, c$itt, c$krt, c$gbt, c$frt, c$rut, c$cat, c$jpt)
-m <- matrix(rev(c(ctop10, csum - sum(ctop10), btop10, bsum - sum(btop10), atop10, asum - sum(atop10))), nrow=11, ncol=3, byrow=FALSE)
-pdf("report/dirreq/dirreq.pdf", width=8, height=6)
-oldpar <- par(mar=c(2.1, 3.9, 1.4, 4.9))
-barplot(m, col = c("orange", "red", "purple", "darkgreen", "red", "yellow", "blue"), ylab = "Requests for network statuses seen in 1 week [in K]", main = "Requests to directory caches by country", border = "white", names.arg = c("Directory with 64 KB/s", "Directory with 128 KB/s", "Directory with 512 KB/s"))
-mtext("U.S.A.", side=4, las=1, at=93.5)
-mtext("Germany", side=4, las=1, at=75.5)
-mtext("China", side=4, las=1, at=61)
-mtext("Italy", side=4, las=1, at=52)
-mtext("South Korea", side=4, las=1, at=48)
-mtext("U.K.", side=4, las=1, at=44.2)
-mtext("France", side=4, las=1, at=41)
-mtext("Russia", side=4, las=1, at=38.1)
-mtext("Canada", side=4, las=1, at=35.3)
-mtext("Japan", side=4, las=1, at=32.5)
-mtext("Others", side=4, las=1, at=15.5)
-par(oldpar)
-dev.off();
-
diff --git a/src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java b/src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java
deleted file mode 100644
index 76e8831..0000000
--- a/src/org/torproject/metrics/dirreq/ParseDirectoryRequests.java
+++ /dev/null
@@ -1,353 +0,0 @@
-/* Copyright 2009 Karsten Loesing
- * See LICENSE for licensing information */
-package org.torproject.metrics.dirreq;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.text.ParsePosition;
-import java.text.SimpleDateFormat;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.SortedSet;
-import java.util.TimeZone;
-import java.util.TreeMap;
-import java.util.TreeSet;
-
-import com.maxmind.geoip.LookupService;
-
-public final class ParseDirectoryRequests {
-
-    private ParseDirectoryRequests() {
-    }
-
-    public static void main(final String[] args) throws Exception {
-
-        // check input parameters
-        if (args.length < 4) {
-            System.err.println("Usage: java "
-                    + ParseDirectoryRequests.class.getSimpleName()
-                    + " <input directory> <output directory> "
-                    + "<unique interval length> <accumulate unique IPs>");
-            System.exit(1);
-        }
-        File inputDirectory = new File(args[0]);
-        if (!inputDirectory.exists() || !inputDirectory.isDirectory()) {
-            System.err.println("Input directory '"
-                    + inputDirectory.getAbsolutePath()
-                    + "' does not exist or is not a directory.");
-            System.exit(1);
-        }
-        File outputDirectory = new File(args[1]);
-        if (outputDirectory.exists() && !outputDirectory.isDirectory()) {
-            System.err.println("Output directory '"
-                    + outputDirectory.getAbsolutePath()
-                    + "' exists, but is not a directory.");
-            System.exit(1);
-        }
-        outputDirectory.mkdir();
-        long uniqueIntervalLength = Long.parseLong(args[2])
-                * 60L * 60L * 1000L;
-        boolean accumulate = Integer.parseInt(args[3]) != 0;
-
-        long started = System.currentTimeMillis();
-
-        String dbfile2 = "res/GeoIP.dat";
-        LookupService cl = new LookupService(dbfile2,
-                LookupService.GEOIP_MEMORY_CACHE);
-
-        // parse input files
-        for (File inputFile : inputDirectory.listFiles()) {
-
-            // this is a terrible hack! but it works for our data..
-            String timeZone = "Europe/Berlin";
-            if (inputFile.getName().startsWith("moria")) {
-                timeZone = "US/Eastern";
-            }
-            System.out.println("Parsing " + inputFile.getName()
-                    + " with timezone " + timeZone);
-
-            // Tor's logs don't contain years. use the current year
-            // instead. this is a pretty bad hack, but it works. just
-            // make sure that logs don't cross year boundaries!!
-            int currentYear = Calendar.getInstance().get(Calendar.YEAR);
-
-            // only consider events in a fixed interval. of course, this
-            // depends on the parsed data, so change it with the data!
-            long intervalBegin = 1233777600000L, intervalEnd = 1234382400000L;
-
-            // prepare for collection of unique addresses by country and total
-            // requests
-            SortedMap<Long, SortedMap<String, Map<String, int[]>>>
-                    allRequests = new TreeMap<Long, SortedMap<String,
-                    Map<String, int[]>>>();
-            SortedMap<String, Map<String, int[]>> uniqueIPs =
-                    new TreeMap<String, Map<String, int[]>>();
-            Set<String> ipsSeenSoFar = new HashSet<String>();
-            long currentInterval = intervalBegin;
-            SortedSet<String> allCountries = new TreeSet<String>();
-
-            // prepare parsing
-            Calendar c = Calendar.getInstance();
-            SimpleDateFormat timeFormat =
-                    new SimpleDateFormat("MMM dd HH:mm:ss.SSS");
-            timeFormat.setTimeZone(TimeZone.getTimeZone(timeZone));
-
-            // parse input file
-            BufferedReader br = new BufferedReader(new FileReader(
-                    inputFile));
-            String line = null;
-            long timestamp = -1L;
-            while ((line = br.readLine()) != null) {
-
-                // parse timestamp
-                Date logTime = timeFormat.parse(line.substring(0, 19),
-                        new ParsePosition(0));
-                c.setTimeInMillis(logTime.getTime());
-                c.set(Calendar.YEAR, currentYear);
-                timestamp = c.getTimeInMillis();
-
-                // if this event happened before the considered interval, move
-                // on
-                if (timestamp < intervalBegin) {
-                     continue;
-                }
-
-                // check if we should evaluate the current interval now
-                // (this approach requires that we have at least 1 event
-                // after the last considered interval, that we don't
-                // evaluate -- this is a tiny hack which fits our data,
-                // though.)
-                if (timestamp > currentInterval + uniqueIntervalLength) {
-                    // save collected data for later evaluation
-                    allRequests.put(currentInterval, uniqueIPs);
-                    // go on with next interval
-                    uniqueIPs = new TreeMap<String, Map<String, int[]>>();
-                    currentInterval += uniqueIntervalLength;
-                }
-
-                // if this event happened after the considered interval, move on
-                if (timestamp > intervalEnd) {
-                    continue;
-                }
-
-                // parse the rest
-                String[] split = line.substring(line.indexOf(
-                        "GET request by client ") + 22).split(" ");
-                String address = split[0].replace("'", "");
-
-                // look up in geoIP database
-                String country = cl.getCountry(address).getCode()
-                        .toLowerCase();
-                String url = split[3].replace("'", "");
-                url = url.substring(0, url.length() - 1);
-
-                // don't distinguish between compressed and non-compressed
-                if (url.endsWith(".z")) {
-                    url = url.substring(0, url.length() - 2);
-                }
-                //String type = "unknown";
-                int countAsVersion = -1;
-
-                // version 1 dir protocol
-                if (url.equals("/tor/")) {
-                    // full directory -- however, these requests
-                    // are probably not tor clients, but crawlers;
-                    // tor rewrites all requests for /<x> to /tor/<x>,
-                    // so that these requests could also be for /.
-                    // also, v1 tor clients would ask for /tor/dir.z,
-                    // not /tor/.
-                } else if (url.equals("/tor/dir")) {
-                    // fetch compressed full directory (rather: dir.z)
-                    //type = "v1status";
-                } else if (url.equals("/tor/running-routers")) {
-                    // fetch network-status descriptor
-                    // (do we want to count all three requests? what do
-                    // clients request typically?)
-                    //type = "v1status";
-
-                // version 2 network status
-                } else if (url.equals("/tor/status/all")) {
-                    // network-status documents from all known authorities
-                    //type = "v2status";
-                    countAsVersion = 2;
-                } else if (url.equals("/tor/status/authority")) {
-                    // network-status document by this authority
-                    //type = "v2status";
-                    countAsVersion = 2;
-                } else if (url.startsWith("/tor/status/fp/")) {
-                    // network-status document(s) by identity fingerprint
-                    // (unfortunately, we didn't preserve all fingerprints,
-                    // but only the first)
-                    //type = "v2status";
-                    countAsVersion = 2;
-
-                // version 3 network-status consensus, votes, and certificates
-                } else if (url.equals("/tor/status-vote/current/consensus")) {
-                    // current network-status consensus
-                    //type = "v3status";
-                    countAsVersion = 3;
-                } else if (url.startsWith(
-                        "/tor/status-vote/current/consensus/")) {
-                    // current network-status consensus, created by the
-                    // authorities the client trusts (unfortunately, we didn't
-                    // preserve all identities, but only the first;
-                    // otherwise, we might re-construct which versions clients
-                    // are using)
-                    //type = "v3status";
-                    countAsVersion = 3;
-
-                } else if (url.startsWith("/tor/status-vote/current/")
-                        && !url.startsWith(
-                        "/tor/status-vote/current/consensus")) {
-                    // other documents used in the v3 directory protocol in the
-                    // current voting period, e.g., votes, signatures.
-                    //type = "v3other";
-                } else if (url.startsWith("/tor/status-vote/next/")) {
-                    // documents used in the v3 directory protocol in the next
-                    // voting period, e.g., consensus, votes, signatures.
-                    //type = "v3other";
-                } else if (url.equals("/tor/keys/fp")) {
-                    // _empty_ list of key certificates? this is a bug that
-                    // DoSes the authorities! there were more than only 1 IPs
-                    // requesting this URL; is there a pattern?
-                    // are these clients (unlikely, because they wouldn't work
-                    // and be so persistent) are relays?
-                } else if (url.startsWith("/tor/keys/")
-                        && !url.equals("/tor/keys/fp")) {
-                    // list of key certificates
-                    //type = "v3other";
-
-                // router descriptors
-                } else if (url.equals("/tor/server/all")) {
-                    // all router descriptors (which versions request such a
-                    // thing?!)
-                    //type = "router";
-                } else if (url.equals("/tor/server/authority")) {
-                    // router descriptor of this relay,
-                    // mainly requested for debugging purposes and self test
-                    //type = "router";
-                } else if (url.startsWith("/tor/server/d/")) {
-                    // router descriptor by descriptor identifier
-                    //type = "router";
-                } else if (url.startsWith("/tor/server/fp/")) {
-                    // router descriptor by router identity (should be avoided)
-                    //type = "router";
-
-                // extra-info documents
-                } else if (url.equals("/tor/extra/all")) {
-                    // all extra-info documents
-                    //type = "extra";
-                } else if (url.equals("/tor/extra/authority")) {
-                    // extra-info document of this relay
-                    //type = "extra";
-                } else if (url.startsWith("/tor/extra/d/")) {
-                    // extra-info document by identifier
-                    //type = "extra";
-                } else if (url.startsWith("/tor/extra/fp/")) {
-                    // extra-info document by router identity
-                    //type = "extra";
-                }
-
-                // should this request be considered for evaluation?
-                if (countAsVersion > 0) {
-                    /*if (bridgeIPs.contains(address))
-                        bridgeRequests++;
-                    else
-                        nonBridgeRequests++;*/
-                    // consider IP address for 24 hour interval
-                    if (!accumulate || !ipsSeenSoFar.contains(address)) {
-                        ipsSeenSoFar.add(address);
-                        if (uniqueIPs.containsKey(country)) {
-                            Map<String, int[]> ips = uniqueIPs.get(country);
-                            if (ips.containsKey(address)) {
-                                int[] versions = ips.get(address);
-                                versions[countAsVersion - 2] += 1;
-                            } else {
-                                int[] versions = new int[2];
-                                versions[countAsVersion - 2] = 1;
-                                ips.put(address, versions);
-                            }
-                        } else {
-                            Map<String, int[]> ips =
-                                    new HashMap<String, int[]>();
-                            int[] versions = new int[2];
-                            versions[countAsVersion - 2] = 1;
-                            ips.put(address, versions);
-                            uniqueIPs.put(country, ips);
-                        }
-                    }
-                    allCountries.add(country);
-                }
-            }
-            // close input file
-            br.close();
-
-            File fileIPA = new File(outputDirectory.getAbsolutePath()
-                    + File.separatorChar + inputFile.getName() + "-uip.csv");
-            File fileReq = new File(outputDirectory.getAbsolutePath()
-                    + File.separatorChar + inputFile.getName() + "-req.csv");
-            BufferedWriter outIPA = new BufferedWriter(new FileWriter(
-                    fileIPA, false));
-            BufferedWriter outReq = new BufferedWriter(new FileWriter(
-                    fileReq, false));
-            StringBuilder sb = new StringBuilder();
-            for (String f : allCountries) {
-                sb.append(f + "2," + f + "3," + f + "t,");
-            }
-            outIPA.write("time," + sb.toString() + "total2,total3,total\n");
-            outReq.write("time," + sb.toString() + "total2,total3,total\n");
-            for (Map.Entry<Long, SortedMap<String, Map<String, int[]>>> hour
-                    : allRequests.entrySet()) {
-                int totalUnique2 = 0, totalUnique3 = 0, totalUniqueT = 0;
-                int totalRequests2 = 0, totalRequests3 = 0;
-                outIPA.write(hour.getKey() + ",");
-                outReq.write(hour.getKey() + ",");
-                SortedMap<String, Map<String, int[]>> req = hour.getValue();
-                for (String f : allCountries) {
-                    if (req.containsKey(f)) {
-                        int unique2 = 0, unique3 = 0, uniqueT = 0;
-                        int requests2 = 0, requests3 = 0;
-                        for (int[] vers : req.get(f).values()) {
-                            unique2 += vers[0] > 0 ? 1 : 0;
-                            unique3 += vers[1] > 0 ? 1 : 0;
-                            uniqueT++;
-                            requests2 += vers[0];
-                            requests3 += vers[1];
-                        }
-                        outIPA.write(unique2 + "," + unique3 + ","
-                                + uniqueT + ",");
-                        outReq.write(requests2 + "," + requests3 + ","
-                                + (requests2 + requests3) + ",");
-                        totalUnique2 += unique2;
-                        totalUnique3 += unique3;
-                        totalUniqueT += uniqueT;
-                        totalRequests2 += requests2;
-                        totalRequests3 += requests3;
-                    } else {
-                        outIPA.write("0,0,0,");
-                        outReq.write("0,0,0,");
-                    }
-                }
-                outIPA.write(totalUnique2 + "," + totalUnique3 + ","
-                        + totalUniqueT + "\n");
-                outReq.write(totalRequests2 + "," + totalRequests3 + ","
-                        + (totalRequests2 + totalRequests3) + "\n");
-            }
-            outIPA.close();
-            outReq.close();
-        }
-
-        System.out.println("Parsing finished after "
-            + ((System.currentTimeMillis() - started) / 1000)
-            + " seconds.");
-    }
-}
-
diff --git a/src/org/torproject/metrics/dirreq/ParseGeoipStats.java b/src/org/torproject/metrics/dirreq/ParseGeoipStats.java
new file mode 100644
index 0000000..7f69e7f
--- /dev/null
+++ b/src/org/torproject/metrics/dirreq/ParseGeoipStats.java
@@ -0,0 +1,234 @@
+/* Copyright 2009 Karsten Loesing
+ * See LICENSE for licensing information */
+package org.torproject.metrics.dirreq;
+
+import java.io.*;
+import java.text.*;
+import java.util.*;
+
+import com.maxmind.geoip.LookupService;
+
+public final class ParseGeoipStats {
+
+    private static class DataPoint {
+        String date;
+        SortedMap<String, Integer> v2Ips;
+        SortedMap<String, Integer> v3Ips;
+        SortedMap<String, Integer> v2Reqs;
+        SortedMap<String, Integer> v3Reqs;
+        int v2Share;
+        int v3Share;
+    }
+
+    private static SortedSet<String> allCountries = new TreeSet<String>();
+    private static SortedSet<String> allDates = new TreeSet<String>();
+    private static SortedMap<String, SortedMap<String, DataPoint>> allDataPoints
+            = new TreeMap<String, SortedMap<String, DataPoint>>();
+
+    private static SortedMap<String, Integer> parseCountryLine(String line) {
+        SortedMap<String, Integer> result = new TreeMap<String, Integer>();
+        if (line.length() < 2 || line.split(" ").length < 2) {
+            return result;
+        }
+        String[] countries = line.split(" ")[1].split(",");
+        for (String part : countries) {
+            String country = part.split("=")[0];
+            Integer count = Integer.parseInt(part.split("=")[1]) - 4;
+            allCountries.add(country);
+            result.put(country, count);
+        }
+        return result;
+    }
+
+    private static String estimateRequestsAndClients(int localRequests,
+            int localIpsInt, int shareAsInt) {
+        double share = ((double) shareAsInt) / 10000 * 5 / 4;
+        double totalRequests = (double) localRequests / share;
+        double totalClients = 10000.0D;
+        double localIps = (double) localIpsInt;
+        int maxIterations = 40;
+        double step = 10000.0D;
+        boolean add = true;
+        while (maxIterations-- > 0) {
+            double c = totalClients * (1.0D - Math.pow(1.0D -share,
+                    totalRequests / totalClients));
+            if (Math.abs(localIps - c) < 0.1D) {
+                break;
+            } else if (c > localIps) {
+                if (add) step /= 2.0;
+                totalClients -= step;
+            } else if (c < localIps) {
+                if (!add) step /= 2.0;
+                totalClients += step;
+            }
+        }
+        double requestsPerClient = totalRequests / totalClients;
+        return String.format("%d,%d,%.2f", (int) totalRequests,
+                (int) totalClients, requestsPerClient);
+    }
+
+    private ParseGeoipStats() {
+    }
+
+    public static void main(final String[] args) throws Exception {
+
+        // check input parameters
+        if (args.length < 2) {
+            System.err.println("Usage: java "
+                    + ParseGeoipStats.class.getSimpleName()
+                    + " <input directory> <output directory>");
+            System.exit(1);
+        }
+        File inputDirectory = new File(args[0]);
+        if (!inputDirectory.exists() || !inputDirectory.isDirectory()) {
+            System.err.println("Input directory '"
+                    + inputDirectory.getAbsolutePath()
+                    + "' does not exist or is not a directory.");
+            System.exit(1);
+        }
+        File outputDirectory = new File(args[1]);
+        if (outputDirectory.exists() && !outputDirectory.isDirectory()) {
+            System.err.println("Output directory '"
+                    + outputDirectory.getAbsolutePath()
+                    + "' exists, but is not a directory.");
+            System.exit(1);
+        }
+        outputDirectory.mkdir();
+
+        long started = System.currentTimeMillis();
+
+        // parse input files
+        for (File inputFile : inputDirectory.listFiles()) {
+            SortedMap<String, DataPoint> currentDataPoints
+                    = new TreeMap<String, DataPoint>();
+            allDataPoints.put(inputFile.getName(), currentDataPoints);
+            BufferedReader br = new BufferedReader(new FileReader(
+                    inputFile));
+            String line = null;
+            String currentDate = null;
+            DataPoint currentDataPoint = null;
+            boolean haveSeenActualNumbers = false;
+            while ((line = br.readLine()) != null) {
+                if (line.startsWith("written ")) {
+                    if (haveSeenActualNumbers) {
+                        currentDataPoints.put(currentDate, currentDataPoint);
+                    }
+                    currentDataPoint = new DataPoint();
+                    currentDate = line.split(" ")[1];
+                    allDates.add(currentDate);
+                } else if (line.startsWith("started-at ")) {
+                    // ignored
+                } else if (line.startsWith("ns-ips ")) {
+                    currentDataPoint.v3Ips = parseCountryLine(line);
+                    if (line.split(" ").length > 1) {
+                        haveSeenActualNumbers = true;
+                    }
+                } else if (line.startsWith("ns-v2-ips ")) {
+                    currentDataPoint.v2Ips = parseCountryLine(line);
+                    if (line.split(" ").length > 1) {
+                        haveSeenActualNumbers = true;
+                    }
+                } else if (line.startsWith("requests-start ")) {
+                    // ignored
+                } else if (line.startsWith("n-ns-reqs ")) {
+                    currentDataPoint.v3Reqs = parseCountryLine(line);
+                    if (line.split(" ").length > 1) {
+                        haveSeenActualNumbers = true;
+                    }
+                } else if (line.startsWith("n-v2-ns-reqs ")) {
+                    currentDataPoint.v2Reqs = parseCountryLine(line);
+                    if (line.split(" ").length > 1) {
+                        haveSeenActualNumbers = true;
+                    }
+                } else if (line.startsWith("v2-ns-share ")) {
+                    currentDataPoint.v2Share = Integer.parseInt(
+                            line.split(" ")[1].replace('.', ';')
+                            .replace('%', ';').replaceAll(";", ""));
+                } else if (line.startsWith("v3-ns-share ")) {
+                    currentDataPoint.v3Share = Integer.parseInt(
+                            line.split(" ")[1].replace('.', ';')
+                            .replace('%', ';').replaceAll(";", ""));
+                }
+            }
+            if (haveSeenActualNumbers) {
+                currentDataPoints.put(currentDate, currentDataPoint);
+            }
+            br.close();
+        }
+
+        System.out.printf("We have seen %d countries on %d days on %d "
+                + "directories.%n", allCountries.size(), allDates.size(),
+                allDataPoints.size());
+
+        for (Map.Entry<String, SortedMap<String, DataPoint>> e
+                : allDataPoints.entrySet()) {
+            String directory = e.getKey();
+            SortedMap<String, DataPoint> dataPoints = e.getValue();
+            File outFile = new File(outputDirectory.getAbsolutePath()
+                    + File.separatorChar + directory + ".csv");
+            BufferedWriter out = new BufferedWriter(new FileWriter(
+                    outFile, false));
+            out.write("time,");
+            for (String f : allCountries) {
+                out.write(String.format("ip2%s,ip3%<s,ipt%<s,"
+                        + "req2%<s,req3%<s,reqt%<s,", f));
+            }
+            out.write("ip2total,ip3total,ipttotal,"
+                    + "req2total,req3total,reqttotal,"
+                    + "v2share,v3share,"
+                    + "req2estimate,ip2estimate,reqperip2,"
+                    + "req3estimate,ip3estimate,reqperip3\n");
+            for (String date : allDates) {
+                if (!dataPoints.containsKey(date)) {
+                    out.write(date + ",");
+                    int nas = allCountries.size() * 6 + 7;
+                    for (int i = 0; i < nas; i++) {
+                        out.write("NA,");
+                    }
+                    out.write("NA\n");
+                } else {
+                    DataPoint currentDataPoint = dataPoints.get(date);
+                    out.write(date + ",");
+                    int ip2total = 0, ip3total = 0, ipttotal = 0,
+                            req2total = 0, req3total = 0, reqttotal = 0;
+                    for (String f : allCountries) {
+                        int v2Ips = currentDataPoint.v2Ips.containsKey(f)
+                                ? currentDataPoint.v2Ips.get(f) : 0;
+                        int v3Ips = currentDataPoint.v3Ips.containsKey(f)
+                                ? currentDataPoint.v3Ips.get(f) : 0;
+                        int v2Reqs = currentDataPoint.v2Reqs.containsKey(f)
+                                ? currentDataPoint.v2Reqs.get(f) : 0;
+                        int v3Reqs = currentDataPoint.v3Reqs.containsKey(f)
+                                ? currentDataPoint.v3Reqs.get(f) : 0;
+                        ip2total += v2Ips;
+                        ip3total += v3Ips;
+                        ipttotal += v2Ips + v3Ips;
+                        req2total += v2Reqs;
+                        req3total += v3Reqs;
+                        reqttotal += v2Reqs + v3Reqs;
+                        out.write(String.format("%d,%d,%d,%d,%d,%d,",
+                                v2Ips, v3Ips, v2Ips + v3Ips,
+                                v2Reqs, v3Reqs, v2Reqs + v3Reqs));
+                    }
+                    out.write(String.format("%d,%d,%d,%d,%d,%d,%d,%d",
+                            ip2total, ip3total, ipttotal,
+                            req2total, req3total, reqttotal,
+                            currentDataPoint.v2Share,
+                            currentDataPoint.v3Share));
+                    out.write(String.format(",%s",
+                            estimateRequestsAndClients(req2total, ip2total,
+                            currentDataPoint.v2Share)));
+                    out.write(String.format(",%s%n",
+                            estimateRequestsAndClients(req3total, ip3total,
+                            currentDataPoint.v3Share)));
+                }
+            }
+            out.close();
+        }
+
+        System.out.println("Parsing finished after "
+            + ((System.currentTimeMillis() - started) / 1000)
+            + " seconds.");
+    }
+}
+
-- 
1.5.6.5




More information about the tor-commits mailing list