[or-cvs] [ernie/master 2/3] Document server descriptor stats.

Tue Mar 9 14:10:33 UTC 2010

Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Tue, 9 Mar 2010 14:36:16 +0100
Subject: Document server descriptor stats.
Commit: fe66e8037fd344c235b81d0280ef6134564c591c

---
 config                                    |   10 +
 src/Configuration.java                    |   23 ++
 src/Main.java                             |    8 +-
 src/ServerDescriptorStatsFileHandler.java |  460 ++++++++++++++++++-----------
 4 files changed, 321 insertions(+), 180 deletions(-)

diff --git a/config b/config
index f900bf8..5c202f1 100644
--- a/config
+++ b/config
@@ -36,6 +36,16 @@
 ## Write bridge stats to disk
 #WriteBridgeStats 1
 
+## Write server descriptor stats to disk
+#WriteServerDescriptorStats 1
+
+## Comma-separated list of relay versions to be included in version-stats
+#RelayVersions 0.1.2,0.2.0,0.2.1,0.2.2
+
+## Comma-separated list of relay platforms to be included in
+## platform-stats
+#RelayPlatforms Linux,Windows,Darwin,FreeBSD
+
 ## Write directory archives to disk
 #WriteDirectoryArchives 0
 
diff --git a/src/Configuration.java b/src/Configuration.java
index cc90e6e..4ba3044 100644
--- a/src/Configuration.java
+++ b/src/Configuration.java
@@ -17,6 +17,11 @@ public class Configuration {
       Arrays.asList(("8522EB98C91496E80EC238E732594D1509158E77,"
       + "9695DFC35FFEB861329B9F1AB04C46397020CE31").split(",")));
   private boolean writeBridgeStats = true;
+  private boolean writeServerDescriptorStats = true;
+  private List<String> relayVersions = new ArrayList<String>(Arrays.asList(
+      "0.1.2,0.2.0,0.2.1,0.2.2".split(",")));
+  private List<String> relayPlatforms = new ArrayList<String>(Arrays.asList(
+      "Linux,Windows,Darwin,FreeBSD".split(",")));
   private boolean writeDirectoryArchives = false;
   private SortedSet<String> v3DirectoryAuthorities = new TreeSet<String>(
       Arrays.asList(("14C131DFC5C6F93646BE72FA1401C02A8DF2E8B4,"
@@ -74,6 +79,15 @@ public class Configuration {
         } else if (line.startsWith("WriteBridgeStats")) {
           this.writeBridgeStats = Integer.parseInt(
               line.split(" ")[1]) != 0;
+        } else if (line.startsWith("WriteServerDescriptorStats")) {
+          this.writeServerDescriptorStats = Integer.parseInt(
+              line.split(" ")[1]) != 0;
+        } else if (line.startsWith("RelayVersions")) {
+          this.relayVersions = new ArrayList<String>(
+              Arrays.asList(line.split(" ")[1].split(",")));
+        } else if (line.startsWith("RelayPlatforms")) {
+          this.relayPlatforms = new ArrayList<String>(
+              Arrays.asList(line.split(" ")[1].split(",")));
         } else if (line.startsWith("WriteDirectoryArchives")) {
           this.writeDirectoryArchives = Integer.parseInt(
               line.split(" ")[1]) != 0;
@@ -165,6 +179,15 @@ public class Configuration {
   public boolean getWriteBridgeStats() {
     return this.writeBridgeStats;
   }
+  public boolean getWriteServerDescriptorStats() {
+    return this.writeServerDescriptorStats;
+  }
+  public List<String> getRelayVersions() {
+    return this.relayVersions;
+  }
+  public List<String> getRelayPlatforms() {
+    return this.relayPlatforms;
+  }
   public boolean getWriteDirectoryArchives() {
     return this.writeDirectoryArchives;
   }
diff --git a/src/Main.java b/src/Main.java
index 34d309c..2d96396 100644
--- a/src/Main.java
+++ b/src/Main.java
@@ -34,12 +34,15 @@ public class Main {
     DirreqStatsFileHandler dsfh = config.getWriteDirreqStats() ?
         new DirreqStatsFileHandler(countries) : null;
     ServerDescriptorStatsFileHandler sdsfh =
-        new ServerDescriptorStatsFileHandler();
+        config.getWriteServerDescriptorStats() ?
+        new ServerDescriptorStatsFileHandler(config.getRelayVersions(),
+        config.getRelayPlatforms()) : null;
 
     // Prepare relay descriptor parser (only if we are writing the
     // stats)
     RelayDescriptorParser rdp = config.getWriteConsensusStats() ||
-        config.getWriteBridgeStats() || config.getWriteDirreqStats() ?
+        config.getWriteBridgeStats() || config.getWriteDirreqStats() ||
+        config.getWriteServerDescriptorStats() ?
         new RelayDescriptorParser(csfh, bsfh, dsfh, sdsfh, countries,
         directories) : null;
 
@@ -122,4 +125,3 @@ public class Main {
     logger.info("Terminating ERNIE.");
   }
 }
-
diff --git a/src/ServerDescriptorStatsFileHandler.java b/src/ServerDescriptorStatsFileHandler.java
index 37e71c3..6aa63b8 100644
--- a/src/ServerDescriptorStatsFileHandler.java
+++ b/src/ServerDescriptorStatsFileHandler.java
@@ -3,84 +3,141 @@ import java.text.*;
 import java.util.*;
 import java.util.logging.*;
 
-  /**
-   * two pieces of information: consensuses referencing N server
-   * descriptors that are combined with relay flags (like Running) and
-   * server descriptors containing information about tor
-   * versions, platforms, and advertised bandwidth. we want stats that
-   * combine information from consensuses and server descriptors. in
-   * databases this is a n:m relation with n consensus referencing m
-   * server descriptors. so, the straightforward way is to keep parse
-   * results in 2 tables and join them for extracting statistics.
-   * however, we don't want to use a database here. and even if we had
-   * a database, the table join would be too expensive to perform after
-   * adding new data every hour.
-   *
-   * the approach we take here is to de-normalize the data and write
-   * the join of consensuses and server descriptors into one file that
-   * is never kept in memory in the whole. this file has entries for
-   * every consensus line referencing a server descriptor and the
-   * information we want to use from the references server descriptor,
-   * if available. in addition to that, we need a smaller file containing
-   * unreferenced server descriptors that we were not able to write to
-   * the first file, yet. by implementing the join operation manually,
-   * we can make use of the fact that descriptors are not referenced for
-   * longer than 24 hours.
-   *
-   * stats/relay-version-stats:
-   * date,v011,v012,v020,v021,v022,other
-   *
-   * stats/relay-platform-stats:
-   * date,windows,sunos,openbsd,netbsd,linux,freebsd,dragonfly,darwin,other
-   *
-   * stats/relay-bandwidth-stats:
-   * date,q1,md,q3
-   *
-   * read largefile and merge our data in; also generate stats
-   * datetime,descriptor,version,platform,advbw
-   * 320095,aZ7mNo3lkjf2li34hlkvjsdru2,0.2.1,Darwin,1024
-   *
-   * TODO future extension: remove lines from server-descriptor-stats-raw
-   * as soon as we have written a full day (all consensuses, all SDs).
-   */
+/**
+ * Generates statistics about relays in the Tor network from data that
+ * relays write to their server descriptors. Accepts lists of referenced
+ * descriptors in network status consensuses and selected lines from
+ * server descriptors from <code>RelayDescriptorParser</code>. Keeps two
+ * intermediate results files <code>stats/consensuses-raw</code> and
+ * <code>stats/descriptors-raw</code> and writes three final results files
+ * <code>stats/version-stats</code>, <code>stats/platform-stats</code>,
+ * and <code>stats/bandwidth-stats</code>.
+ */
 public class ServerDescriptorStatsFileHandler {
 
+  /**
+   * Intermediate results file <code>stats/consensuses-raw</code>
+   * containing consensuses and the referenced descriptor identities of
+   * relays with the Running flag set. The file format is
+   * "valid-after,descid,descid,descid...\n" for each consensus. Lines are
+   * ordered by valid-after time in ascending order.
+   */
   private File consensusesFile;
+
+  /**
+   * Temporary file for writing <code>stats/consensuses-raw</code> while
+   * reading that file at the same time. After read and write operations
+   * are complete, the original file is deleted and the temporary file
+   * renamed to be the new intermediate results file.
+   */
   private File consensusesTempFile;
+
+  /**
+   * Intermediate results file <code>stats/descriptors-raw</code>
+   * containing server descriptors with relevant fields for statistics.
+   * The file format is "published,descid,version,platform,advbw\n" for
+   * each server descriptor. Lines are first ordered by published time,
+   * then by descid.
+   */
   private File descriptorsFile;
+
+  /**
+   * Temporary file for writing <code>stats/descriptors-raw</code> while
+   * reading that file at the same time. After read and write operations
+   * are complete, the original file is deleted and the temporary file
+   * renamed to be the new intermediate results file.
+   */
   private File descriptorsTempFile;
+
+  /**
+   * Final results file <code>stats/version-stats</code> containing
+   * statistics about Tor versions of relays in the network. The file
+   * format is "date,version1,version2,...,other" with versions as
+   * specified in config option RelayVersions.
+   */
   private File versionStatsFile;
+
+  /**
+   * Final results file <code>stats/platform-stats</code> containing
+   * statistics about operating systems of relays in the network. The
+   * file format is "date,os1,os2,...,other" with operating systems as
+   * specified in config option RelayPlatforms.
+   */
   private File platformStatsFile;
+
+  /**
+   * Final results file <code>stats/bandwidth-stats</code> containing
+   * statistics about the advertised bandwidth of relays in the network.
+   * The file format is "date,advbw".
+   */
   private File bandwidthStatsFile;
 
   /**
-   * map key "valid-after", map value "valid-after,descid,descid,descid.."
+   * Consensuses and referenced descriptor identities of relays with the
+   * Running flag set. This data structure only holds those consensuses
+   * that were parsed in this execution, not the previously parsed
+   * consensuses as read from disk. Map keys are valid-after times
+   * formatted as "yyyy-MM-dd HH:mm:ss", map values are valid-after times
+   * followed by a comma-separated list of base-64-formatted descriptor
+   * identifiers.
    */
   private SortedMap<String, String> consensuses;
 
   /**
-   * map key "published,descid"
-   * map value "published,descid,version,platform,bandwidth"
+   * Server descriptors with relevant fields for statistics, ordered by
+   * published time and descriptor identifier. Map keys are publication
+   * times of descriptors formatted as "yyyy-MM-dd HH:mm:ss", a comma, and
+   * base-64-formatted descriptor identifiers. An example key is
+   * "2009-09-30 20:42:19,ZQZ5zq4q1U8Uynyk6lkUy5uAsdM" (length 47). Map
+   * values are map keys plus version, platform, and advertised bandwidth
+   * written as "published,descid,version,platform,advbw". Note that the
+   * platform string may contain commas.
    */
   private SortedMap<String, String> descriptors;
 
   /**
-   * map key "descid"
-   * map value "published,descid,version,platform,bandwidth"
+   * Server descriptors as in <code>descriptors</code>, accessible by
+   * descriptor identifiers only, without knowing the publication time.
+   * Map keys are base-64-formatted descriptor identifiers, map values
+   * are formatted as map values in <code>descriptors</code>.
    */
   private SortedMap<String, String> descById;
 
+  /**
+   * Tor relay versions that we care about.
+   */
+  private List<String> relayVersions;
+
+  /**
+   * Platforms (operating systems) that we care about.
+   */
+  private List<String> relayPlatforms;
+
+  /**
+   * Logger for this class.
+   */
   private Logger logger;
 
+  // TODO should there be a modified flag, too?
+
   /**
-   * Initializes this class, including reading in results file
-   * <code>stats/relay-version-stats</code> etc. Not that we don't read in
-   * <code>stats/server-descriptors-raw</code>, because it can grow
-   * really big!
+   * Initializes this class, without reading in any files. We're only
+   * reading in files when writing results to disk in
+   * <code>writeFiles</code>.
    */
-  public ServerDescriptorStatsFileHandler() {
+  public ServerDescriptorStatsFileHandler(List<String> relayVersions,
+      List<String> relayPlatforms) {
 
-    /* init files */
+    /* Memorize versions and platforms that we care about. */
+    this.relayVersions = relayVersions;
+    this.relayPlatforms = relayPlatforms;
+
+    /* Initialize local data structures. */
+    this.consensuses = new TreeMap<String, String>();
+    this.descriptors = new TreeMap<String, String>();
+    this.descById = new TreeMap<String, String>();
+
+    /* Initialize file names for intermediate and final results files. */
     this.versionStatsFile = new File("stats/version-stats");
     this.platformStatsFile = new File("stats/platform-stats");
     this.bandwidthStatsFile = new File("stats/bandwidth-stats");
@@ -89,47 +146,59 @@ public class ServerDescriptorStatsFileHandler {
     this.descriptorsFile = new File("stats/descriptors-raw");
     this.descriptorsTempFile = new File("stats/descriptors-raw.temp");
 
-    /* Initialize local data structures. */
-    this.consensuses = new TreeMap<String, String>();
-    this.descriptors = new TreeMap<String, String>();
-    this.descById = new TreeMap<String, String>();
-
     /* Initialize logger. */
     this.logger =
         Logger.getLogger(ServerDescriptorStatsFileHandler.class.getName());
-    this.logger.fine("Initialized.");
   }
 
-  /* Just add to data structure. We cannot check whether we already got
-   * it right now. The only thing we can check is whether we got this
-   * consensus before in this run. */
+  /**
+   * Adds a consensus to the list with its valid-after time and a list of
+   * descriptor identifiers of relays that have the Running flag set. If
+   * the number of consensuses in memory exceeds a certain number, an
+   * auto-save mechanism is triggered by calling <code>writeFiles</code>.
+   */
   public void addConsensus(String validAfter,
       String descriptorIdentities) {
-    // TODO should there be a modified flag, too?
+
+    /* Add consensus to the list. */
     if (!this.consensuses.containsKey(validAfter)) {
-      this.logger.finer("Adding");
+      this.logger.finer("Adding consensus published at " + validAfter
+          + ".");
     } else {
-      this.logger.fine("We already learned about this consensus in this "
-          + "run. Overwriting.");
+      this.logger.fine("We already learned about a consensus published "
+          + "at " + validAfter + " in this execution. Overwriting.");
     }
     this.consensuses.put(validAfter, validAfter + ","
         + descriptorIdentities);
-    
-    // force autosave if we have too many data; 240 cons ^= 10 days
+
+    /* Check if we have more than 240 consensuses in memory (covering 10
+     * days). If so, trigger the auto-save mechanism. */
     if (this.consensuses.size() > 240) {
       this.logger.fine("Autosave triggered by adding consensus: We have "
-          + this.consensuses.size() + " consensuses and " + this.descriptors.size()
-          + " descriptors in memory. Writing to disk now.");
+          + this.consensuses.size() + " consensuses and "
+          + this.descriptors.size() + " descriptors in memory. Writing "
+          + "to disk now.");
       this.writeFiles();
     }
   }
 
-  // version string is the 0.2.1.23 part of the platform string
-  // platform is platform string with all parts after { removed
-  // advbw is in kibibytes
+  /**
+   * Adds a server descriptor to the list with its identity and the
+   * platform, published, and bandwidth lines. Version and operating
+   * system are parsed from the platform line. The parsed version consists
+   * only of the dotted numbers part (e.g. "0.2.1.2") without any
+   * additions like "-alpha". The operating system is the substring after
+   * " on " up to the first encountered opening curly bracket ("{").
+   * The publication time is extracted from the published line. The
+   * advertised bandwidth is calculated from the bandwidth line by taking
+   * the minimum of average and observed bandwidth, divided by 1024 to
+   * obtain KiB/s.
+   */
   public void addServerDescriptor(String descriptorIdentity,
       String platformLine, String publishedLine, String bandwidthLine) {
-    // TODO should there be a modified flag, too?
+
+    /* Parse version, platform, and advertised bandwidth from the given
+     * lines. */
     String version = "", platform = "", published = "", advBw = "";
     if (platformLine.contains(" Tor ")) {
       version = platformLine.substring(platformLine.indexOf(" Tor ") + 5).
@@ -144,44 +213,56 @@ public class ServerDescriptorStatsFileHandler {
     published = publishedLine.substring("published ".length());
     String[] bwParts = bandwidthLine.split(" ");
     if (bwParts.length == 4) {
-      advBw = "" + (Math.min(Long.parseLong(bwParts[1]),
-          Long.parseLong(bwParts[3])) / 1024L);
-      // TODO can't trust input! verify
+      try {
+        advBw = "" + (Math.min(Long.parseLong(bwParts[1]),
+            Long.parseLong(bwParts[3])) / 1024L);
+      } catch (NumberFormatException e) {
+        this.logger.log(Level.WARNING, "Exception while parsing average "
+            + "and observed bandwidth from line '" + bandwidthLine
+            + "'. Not adding server descriptor!", e);
+        return;
+      }
     }
     String key = published + "," + descriptorIdentity;
     String line = key + "," + version + "," + platform + "," + advBw;
     if (!this.descriptors.containsKey(key)) {
-      this.logger.finer("Adding");
+      this.logger.finer("Adding server descriptor with identifier "
+          + descriptorIdentity + ".");
     } else {
-      this.logger.fine("We already learned about this server descriptor "
-          + "in this run. Overwriting.");
+      this.logger.fine("We already learned about a server descriptor "
+          + "with identifier " + descriptorIdentity + ", published at "
+          + published + " in this execution. Overwriting.");
     }
     this.descriptors.put(key, line);
     this.descById.put(descriptorIdentity, line);
 
-    // force autosave if we have too many data; 50K descs ^= 10 days in early 2010
+    /* Check if we have more than 50K server descriptors in memory (covering
+     * 10 days as of early 2010). If so, trigger the auto-save mechanism. */
     if (this.descriptors.size() > 50000) {
-      this.logger.fine("Autosave triggered by adding descriptor: We have "
-          + this.consensuses.size() + " consensuses and " + this.descriptors.size()
-          + " descriptors in memory. Writing to disk now.");
+      this.logger.fine("Autosave triggered by adding server descriptor: "
+          + "We have " + this.consensuses.size() + " consensuses and "
+          + this.descriptors.size() + " descriptors in memory. Writing "
+          + "to disk now.");
       this.writeFiles();
     }
   }
 
   /**
-   * Writes the newly learned consensuses and server descriptors to disk
-   * and merges new findings about relay versions, platforms, and advertised
-   * bandwidth with existing stats files.
+   * Merges the newly learned consensuses and server descriptors with the
+   * ones we wrote to disk earlier and extracts new statistics about relay
+   * versions, platforms, and advertised bandwidth.
+   *
+   * This method is rather complex, because we can only store a limited
+   * number of consensuses and server descriptors in memory. Also, we want
+   * to avoid going through the files twice, once for merging old and new
+   * lines and another time for extracting statistics.
    */
-  /* why is this so complex? because the data doesn't fit into memory and
-   * we want to avoid going through the file more than once (that is,
-   * once for reading and once for writing) if at all possible. */
   public void writeFiles() {
 
-   // TODO use separate try blocks?
    try {
-      /* Initialize readers and writers for the two files. We are going to
-       * write to temporary files, delete originals, and rename. */
+
+      /* Initialize readers for reading intermediate results files from
+       * disk. */
       BufferedReader consensusesReader = null;
       if (this.consensusesFile.exists()) {
         consensusesReader = new BufferedReader(new FileReader(
@@ -193,48 +274,47 @@ public class ServerDescriptorStatsFileHandler {
           this.descriptorsFile));
       }
 
+      /* Prepare writing intermediate results. The idea is to write to
+       * temporary files while reading from the originals, delete the
+       * originals, and rename the temporary files to be the new
+       * originals. */
       this.consensusesTempFile.getParentFile().mkdirs();
-      BufferedWriter consensusesWriter = new BufferedWriter(new FileWriter(
-          this.consensusesTempFile));
-      BufferedWriter descriptorsWriter = new BufferedWriter(new FileWriter(
-          this.descriptorsTempFile));
-      BufferedWriter versionWriter = new BufferedWriter(new FileWriter(
-          this.versionStatsFile));
-      BufferedWriter platformWriter = new BufferedWriter(new FileWriter(
-          this.platformStatsFile));
-      BufferedWriter bandwidthWriter = new BufferedWriter(new FileWriter(
-          this.bandwidthStatsFile));
+      BufferedWriter consensusesWriter = new BufferedWriter(
+          new FileWriter(this.consensusesTempFile));
+      BufferedWriter descriptorsWriter = new BufferedWriter(
+          new FileWriter(this.descriptorsTempFile));
 
+      /* Prepare date format parsers. */
       SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
       dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
       SimpleDateFormat dateTimeFormat =
           new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
       dateTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
 
+      /* Prepare extracting statistics and writing them to disk. */
       String statsDate = null;
-      // TODO make these configurable
-      List<String> versionKeys = new ArrayList<String>(Arrays.asList(
-          "0.1.1,0.1.2,0.2.0,0.2.1,0.2.2".split(",")));
-      List<String> platformKeys = new ArrayList<String>(Arrays.asList(
-          "Windows,SunOS,OpenBSD,NetBSD,Linux,FreeBSD,DragonFly,Darwin".
-          split(",")));
+      int[] versionStats = new int[this.relayVersions.size() + 1];
+      int[] platformStats = new int[this.relayPlatforms.size() + 1];
+      long bandwidthStats = 0L;
+      int consensusesAtThisDay = 0;
+      BufferedWriter versionWriter = new BufferedWriter(new FileWriter(
+          this.versionStatsFile));
+      BufferedWriter platformWriter = new BufferedWriter(new FileWriter(
+          this.platformStatsFile));
+      BufferedWriter bandwidthWriter = new BufferedWriter(new FileWriter(
+          this.bandwidthStatsFile));
       versionWriter.write("date");
-      for (String v : versionKeys) {
+      for (String v : this.relayVersions) {
         versionWriter.write("," + v);
       }
       versionWriter.write(",other\n");
       platformWriter.write("date");
-      for (String p : platformKeys) {
-        platformWriter.write("," + p.toLowerCase());
+      for (String p : this.relayPlatforms) {
+        platformWriter.write("," + p);
       }
       platformWriter.write(",other\n");
       bandwidthWriter.write("date,advbw\n");
 
-      int[] versionStats = new int[versionKeys.size() + 1];
-      int[] platformStats = new int[platformKeys.size() + 1];
-      long bandwidthStats = 0L;
-      int consensusesAtThisDay = 0;
-
       /* Always keep one line of the consensuses and descriptors file in
        * memory. */
       String consensusLine = consensusesReader != null ?
@@ -249,24 +329,25 @@ public class ServerDescriptorStatsFileHandler {
 
         /* Find out which line we want to process now, memorize it for
          * parsing below, advance the source from where we got the line,
-         * and write the line to disk. Afterwards, line contains
+         * and write the line to disk. Afterwards, variable line contains
          * the consensus line we want to parse in this iteration. */
-        String line = null; // TODO rename
+        String line = null;
         if (consensusLine != null) {
           if (!this.consensuses.isEmpty()) {
-            String fileVA = consensusLine.split(",")[0];
-            String memVA = this.consensuses.firstKey();
-            if (fileVA.equals(memVA)) {
-              this.logger.finer("We have a consensus line in memory that "
-                  + "we already knew before. Skipping.");
-              // TODO should we compare the two lines here?
+            String fileKey = consensusLine.split(",")[0];
+            String memKey = this.consensuses.firstKey();
+            if (fileKey.equals(memKey)) {
+              this.logger.finer("The consensus we read from disk has the "
+                  + "same valid-after time (" + fileKey + ") time as a "
+                  + "consensus we have in memory. Using the consensus "
+                  + "from memory.");
               consensusLine = consensusesReader.readLine();
-              continue; // TODO is this correct?
-            } else if (fileVA.compareTo(memVA) < 0) {
-              line = consensusLine; // TODO rename
+              continue;
+            } else if (fileKey.compareTo(memKey) < 0) {
+              line = consensusLine;
               consensusLine = consensusesReader.readLine();
             } else {
-              line = this.consensuses.remove(memVA);
+              line = this.consensuses.remove(memKey);
             }
           } else {
             line = consensusLine;
@@ -277,8 +358,11 @@ public class ServerDescriptorStatsFileHandler {
         }
         consensusesWriter.write(line + "\n");
 
-        /* Write all descriptor to disk that were published more than 24
-         * hours before this consensus. */
+        /* Write all server descriptors to disk that were published more
+         * than 24 hours before the consensus we're about to process. Also
+         * remove those server descriptors from memory. The idea is that
+         * those server descriptors cannot be referenced from the
+         * consensus anyway and would only bloat our memory. */
         String minus24h = dateTimeFormat.format(new Date(
             dateTimeFormat.parse(line.split(",")[0]).getTime() -
             (24L * 60L * 60L * 1000L)));
@@ -289,18 +373,24 @@ public class ServerDescriptorStatsFileHandler {
               compareTo(minus24h) < 0)) {
           if (descriptorLine != null) {
             if (!this.descriptors.isEmpty()) {
-              String filePubl = descriptorLine.substring(0, 47);
-              // 47 chars: 19 for datetime, 1 for comma, 27 for descid
-              String memPubl = this.descriptors.firstKey();
-              if (filePubl.equals(memPubl)) {
-                this.logger.finer("same desc. skipping.");
+              /* The first 47 chars contain the publication time (19
+               * chars), a comma (1 char), and the descriptor identifier
+               * (27 chars). */
+              String fileKey = descriptorLine.substring(0, 47);
+              String memKey = this.descriptors.firstKey();
+              if (fileKey.equals(memKey)) {
+                this.logger.finer("The server descriptor we read from "
+                    + "disk has the same publication time and identifier "
+                    + "(" + fileKey + ") as a server descriptor we have "
+                    + "in memory. Using the server descriptor from "
+                    + "memory.");
                 descriptorLine = descriptorsReader.readLine();
-                continue; // TODO is this correct?
-              } else if (filePubl.compareTo(memPubl) < 0) {
+                continue;
+              } else if (fileKey.compareTo(memKey) < 0) {
                 descriptorsWriter.write(descriptorLine + "\n");
                 descriptorLine = descriptorsReader.readLine();
               } else {
-                String removed = this.descriptors.remove(memPubl);
+                String removed = this.descriptors.remove(memKey);
                 this.descById.remove(removed.split(",")[1]);
                 descriptorsWriter.write(removed + "\n");
               }
@@ -316,8 +406,12 @@ public class ServerDescriptorStatsFileHandler {
           }
         }
 
-        /* Read in all descriptors that were published in the last 24
-         * hours before the consensus that we're just parsing. */
+        /* Read in all server descriptors that were published in the last
+         * 24 hours before the consensus that we're just processing. These
+         * server descriptors might be referenced from the consensus.
+         * Store references to these server descriptors by identifier to
+         * facilitate matching a consensus entry with the corresponding
+         * server descriptor. */
         String validAfter = line.split(",")[0];
         while (descriptorsReader != null && descriptorLine != null &&
             descriptorLine.split(",")[0].compareTo(validAfter) < 0) {
@@ -328,15 +422,15 @@ public class ServerDescriptorStatsFileHandler {
         }
 
         /* Now we have a consensus line we want to parse and all possibly
-         * referenced descriptors in descById (rename). Let's write some
-         * stats. */
+         * referenced descriptors in descById. Let's write some stats. */
         String consensusDate = line.substring(0, 10);
         if (statsDate == null) {
           statsDate = consensusDate;
         }
         if (!statsDate.equals(consensusDate)) {
-          /* If we have parsed at least half of the consensuses of a day,
-           * Write stats to disk. */ // TODO document this somewhere
+          /* We have finished one day of consensuses. If we have parsed at
+           * least half of the possible 24 consensuses of that day, write
+           * stats to disk. */
           if (consensusesAtThisDay >= 12) {
             versionWriter.write(statsDate);
             for (int i = 0; i < versionStats.length; i++) {
@@ -355,12 +449,11 @@ public class ServerDescriptorStatsFileHandler {
           } else {
             this.logger.fine("Not enough consensuses to write to stats.");
           }
-          versionStats = new int[versionKeys.size() + 1];
-          platformStats = new int[platformKeys.size() + 1];
-          bandwidthStats = 0L;
-          consensusesAtThisDay = 0;
-          // fill in NA's for missing dates
+          /* Fill in NA's for missing dates. */
           long writtenMillis = dateFormat.parse(statsDate).getTime();
+          if (consensusesAtThisDay < 12) {
+            writtenMillis -= 24L * 60L * 60L * 1000L;
+          }
           long nextMillis = dateFormat.parse(consensusDate).getTime();
           while (writtenMillis + (24L * 60L * 60L * 1000L) < nextMillis) {
             writtenMillis += 24L * 60L * 60L * 1000L;
@@ -377,15 +470,20 @@ public class ServerDescriptorStatsFileHandler {
             platformWriter.write(",NA\n");
             bandwidthWriter.write(date + ",NA\n");
           }
-          
+          /* Clear counters to collect next day's statistics. */
+          versionStats = new int[this.relayVersions.size() + 1];
+          platformStats = new int[this.relayPlatforms.size() + 1];
+          bandwidthStats = 0L;
+          consensusesAtThisDay = 0;
           statsDate = consensusDate;
         }
 
-        /* Parse all descriptors that are referenced from this consensus.
-         * only add values if we have 90+ % of all ref. descriptors!!
-         * TODO document this somewhere! */
-        int[] versionStatsCons = new int[versionKeys.size() + 1];
-        int[] platformStatsCons = new int[platformKeys.size() + 1];
+        /* For the given consensus, parse all referenced server
+         * descriptors to obtain statistics on versions, platforms, and
+         * advertised bandwidth. Only include these values if we have at
+         * least 90 % of all referenced server descriptors. */
+        int[] versionStatsCons = new int[this.relayVersions.size() + 1];
+        int[] platformStatsCons = new int[this.relayPlatforms.size() + 1];
         long bandwidthStatsCons = 0L;
         String[] ids = line.split(",");
         int seenDescs = 0;
@@ -396,20 +494,16 @@ public class ServerDescriptorStatsFileHandler {
             String[] parts = desc.split(",");
             String version = parts[2].substring(0,
                 parts[2].lastIndexOf("."));
-            if (versionKeys.contains(version)) {
-              versionStatsCons[versionKeys.indexOf(version)]++;
+            if (this.relayVersions.contains(version)) {
+              versionStatsCons[this.relayVersions.indexOf(version)]++;
             } else {
               versionStatsCons[versionStatsCons.length - 1]++;
             }
             String platform = parts[3].toLowerCase();
             boolean isOther = true;
-            // TODO document that order of platform strings in config
-            // matters! if there are two OS, "DragonFly" and "Dragon",
-            // put "DragonFly" first! capitalization doesn't matter, but
-            // is only relevant for stats file headers
-            for (String p : platformKeys) {
+            for (String p : this.relayPlatforms) {
               if (platform.contains(p.toLowerCase())) {
-                platformStatsCons[platformKeys.indexOf(p)]++;
+                platformStatsCons[this.relayPlatforms.indexOf(p)]++;
                 isOther = false;
                 break;
               }
@@ -431,37 +525,46 @@ public class ServerDescriptorStatsFileHandler {
           bandwidthStats += bandwidthStatsCons;
           consensusesAtThisDay++;
         } else {
-          this.logger.fine("not enough server descriptors for consensus, "
-              + "less than 90%. not including in stats.");
+          this.logger.fine("Not enough referenced server descriptors for "
+              + "consensus with valid-after time " + line.substring(0, 19)
+              + ". Not including this consensus in the statistics.");
         }
 
         /* We're done reading one consensus. */
       }
 
-      /* Write remaining server descriptors to disk. */
+      /* We're done reading all consensuses, both from disk and from
+       * memory. Write remaining server descriptors to disk. These are the
+       * server descriptors that were published 24 hours before the last
+       * parsed consensus and those server descriptors published
+       * afterwards. */
       while (descriptorLine != null || !this.descriptors.isEmpty()) {
         if (descriptorLine != null) {
           if (!this.descriptors.isEmpty()) {
-            String filePubl = descriptorLine.substring(0, 47);
-            // 47 chars: 19 for datetime, 1 for comma, 27 for descid
-            String memPubl = this.descriptors.firstKey();
-            if (filePubl.equals(memPubl)) {
-              this.logger.finer("same desc. skipping.");
+            String fileKey = descriptorLine.substring(0, 47);
+            String memKey = this.descriptors.firstKey();
+            if (fileKey.equals(memKey)) {
+              this.logger.finer("The server descriptor we read from "
+                    + "disk has the same publication time and identifier "
+                    + "(" + fileKey + ") as a server descriptor we have "
+                    + "in memory. Using the server descriptor from "
+                    + "memory.");
               descriptorLine = descriptorsReader.readLine();
-              continue; // TODO is this correct?
-            } else if (filePubl.compareTo(memPubl) < 0) {
+              continue;
+            } else if (fileKey.compareTo(memKey) < 0) {
               descriptorsWriter.write(descriptorLine + "\n");
               descriptorLine = descriptorsReader.readLine();
             } else {
-              descriptorsWriter.write(this.descriptors.remove(memPubl) + "\n");
+              descriptorsWriter.write(this.descriptors.remove(memKey)
+                  + "\n");
             }
           } else {
             descriptorsWriter.write(descriptorLine + "\n");
             descriptorLine = descriptorsReader.readLine();
           }
         } else {
-          descriptorsWriter.write(this.descriptors.remove(this.descriptors.firstKey())
-              + "\n");
+          descriptorsWriter.write(this.descriptors.remove(
+              this.descriptors.firstKey()) + "\n");
         }
       }
       this.descById.clear();
@@ -478,6 +581,9 @@ public class ServerDescriptorStatsFileHandler {
       bandwidthWriter.close();
       versionWriter.close();
       platformWriter.close();
+
+      /* Delete original files and rename temporary files to be the new
+       * originals. */
       if (this.consensusesFile.exists()) {
         this.consensusesFile.delete();
       }
@@ -488,10 +594,10 @@ public class ServerDescriptorStatsFileHandler {
       this.descriptorsTempFile.renameTo(this.descriptorsFile);
 
       /* Done. Whee! */
+      this.logger.fine("Finished writing.");
+
     } catch (Exception e) {
       this.logger.log(Level.WARNING, "Exception while writing files.", e);
     }
-    this.logger.fine("Finished writing.");
   }
 }
-
-- 
1.6.5