[or-cvs] r20961: {projects} Several minor improvements to the bridge-descriptor sanitizer (projects/archives/trunk/bridge-desc-sanitizer)

kloesing at seul.org kloesing at seul.org
Tue Nov 17 15:09:11 UTC 2009


Author: kloesing
Date: 2009-11-17 10:09:11 -0500 (Tue, 17 Nov 2009)
New Revision: 20961

Modified:
   projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java
   projects/archives/trunk/bridge-desc-sanitizer/HOWTO
Log:
Several minor improvements to the bridge-descriptor sanitizer
- switch to MaxMind geoip database
- scrub nicknames from the family line
- keep unreferenced descriptors
- replace references to missing descriptors with all zeros
- tweak the resulting directory structure a bit


Modified: projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java
===================================================================
--- projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java	2009-11-16 23:48:42 UTC (rev 20960)
+++ projects/archives/trunk/bridge-desc-sanitizer/ConvertBridgeDescs.java	2009-11-17 15:09:11 UTC (rev 20961)
@@ -1,7 +1,9 @@
 import java.io.*;
 import java.util.*;
+import com.maxmind.geoip.LookupService;
 import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.codec.binary.*;
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.codec.binary.Base64;
 
 public class ConvertBridgeDescs {
 
@@ -18,31 +20,26 @@
     }
     File inDir = new File(args[0]);
     File geoipFile = new File(args[1]);
+    LookupService cl = new LookupService(geoipFile,
+        LookupService.GEOIP_MEMORY_CACHE);
+    Set<String> unresolved = new HashSet<String>();
+    unresolved.add("--");
+    unresolved.add("a1");
+    unresolved.add("a2");
+    unresolved.add("eu");
+    unresolved.add("ap");
     String year = args[2];
     String month = args[3];
     int yearInt = Integer.parseInt(year);
     int monthInt = Integer.parseInt(month);
-    File outDir = new File(args[4]);
-    if (!outDir.exists()) {
-      outDir.mkdir();
-    }
+    File outDir = new File(args[4] + File.separator
+        + "bridge-descriptors-" + year + "-" + month);
+    outDir.mkdirs();
 
     SortedSet<File> statuses = new TreeSet<File>();
     Set<File> descriptors = new HashSet<File>();
     Set<File> extrainfos = new HashSet<File>();
 
-    System.out.print("Parsing geoip.txt file... ");
-    BufferedReader r = new BufferedReader(new FileReader(geoipFile));
-    String line0 = null;
-    SortedMap<Long, String> geoipDatabase = new TreeMap<Long, String>();
-    while ((line0 = r.readLine()) != null) {
-      if (!line0.startsWith("#"))
-        geoipDatabase.put(Long.parseLong(line0.split(",")[0]),
-            line0.substring(line0.indexOf(',') + 1));
-    }
-    System.out.println("Found " + geoipDatabase.size()
-        + " entries (expected 100,000 +- 10,000).");
-
     System.out.println("Checking files in " + inDir.getAbsolutePath()
         + "...");
     Stack<File> directoriesLeftToParse = new Stack<File>();
@@ -57,10 +54,9 @@
     while (!directoriesLeftToParse.isEmpty()) {
       File directoryOrFile = directoriesLeftToParse.pop();
       String filename = directoryOrFile.getName();
-      boolean addDirectory = false;
       if (directoryOrFile.isDirectory()) {
         if (/* base directory */
-            filename.equals("in") ||
+            filename.equals(inDir.getName()) ||
             /* current month */
             filename.startsWith(currentYearAndMonth) ||
             /* last days of previous month */
@@ -69,7 +65,7 @@
             /* first days of next month */
             (filename.startsWith(nextYearAndMonth)
             && Integer.parseInt(filename.substring(19, 21)) < 6)) {
-          for (File fileInDir: directoryOrFile.listFiles()) {
+          for (File fileInDir : directoryOrFile.listFiles()) {
             directoriesLeftToParse.push(fileInDir);
           }
         }
@@ -99,7 +95,7 @@
       for (String y : hex)
         new File(outDir + File.separator + "extra-infos" + File.separator
             + x + File.separator + y).mkdirs();
-    Set<File> writtenExtrainfos = new HashSet<File>();
+    int writtenExtrainfos = 0;
     Map<String, String> extrainfoMapping = new HashMap<String, String>();
     int parsed = 0;
     for (File file : extrainfos) {
@@ -147,7 +143,7 @@
             BufferedWriter bw = new BufferedWriter(new FileWriter(out));
             bw.write(scrubbedDesc);
             bw.close();
-            writtenExtrainfos.add(out);
+            writtenExtrainfos++;
           }
         } else if (line.equals("-----BEGIN SIGNATURE-----")) {
           skipSignature = true;
@@ -160,16 +156,15 @@
       }
       br.close();
     }
-    System.out.println("\nWrote " + writtenExtrainfos.size()
+    System.out.println("\nWrote " + writtenExtrainfos
         + " extra-info descriptors.");
 
     System.out.print("Parsing server descriptors");
     for (String x : hex)
       for (String y : hex)
-        new File(outDir + File.separator + "descriptors" + File.separator
-            + x + File.separator + y).mkdirs();
-    Set<File> writtenDescriptors = new HashSet<File>();
-    Map<File, File> referencedExtraInfos = new HashMap<File, File>();
+        new File(outDir + File.separator + "server-descriptors"
+            + File.separator + x + File.separator + y).mkdirs();
+    int writtenDescriptors = 0;
     Map<String, String> descriptorMapping = new HashMap<String, String>();
     int found = 0, notfound = 0;
     parsed = 0;
@@ -189,19 +184,10 @@
           continue;
         } else if (line.startsWith("router ")) {
           original = new StringBuilder(line + "\n");
-          country = "zz";
-          String[] ipParts = line.split(" ")[2].replace('.', ' ').split(" ");
-          long ipNum = Long.parseLong(ipParts[0]) * 256L * 256L * 256L
-              + Long.parseLong(ipParts[1]) * 256L * 256L
-              + Long.parseLong(ipParts[2]) * 256L
-              + Long.parseLong(ipParts[3]);
-          long intervalStart = -1;
-          if (ipNum >= geoipDatabase.firstKey()) {
-            intervalStart = geoipDatabase.subMap(0L, ipNum).lastKey();
-            String dbContent = geoipDatabase.get(intervalStart);
-            long intervalEnd = Long.parseLong(dbContent.split(",")[0]);
-            if (ipNum <= intervalEnd)
-              country = dbContent.split(",")[1].toLowerCase();
+          country = cl.getCountry(line.split(" ")[2]).getCode().
+              toLowerCase();
+          if (unresolved.contains(country)) {
+            country = "zz";
           }
           scrubbed = new StringBuilder("router Unnamed 127.0.0.1 "
               + line.split(" ")[3] + " " + line.split(" ")[4] + " "
@@ -238,40 +224,30 @@
           }
           descriptorMapping.put(originalHash, scrubbedHash);
           if (haveExtraInfo != null) {
-            File out = new File(outDir + File.separator + "descriptors"
-                + File.separator + scrubbedHash.charAt(0) + File.separator
+            File out = new File(outDir + File.separator
+                + "server-descriptors" + File.separator
+                + scrubbedHash.charAt(0) + File.separator
                 + scrubbedHash.charAt(1) + File.separator + scrubbedHash);
             if (!out.exists()) {
               BufferedWriter bw2 = new BufferedWriter(new FileWriter(out));
               bw2.write(scrubbedDesc);
               bw2.close();
-              writtenDescriptors.add(out);
-              String extraInfoHash = haveExtraInfo.toLowerCase();
-              File extrainfoFile = new File(outDir + File.separator
-                  + "extra-infos" + File.separator
-                  + extraInfoHash.charAt(0) + File.separator
-                  + extraInfoHash.charAt(1) + File.separator
-                  + extraInfoHash);
-              if (!extrainfoFile.exists()) {
-                System.out.println("Extra-info descriptor '"
-                    + extrainfoFile + "' does not exist.");
-                System.exit(1);
-              }
-              referencedExtraInfos.put(out, extrainfoFile);
+              writtenDescriptors++;
             }
           }
         } else if (line.startsWith("opt extra-info-digest ")) {
           String originalExtraInfo = line.split(" ")[2].toLowerCase();
           if (!extrainfoMapping.containsKey(originalExtraInfo)) {
             notfound++;
+            haveExtraInfo = "0000000000000000000000000000000000000000";
           } else {
             found++;
-            original.append(line + "\n");
             haveExtraInfo = extrainfoMapping.get(originalExtraInfo).
                 toUpperCase();
-            scrubbed.append("opt extra-info-digest " + haveExtraInfo
-                + "\n");
           }
+          original.append(line + "\n");
+          scrubbed.append("opt extra-info-digest " + haveExtraInfo
+              + "\n");
         } else if (line.startsWith("reject ")
             || line.startsWith("accept ")) {
           if (!contactWritten) {
@@ -286,7 +262,6 @@
             || line.startsWith("published ")
             || line.startsWith("uptime ")
             || line.startsWith("bandwidth ")
-            || line.startsWith("uptime ")
             || line.startsWith("opt hibernating ")
             || line.equals("opt hidden-service-dir")
             || line.equals("opt caches-extra-info")) {
@@ -295,11 +270,12 @@
         } else if (line.startsWith("family ")) {
           StringBuilder familyLine = new StringBuilder("family");
           for (String s : line.substring(7).split(" ")) {
-            if (s.startsWith("$"))
+            if (s.startsWith("$")) {
               familyLine.append(" $" + DigestUtils.shaHex(Hex.decodeHex(
                   s.substring(1).toCharArray())).toUpperCase());
-            else
-              familyLine.append(" " + s);
+            } else {
+              familyLine.append(" Unnamed");
+            }
           }
           original.append(line + "\n");
           scrubbed.append(familyLine.toString() + "\n");
@@ -319,14 +295,13 @@
       }
       br.close();
     }
-    System.out.println("\nWrote " + writtenDescriptors.size()
+    System.out.println("\nWrote " + writtenDescriptors
         + " bridge descriptors. While parsing, we found that we parsed "
         + found + " extra-info identifiers before, but are missing "
         + notfound + ". (The number of missing identifiers should be "
         + "significantly smaller.)");
 
     System.out.print("Parsing network statuses");
-    Set<File> referencedDescriptors = new HashSet<File>();
     parsed = notfound = found = 0;
     for (File file : statuses) {
       if (parsed++ > statuses.size() / days) {
@@ -340,51 +315,45 @@
       BufferedReader br = new BufferedReader(new FileReader(file));
       String line = null;
       StringBuilder scrubbed = new StringBuilder();
-      boolean addSLine = false;
       while ((line = br.readLine()) != null) {
         if (line.startsWith("r ")) {
           String[] parts = line.split(" ");
           String bridgeIdentity = parts[2] + "==";
           String hexBridgeIdentity = Hex.encodeHexString(
               Base64.decodeBase64(bridgeIdentity));
-          String hashedBridgeIdentity2 = Base64.encodeBase64String(
-              DigestUtils.sha(Base64.decodeBase64(bridgeIdentity))).
-              replace("=", "");
           String hashedBridgeIdentity = Base64.encodeBase64String(
               DigestUtils.sha(Base64.decodeBase64(bridgeIdentity))).
               substring(0, 27);
           String descIdentifier = parts[3] + "==";
           String hexDescIdentifier = Hex.encodeHexString(
               Base64.decodeBase64(descIdentifier));
+          String replacementDescIdentifier = null;
           if (!descriptorMapping.containsKey(hexDescIdentifier)) {
             notfound++;
-            addSLine = false;
+            replacementDescIdentifier = "AAAAAAAAAAAAAAAAAAAAAAAAAAA";
           } else {
             found++;
             String refDesc = descriptorMapping.get(hexDescIdentifier).
                 toLowerCase();
             File descriptorFile = new File(outDir + File.separator
-                + "descriptors" + File.separator + refDesc.charAt(0)
-                + File.separator + refDesc.charAt(1) + File.separator
-                + refDesc);
+                + "server-descriptors" + File.separator
+                + refDesc.charAt(0) + File.separator + refDesc.charAt(1)
+                + File.separator + refDesc);
             if (!descriptorFile.exists()) {
               System.out.println("Descriptor file '"
                   + descriptorFile.getAbsolutePath() + "' does not exist.");
+              System.exit(1);
             }
-            String replacementDescIdentifier = Base64.encodeBase64String(
+            replacementDescIdentifier = Base64.encodeBase64String(
                 Hex.decodeHex(descriptorMapping.get(hexDescIdentifier).
                 toCharArray())).substring(0, 27);
-            scrubbed.append("r Unnamed " + hashedBridgeIdentity
-                + " " + replacementDescIdentifier + " " + parts[4] + " "
-                + parts[5] + " 127.0.0.1 " + parts[7] + " " + parts[8]
-                + "\n");
-            addSLine = true;
-            referencedDescriptors.add(descriptorFile);
           }
+          scrubbed.append("r Unnamed " + hashedBridgeIdentity
+              + " " + replacementDescIdentifier + " " + parts[4] + " "
+              + parts[5] + " 127.0.0.1 " + parts[7] + " " + parts[8]
+              + "\n");
         } else if (line.startsWith("s ")) {
-          if (addSLine) {
-            scrubbed.append(line + "\n");
-          }
+          scrubbed.append(line + "\n");
         } else {
           System.out.println("Unknown line: " + line);
           System.exit(1);
@@ -395,7 +364,6 @@
       String[] date = timeString.substring(0, 10).split("-");
       String time = timeString.substring(11, 17);
       File dir = new File(outDir + File.separator + "statuses"
-          + File.separator + date[0] + File.separator + date[1]
           + File.separator + date[2] + File.separator);
       dir.mkdirs();
       File out = new File(dir.getAbsolutePath() + File.separator + date[0]
@@ -412,38 +380,6 @@
         + notfound + ". (The number of missing identifiers should be "
         + "significantly smaller.)");
 
-    Set<File> deleteFromReferencedExtraInfos = new HashSet<File>();
-    for (File e : referencedExtraInfos.keySet()) {
-      if (!referencedDescriptors.contains(e)) {
-        deleteFromReferencedExtraInfos.add(e);
-      }
-    }
-    for (File e : deleteFromReferencedExtraInfos) {
-      referencedExtraInfos.remove(e);
-    }
-    SortedSet<File> deleteDescriptors = new TreeSet<File>();
-    for (File e : writtenDescriptors) {
-      if (!referencedDescriptors.contains(e)) {
-        deleteDescriptors.add(e);
-      }
-    }
-    SortedSet<File> deleteExtraInfos = new TreeSet<File>();
-    for (File e : writtenExtrainfos) {
-      if (!referencedExtraInfos.values().contains(e)) {
-        deleteExtraInfos.add(e);
-      }
-    }
-    System.out.println("Deleting " + deleteDescriptors.size()
-        + " unreferenced bridge descriptors and "
-        + deleteExtraInfos.size() + " extra-info descriptors (keeping "
-        + (writtenDescriptors.size() - deleteDescriptors.size())
-        + " bridge descriptors and " + (writtenExtrainfos.size()
-        - deleteExtraInfos.size()) + " extra-info descriptors).");
-    for (File e : deleteDescriptors)
-      e.delete();
-    for (File e : deleteExtraInfos)
-      e.delete();
-
     long finished = System.currentTimeMillis();
     System.out.println("Processing took " + ((finished - started) / 1000)
         + " seconds.");

Modified: projects/archives/trunk/bridge-desc-sanitizer/HOWTO
===================================================================
--- projects/archives/trunk/bridge-desc-sanitizer/HOWTO	2009-11-16 23:48:42 UTC (rev 20960)
+++ projects/archives/trunk/bridge-desc-sanitizer/HOWTO	2009-11-17 15:09:11 UTC (rev 20961)
@@ -54,7 +54,9 @@
 
    If there is contact information in a descriptor, the contact line is
    changed to "somebody at ...". If there is none, a contact line is added
-   saying "nobody at ..." in order to put in the country code.
+   saying "nobody at ..." in order to put in the country code. If the
+   bridge's IP address cannot be resolved to a country, the unassigned
+   country code "zz" is written to the contact line.
 
 5. Replace nickname with Unnamed
 
@@ -64,6 +66,14 @@
    addresses. All bridge nicknames are therefore replaced with the string
    Unnamed.
 
+6. Replace references to descriptors
+
+   Changing anything in the server descriptors or extra-info descriptors
+   invalidates the references from network statuses or server descriptors,
+   respectively. All references are replaced with the new hashes of
+   referenced descriptors, if available. In case of missing descriptors,
+   references are replaced with all zeros (or 'A's in base 64 encoding).
+
 Note that these processing steps only prevent people from learning about
 new bridge locations. People who already know a bridge identity or location
 can easily learn more about this bridge from the sanitized descriptors.
@@ -84,30 +94,39 @@
   following assumed to be commons-codec-1.4.jar) in the same directory as
   this HOWTO file.
 
+- Download the MaxMind GeoIP Java library from
+  http://geolite.maxmind.com/download/geoip/api/java/ and generate a JAR
+  file as described in the README file. Place the resulting
+  maxmindgeoip.jar in the same directory as this HOWTO file.
+
 - Copy the half-hourly snapshots named from-tonga-YYYY-MM-DDThhmmssZ.tar.gz
   in a directory called data/ in the same directory as this HOWTO file.
 
 - Run ./extract-bridges.sh to extract the half-hourly snapshots in data/
   to separate directories in the newly created subdirectory in/ .
 
-- Copy the geoip.txt from the Tor sources (from /src/config/) to the same
-  directory as this HOWTO file.
+- Put the binary MaxMind GeoIP database file that shall be used for
+  resolving IP addresses to country codes in the same directory as this
+  HOWTO file. Either the free or the commercial version of the database
+  can be used. For the archives provided by The Tor Project, the first
+  available commercial version of the subsequent month is used.
 
 - Compile the Java class using
 
-  $ javac -cp commons-codec-1.4.jar ConvertBridgeDescs.java
+  $ javac -cp commons-codec-1.4.jar:maxmindgeoip.jar \
+          ConvertBridgeDescs.java
 
 - Run the script, providing it with the parameters it needs:
 
-  java -cp .:commons-codec-1.4.jar ConvertBridgeDescs
-           <input directory> <geoip.txt file>
-           <YYYY> <MM> <output directory>
+  java -cp .:commons-codec-1.4.jar:maxmindgeoip.jar ConvertBridgeDescs
+           <input directory> <geoip database file> <YYYY> <MM>
+           <output directory>
 
   Note that YYYY and MM specify the month that shall be processed. The other
   descriptors in the input directory are ignored.
 
   A sample invocation might be:
 
-  $ java -cp .:commons-codec-1.4.jar ConvertBridgeDescs in/ geoip.txt
-        2008 10 out/
+  $ java -cp .:commons-codec-1.4.jar:maxmindgeoip.jar ConvertBridgeDescs \
+        in/ GeoIP-106_20081101.dat 2008 10 out/
 



More information about the tor-commits mailing list