[tor-commits] [metrics-lib/master] Add descriptor source to fetch descriptors from CollecTor.

karsten at torproject.org karsten at torproject.org
Thu May 28 08:12:58 UTC 2015


commit e3d381f4c12eb61cc1d6491f31f1ac250602b3d9
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu May 21 17:13:17 2015 +0200

    Add descriptor source to fetch descriptors from CollecTor.
    
    Includes some really good suggestions from iwakeh.
    
    Implements #16151.
---
 .../torproject/descriptor/DescriptorCollector.java |   34 +++
 .../descriptor/DescriptorSourceFactory.java        |   12 +
 .../descriptor/impl/DescriptorCollectorImpl.java   |  240 ++++++++++++++++++++
 .../torproject/descriptor/impl/ParseHelper.java    |    2 +-
 .../impl/DescriptorCollectorImplTest.java          |  120 ++++++++++
 5 files changed, 407 insertions(+), 1 deletion(-)

diff --git a/src/org/torproject/descriptor/DescriptorCollector.java b/src/org/torproject/descriptor/DescriptorCollector.java
new file mode 100644
index 0000000..bd29fb0
--- /dev/null
+++ b/src/org/torproject/descriptor/DescriptorCollector.java
@@ -0,0 +1,34 @@
+/* Copyright 2015 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.descriptor;
+
+import java.io.File;
+
+/** Fetch descriptors from the CollecTor service available at
+ * https://collector.torproject.org/ and store them to a local
+ * directory. */
+public interface DescriptorCollector {
+
+  /**
+   * Fetch remote files from a CollecTor instance that do not yet exist
+   * locally (or are newer remotely) and possibly delete local files
+   * that do not exist remotely anymore.
+   *
+   * @param collecTorBaseUrl CollecTor base URL without trailing slash,
+   * e.g., "https://collector.torproject.org".
+   * @param remoteDirectories Remote directories to collect descriptors
+   * from, e.g., "/recent/relay-descriptors/server-descriptors/".  Only
+   * files in these directories will be collected, no files in their
+   * sub directories.
+   * @param minLastModified Minimum last-modified time, in milliseconds
+   * since the epoch, of files to be collected.  Set to 0 for all files.
+   * @param localDirectory Directory where collected files will be
+   * written.
+   * @param deleteExtraneousLocalFiles Whether to delete local files in
+   * the given directories that do not exist remotely anymore.
+   */
+  public void collectRemoteFiles(String collecTorBaseUrl,
+      String[] remoteDirectories, long minLastModified,
+      File localDirectory, boolean deleteExtraneousLocalFiles);
+}
+
diff --git a/src/org/torproject/descriptor/DescriptorSourceFactory.java b/src/org/torproject/descriptor/DescriptorSourceFactory.java
index 9bfd81f..49fcdc6 100644
--- a/src/org/torproject/descriptor/DescriptorSourceFactory.java
+++ b/src/org/torproject/descriptor/DescriptorSourceFactory.java
@@ -12,11 +12,14 @@ public final class DescriptorSourceFactory {
       "org.torproject.descriptor.impl.DescriptorParserImpl";
   public final static String READER_DEFAULT =
       "org.torproject.descriptor.impl.DescriptorReaderImpl";
+  public final static String COLLECTOR_DEFAULT =
+      "org.torproject.descriptor.impl.DescriptorCollectorImpl";
 
   /* property names */
   public final static String PARSER_PROPERTY = "onionoo.parser";
   public final static String READER_PROPERTY = "onionoo.property";
   public final static String LOADER_PROPERTY = "onionoo.downloader";
+  public final static String COLLECTOR_PROPERTY = "onionoo.collector";
 
   /**
    * Create a descriptor parser.
@@ -39,6 +42,13 @@ public final class DescriptorSourceFactory {
     return (DescriptorDownloader) retrieve(LOADER_PROPERTY);
   }
 
+  /**
+   * Create a descriptor collector (class chosen via COLLECTOR_PROPERTY).
+   */
+  public final static DescriptorCollector createDescriptorCollector() {
+    return (DescriptorCollector) retrieve(COLLECTOR_PROPERTY); // falls back to COLLECTOR_DEFAULT
+  }
+
   private final static <T> Object retrieve(String type) {
     Object object;
     String clazzName = null;
@@ -49,6 +59,8 @@ public final class DescriptorSourceFactory {
         clazzName = System.getProperty(type, LOADER_DEFAULT);
       } else if (READER_PROPERTY.equals(type)) {
         clazzName = System.getProperty(type, READER_DEFAULT);
+      } else if (COLLECTOR_PROPERTY.equals(type)) {
+        clazzName = System.getProperty(type, COLLECTOR_DEFAULT);
       }
       object = ClassLoader.getSystemClassLoader().loadClass(clazzName).
           newInstance();
diff --git a/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java b/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java
new file mode 100644
index 0000000..ed88906
--- /dev/null
+++ b/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java
@@ -0,0 +1,240 @@
+/* Copyright 2015 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.descriptor.impl;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.text.DateFormat;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.Stack;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+
+import org.torproject.descriptor.DescriptorCollector;
+
+public class DescriptorCollectorImpl implements DescriptorCollector {
+
+  @Override // stat local files, fetch and parse listings, download, then optionally delete
+  public void collectRemoteFiles(String collecTorBaseUrl,
+      String[] remoteDirectories, long minLastModified,
+      File localDirectory, boolean deleteExtraneousLocalFiles) {
+    collecTorBaseUrl = collecTorBaseUrl.endsWith("/") // tolerate one trailing slash
+        ? collecTorBaseUrl.substring(0, collecTorBaseUrl.length() - 1)
+        : collecTorBaseUrl;
+    if (minLastModified < 0) { // 0 means "all files"; negative values are rejected
+      throw new IllegalArgumentException("A negative minimum "
+          + "last-modified time is not permitted.");
+    }
+    if (localDirectory.exists() && !localDirectory.isDirectory()) { // a missing directory is fine
+      throw new IllegalArgumentException("Local directory already exists "
+          + "and is not a directory.");
+    }
+    SortedMap<String, Long> localFiles = // relative local path -> last-modified millis
+        this.statLocalDirectory(localDirectory);
+    SortedMap<String, String> fetchedDirectoryListings = // remote directory -> raw listing text
+        this.fetchRemoteDirectories(collecTorBaseUrl, remoteDirectories);
+    SortedSet<String> parsedDirectories = new TreeSet<String>(); // directories whose listing parsed
+    SortedMap<String, Long> remoteFiles = new TreeMap<String, Long>(); // remote path -> last-modified millis
+    for (Map.Entry<String, String> e :
+        fetchedDirectoryListings.entrySet()) {
+      String remoteDirectory = e.getKey();
+      String directoryListing = e.getValue();
+      SortedMap<String, Long> parsedRemoteFiles =
+          this.parseDirectoryListing(remoteDirectory, directoryListing);
+      if (parsedRemoteFiles == null) { // null signals a date that failed to parse
+        continue; // skip the whole directory, so nothing in it is fetched or deleted
+      }
+      parsedDirectories.add(remoteDirectory);
+      remoteFiles.putAll(parsedRemoteFiles);
+    }
+    this.fetchRemoteFiles(collecTorBaseUrl, remoteFiles, minLastModified,
+        localDirectory, localFiles);
+    if (deleteExtraneousLocalFiles) { // deletion runs after downloads and uses the pre-download local snapshot
+      this.deleteExtraneousLocalFiles(parsedDirectories, remoteFiles,
+          localDirectory, localFiles);
+    }
+  }
+
+  SortedMap<String, Long> statLocalDirectory( // maps each file below localDirectory to its last-modified time
+      File localDirectory) {
+    SortedMap<String, Long> localFiles = new TreeMap<String, Long>();
+    if (!localDirectory.exists()) { // nothing on disk yet: return the empty map
+      return localFiles;
+    }
+    Stack<File> files = new Stack<File>(); // work list for an iterative walk, no recursion
+    files.add(localDirectory);
+    while (!files.isEmpty()) {
+      File file = files.pop();
+      if (file.isDirectory()) {
+        files.addAll(Arrays.asList(file.listFiles())); // NOTE(review): listFiles() can return null on I/O error, which would throw here - confirm
+      } else {
+        String localPath = file.getPath().substring( // key is the path with the localDirectory prefix removed
+            localDirectory.getPath().length()); // NOTE(review): uses the platform separator; remote keys use "/" - confirm on Windows
+        localFiles.put(localPath, file.lastModified());
+      }
+    }
+    return localFiles;
+  }
+
+  SortedMap<String, String> fetchRemoteDirectories( // returns normalized directory path -> listing text
+      String collecTorBaseUrl, String[] remoteDirectories) {
+    SortedMap<String, String> fetchedDirectoryListings =
+        new TreeMap<String, String>();
+    for (String remoteDirectory : remoteDirectories) {
+      String remoteDirectoryWithSlashAtBeginAndEnd = // add a leading and a trailing slash if missing
+          (remoteDirectory.startsWith("/") ? "" : "/") + remoteDirectory
+          + (remoteDirectory.endsWith("/") ? "" : "/");
+      String directoryUrl = collecTorBaseUrl
+          + remoteDirectoryWithSlashAtBeginAndEnd;
+      String directoryListing = this.fetchRemoteDirectory(directoryUrl);
+      if (directoryListing.length() > 0) { // an empty listing is left out of the result
+        fetchedDirectoryListings.put(
+            remoteDirectoryWithSlashAtBeginAndEnd, directoryListing);
+      }
+    }
+    return fetchedDirectoryListings;
+  }
+
+  String fetchRemoteDirectory(String url) { // returns the response body as text, or "" on IOException or non-200
+    StringBuilder sb = new StringBuilder();
+    try {
+      URL u = new URL(url);
+      HttpURLConnection huc = (HttpURLConnection) u.openConnection(); // NOTE(review): cast assumes an http(s) URL - confirm
+      huc.setRequestMethod("GET");
+      huc.connect();
+      int responseCode = huc.getResponseCode();
+      if (responseCode == 200) { // any other status leaves sb empty
+        BufferedReader br = new BufferedReader(new InputStreamReader( // NOTE(review): decodes with the platform default charset
+            huc.getInputStream()));
+        String line;
+        while ((line = br.readLine()) != null) {
+          sb.append(line + "\n"); // every line is re-terminated with a single newline
+        }
+        br.close(); // NOTE(review): not reached if readLine() throws, so the reader stays open
+      }
+    } catch (IOException e) {
+      e.printStackTrace(); // the failure is only printed; the caller just sees an empty listing
+      return "";
+    }
+    return sb.toString();
+  }
+
+  final Pattern DIRECTORY_LISTING_LINE_PATTERN = // matches one listing row; NOTE(review): instance field, compiled per instance
+      Pattern.compile(".* href=\"([^\"/]+)\"" /* group 1: filename, must not contain a slash */
+      + ".*>(\\d{2}-\\w{3}-\\d{4} \\d{2}:\\d{2})\\s*<.*"); /* group 2: dateTime */
+
+  SortedMap<String, Long> parseDirectoryListing( // returns remote path -> last-modified millis, or null on a bad date
+      String remoteDirectory, String directoryListing) {
+    SortedMap<String, Long> remoteFiles = new TreeMap<String, Long>();
+    DateFormat dateTimeFormat = ParseHelper.getDateFormat( // presumably UTC: the test expectations imply it - confirm in ParseHelper
+        "dd-MMM-yyyy HH:mm");
+    try {
+      Scanner s = new Scanner(directoryListing);
+      s.useDelimiter("\n"); // process the listing one line at a time
+      while (s.hasNext()) {
+        String line = s.next();
+        Matcher matcher = DIRECTORY_LISTING_LINE_PATTERN.matcher(line);
+        if (matcher.matches()) { // lines that do not match are silently skipped
+          String filename = matcher.group(1);
+          long lastModifiedMillis = dateTimeFormat.parse(
+              matcher.group(2)).getTime();
+          remoteFiles.put(remoteDirectory + filename, lastModifiedMillis); // a later row for the same file overwrites an earlier one
+        }
+      }
+      s.close();
+    } catch (ParseException e) { // NOTE(review): the Scanner is not closed on this path
+      e.printStackTrace();
+      return null; // one unparseable date discards the whole listing
+    }
+    return remoteFiles;
+  }
+
+  void fetchRemoteFiles(String collecTorBaseUrl, // downloads remote files that are recent enough and missing or outdated locally
+      SortedMap<String, Long> remoteFiles, long minLastModified,
+      File localDirectory, SortedMap<String, Long> localFiles) {
+    for (Map.Entry<String, Long> e : remoteFiles.entrySet()) {
+      String filename = e.getKey(); // despite the name, this is the key of remoteFiles, used as both URL suffix and local path
+      long lastModifiedMillis = e.getValue();
+      if (lastModifiedMillis < minLastModified || // skip: older than the requested minimum
+          (localFiles.containsKey(filename) &&
+          localFiles.get(filename) >= lastModifiedMillis)) { // skip: local copy is at least as new
+        continue;
+      }
+      String url = collecTorBaseUrl + filename; // plain concatenation, no separator is inserted
+      File destinationFile = new File(localDirectory.getPath()
+          + filename);
+      this.fetchRemoteFile(url, destinationFile, lastModifiedMillis);
+    }
+  }
+
+  void fetchRemoteFile(String url, File destinationFile, // downloads url to destinationFile via a temporary file
+      long lastModifiedMillis) {
+    try {
+      File destinationDirectory = destinationFile.getParentFile();
+      destinationDirectory.mkdirs(); // create missing parent directories; the result is not checked
+      File tempDestinationFile = new File(destinationDirectory, "." // temp file is the final name prefixed with a dot
+          + destinationFile.getName());
+      FileOutputStream fos = new FileOutputStream(tempDestinationFile); // NOTE(review): opened before the request; left open, with the temp file behind, on non-200 or exception
+      URL u = new URL(url);
+      HttpURLConnection huc = (HttpURLConnection) u.openConnection();
+      huc.setRequestMethod("GET");
+      if (!url.endsWith(".xz")) { // presumably because .xz files are already compressed - confirm
+        huc.addRequestProperty("Accept-Encoding", "gzip");
+      }
+      huc.connect();
+      int responseCode = huc.getResponseCode();
+      if (responseCode == 200) { // other status codes are ignored without any message
+        InputStream is;
+        if (huc.getContentEncoding() != null &&
+            huc.getContentEncoding().equalsIgnoreCase("gzip")) {
+          is = new GZIPInputStream(huc.getInputStream()); // decompress gzip-encoded responses while copying
+        } else {
+          is = huc.getInputStream();
+        }
+        BufferedInputStream bis = new BufferedInputStream(is);
+        int len;
+        byte[] data = new byte[1024];
+        while ((len = bis.read(data, 0, 1024)) >= 0) { // copy until read() reports end of stream
+          fos.write(data, 0, len);
+        }
+        bis.close();
+        fos.close();
+        tempDestinationFile.renameTo(destinationFile); // NOTE(review): the boolean result of renameTo() is ignored
+        destinationFile.setLastModified(lastModifiedMillis); // store the listed remote timestamp on the local file
+      }
+    } catch (IOException e) {
+      e.printStackTrace(); // the failure is only printed; the caller is not informed
+    }
+  }
+
+  void deleteExtraneousLocalFiles( // deletes local files under parsed directories that are absent from remoteFiles
+      SortedSet<String> parsedDirectories,
+      SortedMap<String, Long> remoteFiles, File localDirectory,
+      SortedMap<String, Long> localFiles) {
+    for (String localPath : localFiles.keySet()) {
+      for (String remoteDirectory : parsedDirectories) {
+        if (localPath.startsWith(remoteDirectory)) { // NOTE(review): prefix match also covers files in sub directories - confirm intended
+          if (!remoteFiles.containsKey(localPath)) { // not listed remotely, so it is treated as extraneous
+            new File(localDirectory.getPath() + localPath).delete(); // the result of delete() is not checked
+          }
+        }
+      }
+    }
+  }
+}
+
diff --git a/src/org/torproject/descriptor/impl/ParseHelper.java b/src/org/torproject/descriptor/impl/ParseHelper.java
index 226bf80..09534c7 100644
--- a/src/org/torproject/descriptor/impl/ParseHelper.java
+++ b/src/org/torproject/descriptor/impl/ParseHelper.java
@@ -153,7 +153,7 @@ public class ParseHelper {
       super.set(value);
     }
   };
-  private static DateFormat getDateFormat(String format) {
+  static DateFormat getDateFormat(String format) {
     Map<String, DateFormat> threadDateFormats = dateFormats.get();
     if (!threadDateFormats.containsKey(format)) {
       DateFormat dateFormat = new SimpleDateFormat(format);
diff --git a/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java b/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java
new file mode 100644
index 0000000..2715f12
--- /dev/null
+++ b/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java
@@ -0,0 +1,120 @@
+/* Copyright 2015 The Tor Project
+ * See LICENSE for licensing information */
+package org.torproject.descriptor.impl;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+import java.util.SortedMap;
+
+import org.junit.Test;
+
+public class DescriptorCollectorImplTest {
+
+  private static final String REMOTE_DIRECTORY_CONSENSUSES = // remote directory passed to the parser in every test
+      "/recent/relay-descriptors/consensuses/";
+
+  @Test() // a single well-formed listing row yields exactly one entry
+  public void testOneFile() {
+    String remoteFilename = "2015-05-24-12-00-00-consensus";
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[   ]\"></td><td>"
+        + "<a href=\"" + remoteFilename + "\">"
+        + "2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">24-May-2015 12:08  </td>"
+        + "<td align=\"right\">1.5M</td><td> </td></tr>";
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertSame(1, remoteFiles.size()); // NOTE(review): assertSame on boxed ints relies on Integer caching; assertEquals would be clearer
+    assertEquals(REMOTE_DIRECTORY_CONSENSUSES + remoteFilename, // key is the directory plus the file name
+        remoteFiles.firstKey());
+    assertEquals((Long) 1432469280000L, // 2015-05-24 12:08:00 UTC in epoch milliseconds
+        remoteFiles.get(remoteFiles.firstKey()));
+  }
+
+  @Test() // the same file listed twice keeps the timestamp of the later row
+  public void testSameFileTwoTimestampsLastWins() {
+    String remoteFilename = "2015-05-24-12-00-00-consensus";
+    String firstTimestamp = "24-May-2015 12:04";
+    String secondTimestamp = "24-May-2015 12:08";
+    String lineFormat = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[   ]\"></td><td>"
+        + "<a href=\"%s\">2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">%s  </td>"
+        + "<td align=\"right\">1.5M</td><td> </td></tr>\n";
+    String directoryListing = String.format(lineFormat + lineFormat, // two rows, one per line
+        remoteFilename, firstTimestamp, remoteFilename, secondTimestamp);
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertSame(1, remoteFiles.size()); // both rows collapse into a single map entry
+    assertEquals(REMOTE_DIRECTORY_CONSENSUSES + remoteFilename,
+        remoteFiles.firstKey());
+    assertEquals((Long) 1432469280000L, // the 12:08 value from the second row, not 12:04
+        remoteFiles.get(remoteFiles.firstKey()));
+  }
+
+  @Test() // a row linking to a sub directory is ignored
+  public void testSubDirectoryOnly() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/folder.gif\" alt=\"[DIR]\"></td><td>"
+        + "<a href=\"subdir/\">subdir/</a></td>" // href contains a slash, which the filename group rejects
+        + "<td align=\"right\">27-May-2015 14:07  </td>"
+        + "<td align=\"right\">  - </td><td> </td></tr>";
+    DescriptorCollectorImpl collector = new DescriptorCollectorImpl();
+    SortedMap<String, Long> remoteFiles = collector.parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles); // skipping a row is not an error
+    assertTrue(remoteFiles.isEmpty());
+  }
+
+  @Test() // the "Parent Directory" row is ignored
+  public void testParentDirectoryOnly() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/back.gif\" alt=\"[DIR]\"></td><td>"
+        + "<a href=\"/recent/relay-descriptors/\">Parent Directory</a>" // href contains slashes
+        + "</td><td> </td><td align=\"right\">  - </td>" // the row also carries no date cell
+        + "<td> </td></tr>";
+    DescriptorCollectorImpl collector = new DescriptorCollectorImpl();
+    SortedMap<String, Long> remoteFiles = collector.parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles);
+    assertTrue(remoteFiles.isEmpty());
+  }
+
+  @Test() // a date in another layout does not match the row pattern, so the row is skipped
+  public void testUnexpectedDateFormat() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[   ]\"></td><td>"
+        + "<a href=\"2015-05-24-12-00-00-consensus\">"
+        + "2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">2015-05-24 12:08  </td>" // yyyy-MM-dd instead of dd-MMM-yyyy
+        + "<td align=\"right\">1.5M</td><td> </td></tr>";
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNotNull(remoteFiles); // an empty map, not null: no date parsing was attempted
+    assertTrue(remoteFiles.isEmpty());
+  }
+
+  @Test() // a date that matches the row pattern but fails to parse makes the whole listing invalid
+  public void testInvalidDate() {
+    String directoryListing = "<tr><td valign=\"top\">"
+        + "<img src=\"/icons/unknown.gif\" alt=\"[   ]\"></td><td>"
+        + "<a href=\"2015-05-24-12-00-00-consensus\">"
+        + "2015-05-24-12-00-00-consensus</a></td>"
+        + "<td align=\"right\">34-May-2015 12:08  </td>" // day 34 has the right shape but is not a valid date
+        + "<td align=\"right\">1.5M</td><td> </td></tr>";
+    SortedMap<String, Long> remoteFiles =
+        new DescriptorCollectorImpl().parseDirectoryListing(
+        REMOTE_DIRECTORY_CONSENSUSES, directoryListing);
+    assertNull(remoteFiles); // null is the parser's signal for a date that failed to parse
+  }
+}
+



More information about the tor-commits mailing list