commit e3d381f4c12eb61cc1d6491f31f1ac250602b3d9 Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Thu May 21 17:13:17 2015 +0200
Add descriptor source to fetch descriptors from CollecTor.
Includes some really good suggestions from iwakeh.
Implements #16151. --- .../torproject/descriptor/DescriptorCollector.java | 34 +++ .../descriptor/DescriptorSourceFactory.java | 12 + .../descriptor/impl/DescriptorCollectorImpl.java | 240 ++++++++++++++++++++ .../torproject/descriptor/impl/ParseHelper.java | 2 +- .../impl/DescriptorCollectorImplTest.java | 120 ++++++++++ 5 files changed, 407 insertions(+), 1 deletion(-)
diff --git a/src/org/torproject/descriptor/DescriptorCollector.java b/src/org/torproject/descriptor/DescriptorCollector.java new file mode 100644 index 0000000..bd29fb0 --- /dev/null +++ b/src/org/torproject/descriptor/DescriptorCollector.java @@ -0,0 +1,34 @@ +/* Copyright 2015 The Tor Project + * See LICENSE for licensing information */ +package org.torproject.descriptor; + +import java.io.File; + +/** Fetch descriptors from the CollecTor service available at + * https://collector.torproject.org/ and store them to a local + * directory. */ +public interface DescriptorCollector { + + /** + * Fetch remote files from a CollecTor instance that do not yet exist + * locally and possibly delete local files that do not exist remotely + * anymore. + * + * @param collecTorBaseUrl CollecTor base URL without trailing slash, + * e.g., "https://collector.torproject.org". + * @param remoteDirectories Remote directories to collect descriptors + * from, e.g., "/recent/relay-descriptors/server-descriptors/". Only + * files in this directory will be collected, no files in its sub + * directories. + * @param minLastModified Minimum last-modified time in milliseconds of + * files to be collected. Set to 0 for collecting all files. + * @param localDirectory Directory where collected files will be + * written. + * @param deleteExtraneousLocalFiles Whether to delete all local files + * that do not exist remotely anymore. 
+ */ + public void collectRemoteFiles(String collecTorBaseUrl, + String[] remoteDirectories, long minLastModified, + File localDirectory, boolean deleteExtraneousLocalFiles); +} + diff --git a/src/org/torproject/descriptor/DescriptorSourceFactory.java b/src/org/torproject/descriptor/DescriptorSourceFactory.java index 9bfd81f..49fcdc6 100644 --- a/src/org/torproject/descriptor/DescriptorSourceFactory.java +++ b/src/org/torproject/descriptor/DescriptorSourceFactory.java @@ -12,11 +12,14 @@ public final class DescriptorSourceFactory { "org.torproject.descriptor.impl.DescriptorParserImpl"; public final static String READER_DEFAULT = "org.torproject.descriptor.impl.DescriptorReaderImpl"; + public final static String COLLECTOR_DEFAULT = + "org.torproject.descriptor.impl.DescriptorCollectorImpl";
/* property names */ public final static String PARSER_PROPERTY = "onionoo.parser"; public final static String READER_PROPERTY = "onionoo.property"; public final static String LOADER_PROPERTY = "onionoo.downloader"; + public final static String COLLECTOR_PROPERTY = "onionoo.collector";
/** * Create a descriptor parser. @@ -39,6 +42,13 @@ public final class DescriptorSourceFactory { return (DescriptorDownloader) retrieve(LOADER_PROPERTY); }
+ /** + * Create a descriptor collector. + */ + public final static DescriptorCollector createDescriptorCollector() { + return (DescriptorCollector) retrieve(COLLECTOR_PROPERTY); + } + private final static <T> Object retrieve(String type) { Object object; String clazzName = null; @@ -49,6 +59,8 @@ public final class DescriptorSourceFactory { clazzName = System.getProperty(type, LOADER_DEFAULT); } else if (READER_PROPERTY.equals(type)) { clazzName = System.getProperty(type, READER_DEFAULT); + } else if (COLLECTOR_PROPERTY.equals(type)) { + clazzName = System.getProperty(type, COLLECTOR_DEFAULT); } object = ClassLoader.getSystemClassLoader().loadClass(clazzName). newInstance(); diff --git a/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java b/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java new file mode 100644 index 0000000..ed88906 --- /dev/null +++ b/src/org/torproject/descriptor/impl/DescriptorCollectorImpl.java @@ -0,0 +1,240 @@ +/* Copyright 2015 The Tor Project + * See LICENSE for licensing information */ +package org.torproject.descriptor.impl; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.HttpURLConnection; +import java.net.URL; +import java.text.DateFormat; +import java.text.ParseException; +import java.util.Arrays; +import java.util.Map; +import java.util.Scanner; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.Stack; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.zip.GZIPInputStream; + +import org.torproject.descriptor.DescriptorCollector; + +public class DescriptorCollectorImpl implements DescriptorCollector { + + @Override + public void collectRemoteFiles(String collecTorBaseUrl, + String[] remoteDirectories, long 
minLastModified, + File localDirectory, boolean deleteExtraneousLocalFiles) { + collecTorBaseUrl = collecTorBaseUrl.endsWith("/") + ? collecTorBaseUrl.substring(0, collecTorBaseUrl.length() - 1) + : collecTorBaseUrl; + if (minLastModified < 0) { + throw new IllegalArgumentException("A negative minimum " + + "last-modified time is not permitted."); + } + if (localDirectory.exists() && !localDirectory.isDirectory()) { + throw new IllegalArgumentException("Local directory already exists " + + "and is not a directory."); + } + SortedMap<String, Long> localFiles = + this.statLocalDirectory(localDirectory); + SortedMap<String, String> fetchedDirectoryListings = + this.fetchRemoteDirectories(collecTorBaseUrl, remoteDirectories); + SortedSet<String> parsedDirectories = new TreeSet<String>(); + SortedMap<String, Long> remoteFiles = new TreeMap<String, Long>(); + for (Map.Entry<String, String> e : + fetchedDirectoryListings.entrySet()) { + String remoteDirectory = e.getKey(); + String directoryListing = e.getValue(); + SortedMap<String, Long> parsedRemoteFiles = + this.parseDirectoryListing(remoteDirectory, directoryListing); + if (parsedRemoteFiles == null) { + continue; + } + parsedDirectories.add(remoteDirectory); + remoteFiles.putAll(parsedRemoteFiles); + } + this.fetchRemoteFiles(collecTorBaseUrl, remoteFiles, minLastModified, + localDirectory, localFiles); + if (deleteExtraneousLocalFiles) { + this.deleteExtraneousLocalFiles(parsedDirectories, remoteFiles, + localDirectory, localFiles); + } + } + + SortedMap<String, Long> statLocalDirectory( + File localDirectory) { + SortedMap<String, Long> localFiles = new TreeMap<String, Long>(); + if (!localDirectory.exists()) { + return localFiles; + } + Stack<File> files = new Stack<File>(); + files.add(localDirectory); + while (!files.isEmpty()) { + File file = files.pop(); + if (file.isDirectory()) { + files.addAll(Arrays.asList(file.listFiles())); + } else { + String localPath = file.getPath().substring( + 
localDirectory.getPath().length()); + localFiles.put(localPath, file.lastModified()); + } + } + return localFiles; + } + + SortedMap<String, String> fetchRemoteDirectories( + String collecTorBaseUrl, String[] remoteDirectories) { + SortedMap<String, String> fetchedDirectoryListings = + new TreeMap<String, String>(); + for (String remoteDirectory : remoteDirectories) { + String remoteDirectoryWithSlashAtBeginAndEnd = + (remoteDirectory.startsWith("/") ? "" : "/") + remoteDirectory + + (remoteDirectory.endsWith("/") ? "" : "/"); + String directoryUrl = collecTorBaseUrl + + remoteDirectoryWithSlashAtBeginAndEnd; + String directoryListing = this.fetchRemoteDirectory(directoryUrl); + if (directoryListing.length() > 0) { + fetchedDirectoryListings.put( + remoteDirectoryWithSlashAtBeginAndEnd, directoryListing); + } + } + return fetchedDirectoryListings; + } + + String fetchRemoteDirectory(String url) { + StringBuilder sb = new StringBuilder(); + try { + URL u = new URL(url); + HttpURLConnection huc = (HttpURLConnection) u.openConnection(); + huc.setRequestMethod("GET"); + huc.connect(); + int responseCode = huc.getResponseCode(); + if (responseCode == 200) { + BufferedReader br = new BufferedReader(new InputStreamReader( + huc.getInputStream())); + String line; + while ((line = br.readLine()) != null) { + sb.append(line + "\n"); + } + br.close(); + } + } catch (IOException e) { + e.printStackTrace(); + return ""; + } + return sb.toString(); + } + + final Pattern DIRECTORY_LISTING_LINE_PATTERN = + Pattern.compile(".* href="([^"/]+)"" /* filename */ + + ".*>(\d{2}-\w{3}-\d{4} \d{2}:\d{2})\s*<.*"); /* dateTime */ + + SortedMap<String, Long> parseDirectoryListing( + String remoteDirectory, String directoryListing) { + SortedMap<String, Long> remoteFiles = new TreeMap<String, Long>(); + DateFormat dateTimeFormat = ParseHelper.getDateFormat( + "dd-MMM-yyyy HH:mm"); + try { + Scanner s = new Scanner(directoryListing); + s.useDelimiter("\n"); + while (s.hasNext()) { + String 
line = s.next(); + Matcher matcher = DIRECTORY_LISTING_LINE_PATTERN.matcher(line); + if (matcher.matches()) { + String filename = matcher.group(1); + long lastModifiedMillis = dateTimeFormat.parse( + matcher.group(2)).getTime(); + remoteFiles.put(remoteDirectory + filename, lastModifiedMillis); + } + } + s.close(); + } catch (ParseException e) { + e.printStackTrace(); + return null; + } + return remoteFiles; + } + + void fetchRemoteFiles(String collecTorBaseUrl, + SortedMap<String, Long> remoteFiles, long minLastModified, + File localDirectory, SortedMap<String, Long> localFiles) { + for (Map.Entry<String, Long> e : remoteFiles.entrySet()) { + String filename = e.getKey(); + long lastModifiedMillis = e.getValue(); + if (lastModifiedMillis < minLastModified || + (localFiles.containsKey(filename) && + localFiles.get(filename) >= lastModifiedMillis)) { + continue; + } + String url = collecTorBaseUrl + filename; + File destinationFile = new File(localDirectory.getPath() + + filename); + this.fetchRemoteFile(url, destinationFile, lastModifiedMillis); + } + } + + void fetchRemoteFile(String url, File destinationFile, + long lastModifiedMillis) { + try { + File destinationDirectory = destinationFile.getParentFile(); + destinationDirectory.mkdirs(); + File tempDestinationFile = new File(destinationDirectory, "." 
+ + destinationFile.getName()); + FileOutputStream fos = new FileOutputStream(tempDestinationFile); + URL u = new URL(url); + HttpURLConnection huc = (HttpURLConnection) u.openConnection(); + huc.setRequestMethod("GET"); + if (!url.endsWith(".xz")) { + huc.addRequestProperty("Accept-Encoding", "gzip"); + } + huc.connect(); + int responseCode = huc.getResponseCode(); + if (responseCode == 200) { + InputStream is; + if (huc.getContentEncoding() != null && + huc.getContentEncoding().equalsIgnoreCase("gzip")) { + is = new GZIPInputStream(huc.getInputStream()); + } else { + is = huc.getInputStream(); + } + BufferedInputStream bis = new BufferedInputStream(is); + int len; + byte[] data = new byte[1024]; + while ((len = bis.read(data, 0, 1024)) >= 0) { + fos.write(data, 0, len); + } + bis.close(); + fos.close(); + tempDestinationFile.renameTo(destinationFile); + destinationFile.setLastModified(lastModifiedMillis); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + void deleteExtraneousLocalFiles( + SortedSet<String> parsedDirectories, + SortedMap<String, Long> remoteFiles, File localDirectory, + SortedMap<String, Long> localFiles) { + for (String localPath : localFiles.keySet()) { + for (String remoteDirectory : parsedDirectories) { + if (localPath.startsWith(remoteDirectory)) { + if (!remoteFiles.containsKey(localPath)) { + new File(localDirectory.getPath() + localPath).delete(); + } + } + } + } + } +} + diff --git a/src/org/torproject/descriptor/impl/ParseHelper.java b/src/org/torproject/descriptor/impl/ParseHelper.java index 226bf80..09534c7 100644 --- a/src/org/torproject/descriptor/impl/ParseHelper.java +++ b/src/org/torproject/descriptor/impl/ParseHelper.java @@ -153,7 +153,7 @@ public class ParseHelper { super.set(value); } }; - private static DateFormat getDateFormat(String format) { + static DateFormat getDateFormat(String format) { Map<String, DateFormat> threadDateFormats = dateFormats.get(); if (!threadDateFormats.containsKey(format)) { 
DateFormat dateFormat = new SimpleDateFormat(format); diff --git a/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java b/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java new file mode 100644 index 0000000..2715f12 --- /dev/null +++ b/test/org/torproject/descriptor/impl/DescriptorCollectorImplTest.java @@ -0,0 +1,120 @@ +/* Copyright 2015 The Tor Project + * See LICENSE for licensing information */ +package org.torproject.descriptor.impl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; + +import java.util.SortedMap; + +import org.junit.Test; + +public class DescriptorCollectorImplTest { + + private static final String REMOTE_DIRECTORY_CONSENSUSES = + "/recent/relay-descriptors/consensuses/"; + + @Test() + public void testOneFile() { + String remoteFilename = "2015-05-24-12-00-00-consensus"; + String directoryListing = "<tr><td valign="top">" + + "<img src="/icons/unknown.gif" alt="[ ]"></td><td>" + + "<a href="" + remoteFilename + "">" + + "2015-05-24-12-00-00-consensus</a></td>" + + "<td align="right">24-May-2015 12:08 </td>" + + "<td align="right">1.5M</td><td> </td></tr>"; + SortedMap<String, Long> remoteFiles = + new DescriptorCollectorImpl().parseDirectoryListing( + REMOTE_DIRECTORY_CONSENSUSES, directoryListing); + assertNotNull(remoteFiles); + assertSame(1, remoteFiles.size()); + assertEquals(REMOTE_DIRECTORY_CONSENSUSES + remoteFilename, + remoteFiles.firstKey()); + assertEquals((Long) 1432469280000L, + remoteFiles.get(remoteFiles.firstKey())); + } + + @Test() + public void testSameFileTwoTimestampsLastWins() { + String remoteFilename = "2015-05-24-12-00-00-consensus"; + String firstTimestamp = "24-May-2015 12:04"; + String secondTimestamp = "24-May-2015 12:08"; + String lineFormat = "<tr><td valign="top">" + + "<img src="/icons/unknown.gif" 
alt="[ ]"></td><td>" + + "<a href="%s">2015-05-24-12-00-00-consensus</a></td>" + + "<td align="right">%s </td>" + + "<td align="right">1.5M</td><td> </td></tr>\n"; + String directoryListing = String.format(lineFormat + lineFormat, + remoteFilename, firstTimestamp, remoteFilename, secondTimestamp); + SortedMap<String, Long> remoteFiles = + new DescriptorCollectorImpl().parseDirectoryListing( + REMOTE_DIRECTORY_CONSENSUSES, directoryListing); + assertNotNull(remoteFiles); + assertSame(1, remoteFiles.size()); + assertEquals(REMOTE_DIRECTORY_CONSENSUSES + remoteFilename, + remoteFiles.firstKey()); + assertEquals((Long) 1432469280000L, + remoteFiles.get(remoteFiles.firstKey())); + } + + @Test() + public void testSubDirectoryOnly() { + String directoryListing = "<tr><td valign="top">" + + "<img src="/icons/folder.gif" alt="[DIR]"></td><td>" + + "<a href="subdir/">subdir/</a></td>" + + "<td align="right">27-May-2015 14:07 </td>" + + "<td align="right"> - </td><td> </td></tr>"; + DescriptorCollectorImpl collector = new DescriptorCollectorImpl(); + SortedMap<String, Long> remoteFiles = collector.parseDirectoryListing( + REMOTE_DIRECTORY_CONSENSUSES, directoryListing); + assertNotNull(remoteFiles); + assertTrue(remoteFiles.isEmpty()); + } + + @Test() + public void testParentDirectoryOnly() { + String directoryListing = "<tr><td valign="top">" + + "<img src="/icons/back.gif" alt="[DIR]"></td><td>" + + "<a href="/recent/relay-descriptors/">Parent Directory</a>" + + "</td><td> </td><td align="right"> - </td>" + + "<td> </td></tr>"; + DescriptorCollectorImpl collector = new DescriptorCollectorImpl(); + SortedMap<String, Long> remoteFiles = collector.parseDirectoryListing( + REMOTE_DIRECTORY_CONSENSUSES, directoryListing); + assertNotNull(remoteFiles); + assertTrue(remoteFiles.isEmpty()); + } + + @Test() + public void testUnexpectedDateFormat() { + String directoryListing = "<tr><td valign="top">" + + "<img src="/icons/unknown.gif" alt="[ ]"></td><td>" + + "<a 
href="2015-05-24-12-00-00-consensus">" + + "2015-05-24-12-00-00-consensus</a></td>" + + "<td align="right">2015-05-24 12:08 </td>" + + "<td align="right">1.5M</td><td> </td></tr>"; + SortedMap<String, Long> remoteFiles = + new DescriptorCollectorImpl().parseDirectoryListing( + REMOTE_DIRECTORY_CONSENSUSES, directoryListing); + assertNotNull(remoteFiles); + assertTrue(remoteFiles.isEmpty()); + } + + @Test() + public void testInvalidDate() { + String directoryListing = "<tr><td valign="top">" + + "<img src="/icons/unknown.gif" alt="[ ]"></td><td>" + + "<a href="2015-05-24-12-00-00-consensus">" + + "2015-05-24-12-00-00-consensus</a></td>" + + "<td align="right">34-May-2015 12:08 </td>" + + "<td align="right">1.5M</td><td> </td></tr>"; + SortedMap<String, Long> remoteFiles = + new DescriptorCollectorImpl().parseDirectoryListing( + REMOTE_DIRECTORY_CONSENSUSES, directoryListing); + assertNull(remoteFiles); + } +} +
tor-commits@lists.torproject.org