commit 500b7c5ad3d94a0f0f8b8c7fdb110813895c25f0
Author: Karsten Loesing <karsten.loesing@gmx.net>
Date:   Wed Oct 30 21:56:41 2019 +0100
Extend index.json by additional file meta data.
Implements #31204. --- CHANGELOG.md | 13 + build.xml | 2 +- src/build | 2 +- .../org/torproject/metrics/collector/conf/Key.java | 5 +- .../metrics/collector/indexer/CreateIndexJson.java | 678 +++++++++++++++++---- .../metrics/collector/indexer/DirectoryNode.java | 36 ++ .../metrics/collector/indexer/FileNode.java | 125 ++++ .../metrics/collector/indexer/IndexNode.java | 30 + .../metrics/collector/indexer/IndexerTask.java | 225 +++++++ src/main/resources/collector.properties | 14 +- src/main/resources/create-tarballs.sh | 2 +- .../metrics/collector/conf/ConfigurationTest.java | 2 +- .../collector/indexer/CreateIndexJsonTest.java | 522 ++++++++++++++++ .../metrics/collector/indexer/IndexerTaskTest.java | 226 +++++++ 14 files changed, 1756 insertions(+), 126 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md index bb328ad..38754c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changes in version 1.??.? - 2019-??-??
+ * Medium changes + - Extend index.json by including descriptor types, first and last + publication timestamp, and SHA-256 file digest. Requires making + configuration changes in collector.properties: + 1) IndexedPath is a new directory with subdirectories for + archived and recent descriptors, + 2) ArchivePath and IndexPath are hard-wired to be subdirectories + of IndexedPath, + 3) RecentPath must be set to be a subdirectory of IndexedPath, + 4) ContribPath has disappeared, and + 5) HtdocsPath is a new directory with files served by the web + server. +
# Changes in version 1.12.0 - 2019-10-18
diff --git a/build.xml b/build.xml index 8276e63..019461d 100644 --- a/build.xml +++ b/build.xml @@ -12,7 +12,7 @@ <property name="release.version" value="1.12.0-dev" /> <property name="project-main-class" value="org.torproject.metrics.collector.Main" /> <property name="name" value="collector"/> - <property name="metricslibversion" value="2.8.0" /> + <property name="metricslibversion" value="2.9.0" /> <property name="jarincludes" value="collector.properties logback.xml" />
<patternset id="runtime" > diff --git a/src/build b/src/build index d82fff9..eb16cb3 160000 --- a/src/build +++ b/src/build @@ -1 +1 @@ -Subproject commit d82fff984634fe006ac7b0b102e7f48a52ca20d9 +Subproject commit eb16cb359db41722e6089bafb1e26808df4338df diff --git a/src/main/java/org/torproject/metrics/collector/conf/Key.java b/src/main/java/org/torproject/metrics/collector/conf/Key.java index 390feed..d59438b 100644 --- a/src/main/java/org/torproject/metrics/collector/conf/Key.java +++ b/src/main/java/org/torproject/metrics/collector/conf/Key.java @@ -18,13 +18,12 @@ public enum Key { RunOnce(Boolean.class), ExitlistUrl(URL.class), InstanceBaseUrl(String.class), - ArchivePath(Path.class), - ContribPath(Path.class), + IndexedPath(Path.class), RecentPath(Path.class), OutputPath(Path.class), - IndexPath(Path.class), StatsPath(Path.class), SyncPath(Path.class), + HtdocsPath(Path.class), RelaySources(SourceType[].class), BridgeSources(SourceType[].class), BridgePoolAssignmentsSources(SourceType[].class), diff --git a/src/main/java/org/torproject/metrics/collector/indexer/CreateIndexJson.java b/src/main/java/org/torproject/metrics/collector/indexer/CreateIndexJson.java index a40798e..15aa31d 100644 --- a/src/main/java/org/torproject/metrics/collector/indexer/CreateIndexJson.java +++ b/src/main/java/org/torproject/metrics/collector/indexer/CreateIndexJson.java @@ -1,73 +1,190 @@ -/* Copyright 2015--2018 The Tor Project +/* Copyright 2015--2019 The Tor Project * See LICENSE for licensing information */
package org.torproject.metrics.collector.indexer;
-import org.torproject.descriptor.index.DirectoryNode; -import org.torproject.descriptor.index.FileNode; -import org.torproject.descriptor.index.IndexNode; -import org.torproject.descriptor.internal.FileType; import org.torproject.metrics.collector.conf.Configuration; +import org.torproject.metrics.collector.conf.ConfigurationException; import org.torproject.metrics.collector.conf.Key; import org.torproject.metrics.collector.cron.CollecTorMain;
+import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.PropertyAccessor; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.PropertyNamingStrategy; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; +import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory;
-import java.io.BufferedWriter; -import java.io.File; -import java.io.FileOutputStream; +import java.io.IOException; import java.io.InputStream; -import java.io.OutputStreamWriter; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Locale; +import java.io.OutputStream; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.StandardCopyOption; +import java.nio.file.attribute.BasicFileAttributes; +import java.time.Duration; +import java.time.Instant; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.temporal.TemporalAmount; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; import java.util.Properties; +import java.util.Set; +import java.util.SortedMap; import java.util.SortedSet; -import java.util.TimeZone; +import java.util.TreeMap; import java.util.TreeSet; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors;
-/* Create a fresh index.json containing all directories and files in the - * archive/ and recent/ directories. +/** + * Create an index file called {@code index.json} containing metadata of all + * files in the {@code indexed/} directory and update the {@code htdocs/} + * directory to contain all files to be served via the web server. * - * Note that if this ever takes longer than a few seconds, we'll have to - * cache index parts of directories or files that haven't changed. - * Example: if we parse include cryptographic hashes or @type information, - * we'll likely have to do that. */ + * <p>File metadata includes:</p> + * <ul> + * <li>Path for downloading this file from the web server.</li> + * <li>Size of the file in bytes.</li> + * <li>Timestamp when the file was last modified.</li> + * <li>Descriptor types as found in {@code @type} annotations of contained + * descriptors.</li> + * <li>Earliest and latest publication timestamp of contained + * descriptors.</li> + * <li>SHA-256 digest of the file.</li> + * </ul> + * + * <p>This class maintains its own working directory {@code htdocs/} with + * subdirectories like {@code htdocs/archive/} or {@code htdocs/recent/} and + * another subdirectory {@code htdocs/index/}. The first two subdirectories + * contain (hard) links created and deleted by this class, the third + * subdirectory contains the {@code index.json} file in uncompressed and + * compressed forms.</p> + * + * <p>The main reason for having the {@code htdocs/} directory is that indexing + * a large descriptor file can be time consuming. New or updated files in + * {@code indexed/} first need to be indexed before their metadata can be + * included in {@code index.json}. 
Another reason is that files removed from + * {@code indexed/} shall still be available for download for a limited period + * of time after disappearing from {@code index.json}.</p> + * + * <p>The reason for creating (hard) links in {@code htdocs/}, rather than + * copies, is that links do not consume additional disk space. All directories + * must be located on the same file system. Storing symbolic links in + * {@code htdocs/} would not have worked with replaced or deleted files in the + * original directories. Symbolic links in original directories are allowed as + * long as they target to the same file system.</p> + * + * <p>This class does not write, modify, or delete any files in the + * {@code indexed/} directory. At the same time it does not expect any other + * classes to write, modify, or delete contents in the {@code htdocs/} + * directory.</p> + */ public class CreateIndexJson extends CollecTorMain {
+ /** + * Class logger. + */ private static final Logger logger = LoggerFactory.getLogger(CreateIndexJson.class);
- private static File indexJsonFile; + /** + * Delay between finding out that a file has been deleted and deleting its + * link. + */ + private static final TemporalAmount deletionDelay = Duration.ofHours(2L);
- private static String basePath; + /** + * Index tarballs with no more than this many threads at a time. + */ + private static final int tarballIndexerThreads = 3;
- private static File[] indexedDirectories; + /** + * Index flat files with no more than this many threads at a time. + */ + private static final int flatFileIndexerThreads = 3;
- private static final String dateTimePattern = "yyyy-MM-dd HH:mm"; + /** + * Parser and formatter for all timestamps found in {@code index.json}. + */ + private static DateTimeFormatter dateTimeFormatter = DateTimeFormatter + .ofPattern("uuuu-MM-dd HH:mm").withZone(ZoneOffset.UTC);
- private static final Locale dateTimeLocale = Locale.US; + /** + * Object mapper for parsing and formatting {@code index.json} files. + */ + private static ObjectMapper objectMapper = new ObjectMapper() + .setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE) + .setSerializationInclusion(JsonInclude.Include.NON_EMPTY) + .setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.NONE) + .setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
- private static final TimeZone dateTimezone = TimeZone.getTimeZone("UTC"); + /** + * Path to the {@code indexed/} directory. + */ + private Path indexedPath;
- private static String buildRevision = null; + /** + * Path to the {@code htdocs/} directory. + */ + private Path htdocsPath;
- /** Creates indexes of directories containing archived and recent - * descriptors and write index files to disk. */ - public CreateIndexJson(Configuration conf) { - super(conf); - Properties buildProperties = new Properties(); - try (InputStream is = getClass().getClassLoader() - .getResourceAsStream("collector.buildrevision.properties")) { - buildProperties.load(is); - buildRevision = buildProperties.getProperty("collector.build.revision", - null); - } catch (Exception ex) { - // This doesn't hamper the index creation: only log a warning. - logger.warn("No build revision available.", ex); - buildRevision = null; - } + /** + * Path to the uncompressed {@code index.json} file. + */ + private Path indexJsonPath; + + /** + * Base URL of all resources included in {@code index.json}. + */ + private String basePathString; + + /** + * Git revision of this software to be included in {@code index.json} or + * omitted if unknown. + */ + private String buildRevisionString; + + /** + * Index containing metadata of files in {@code indexed/}, including new or + * updated files that still need to be indexed and deleted files that are + * still linked in {@code htdocs/}. + * + * <p>This map is initialized by reading the last known {@code index.json} + * file and remains available in memory between executions until shutdown.</p> + */ + private SortedMap<Path, FileNode> index; + + /** + * Executor for indexing tarballs. + */ + private ExecutorService tarballsExecutor + = Executors.newFixedThreadPool(tarballIndexerThreads); + + /** + * Executor for indexing flat files (non-tarballs). + */ + private ExecutorService flatFilesExecutor + = Executors.newFixedThreadPool(flatFileIndexerThreads); + + /** + * Initialize this class with the given {@code configuration}. + * + * @param configuration Configuration values. + */ + public CreateIndexJson(Configuration configuration) { + super(configuration); }
@Override @@ -80,96 +197,433 @@ public class CreateIndexJson extends CollecTorMain { return "IndexJson"; }
+ + /** + * Run the indexer by (1) adding new files from {@code indexed/} to the index, + * (2) adding old files from {@code htdocs/} for which only links exist to the + * index, (3) scheduling new tasks and updating links in {@code htdocs/} to + * reflect what's contained in the in-memory index, and (4) writing new + * uncompressed and compressed {@code index.json} files to disk. + */ @Override - protected void startProcessing() { + public void startProcessing() { + this.startProcessing(Instant.now()); + } + + /** + * Helper method to {@link #startProcessing()} that accepts the current + * execution time and which is used by tests. + * + * @param now Current execution time. + */ + protected void startProcessing(Instant now) { + try { + this.basePathString = this.config.getProperty(Key.InstanceBaseUrl.name()); + this.indexedPath = config.getPath(Key.IndexedPath); + this.htdocsPath = config.getPath(Key.HtdocsPath); + } catch (ConfigurationException e) { + logger.error("Unable to read one or more configuration values. 
Not " + + "indexing in this execution.", e); + } + this.buildRevisionString = this.obtainBuildRevision(); + this.indexJsonPath = this.htdocsPath + .resolve(Paths.get("index", "index.json")); try { - indexJsonFile = new File(config.getPath(Key.IndexPath).toFile(), - "index.json"); - basePath = config.getProperty(Key.InstanceBaseUrl.name()); - indexedDirectories = new File[] { - config.getPath(Key.ArchivePath).toFile(), - config.getPath(Key.ContribPath).toFile(), - config.getPath(Key.RecentPath).toFile() }; - writeIndex(indexDirectories()); - } catch (Exception e) { - logger.error("Cannot run index creation: {}", e.getMessage(), e); - throw new RuntimeException(e); + this.prepareHtdocsDirectory(); + if (null == this.index) { + logger.info("Reading index.json file from last execution."); + this.index = this.readIndex(); + } + logger.info("Going through indexed/ and adding new files to the index."); + this.addNewFilesToIndex(this.indexedPath); + logger.info("Going through htdocs/ and adding links to deleted files to " + + "the index."); + this.addOldLinksToIndex(); + logger.info("Going through the index, scheduling tasks, and updating " + + "links."); + this.scheduleTasksAndUpdateLinks(now); + logger.info("Writing uncompressed and compressed index.json files to " + + "disk."); + this.writeIndex(this.index, now); + logger.info("Pausing until next index update run."); + } catch (IOException e) { + logger.error("I/O error while updating index.json files. Trying again in " + + "the next execution.", e); } }
- private static DateFormat dateTimeFormat; - - static { - dateTimeFormat = new SimpleDateFormat(dateTimePattern, - dateTimeLocale); - dateTimeFormat.setLenient(false); - dateTimeFormat.setTimeZone(dateTimezone); + /** + * Prepare the {@code htdocs/} directory by checking whether all required + * subdirectories exist and by creating them if not. + * + * @throws IOException Thrown if one or more directories could not be created. + */ + private void prepareHtdocsDirectory() throws IOException { + for (Path requiredPath : new Path[] { + this.htdocsPath, + this.indexJsonPath.getParent() }) { + if (!Files.exists(requiredPath)) { + Files.createDirectories(requiredPath); + } + } }
- private IndexNode indexDirectories() { - SortedSet<DirectoryNode> directoryNodes = new TreeSet<>(); - logger.trace("indexing: {} {}", indexedDirectories[0], - indexedDirectories[1]); - for (File directory : indexedDirectories) { - if (directory.exists() && directory.isDirectory()) { - DirectoryNode dn = indexDirectory(directory); - if (null != dn) { - directoryNodes.add(dn); + /** + * Read the {@code index.json} file written by the previous execution and + * populate our index with its contents, or leave the index empty if this is + * the first execution and that file does not yet exist. + * + * @return Index read from disk, or empty map if {@code index.json} does not + * exist. + */ + private SortedMap<Path, FileNode> readIndex() throws IOException { + SortedMap<Path, FileNode> index = new TreeMap<>(); + if (Files.exists(this.indexJsonPath)) { + IndexNode indexNode = objectMapper.readValue( + Files.newInputStream(this.indexJsonPath), IndexNode.class); + SortedMap<Path, DirectoryNode> directoryNodes = new TreeMap<>(); + directoryNodes.put(Paths.get(""), indexNode); + while (!directoryNodes.isEmpty()) { + Path directoryPath = directoryNodes.firstKey(); + DirectoryNode directoryNode = directoryNodes.remove(directoryPath); + if (null != directoryNode.files) { + for (FileNode fileNode : directoryNode.files) { + Path filePath = this.indexedPath.resolve(directoryPath) + .resolve(Paths.get(fileNode.path)); + index.put(filePath, fileNode); + } } + if (null != directoryNode.directories) { + boolean isRootDirectory = directoryNode == indexNode; + for (DirectoryNode subdirectoryNode : directoryNode.directories) { + Path subdirectoryPath = isRootDirectory + ? 
Paths.get(subdirectoryNode.path) + : directoryPath.resolve(Paths.get(subdirectoryNode.path)); + directoryNodes.put(subdirectoryPath, subdirectoryNode); + } + } + } + } + return index; + } + + /** + * Obtain and return the build revision string that was generated during the + * build process with {@code git rev-parse --short HEAD} and written to + * {@code collector.buildrevision.properties}, or return {@code null} if the + * build revision string cannot be obtained. + * + * @return Build revision string. + */ + protected String obtainBuildRevision() { + String buildRevision = null; + Properties buildProperties = new Properties(); + String propertiesFile = "collector.buildrevision.properties"; + try (InputStream is = getClass().getClassLoader() + .getResourceAsStream(propertiesFile)) { + if (null == is) { + logger.warn("File {}, which is supposed to contain the build revision " + + "string, does not exist in our class path. Writing index.json " + + "without the \"build_revision\" field.", propertiesFile); + return null; + } + buildProperties.load(is); + buildRevision = buildProperties.getProperty( + "collector.build.revision", null); + } catch (IOException e) { + logger.warn("I/O error while trying to obtain build revision string. " + + "Writing index.json without the \"build_revision\" field."); + } - return new IndexNode(dateTimeFormat.format( - System.currentTimeMillis()), buildRevision, basePath, null, - directoryNodes); - } - - private DirectoryNode indexDirectory(File directory) { - SortedSet<FileNode> fileNodes = new TreeSet<>(); - SortedSet<DirectoryNode> directoryNodes = new TreeSet<>(); - logger.trace("indexing: {}", directory); - File[] fileList = directory.listFiles(); - if (null == fileList) { - logger.warn("Indexing dubious directory: {}", directory); - return null; + return buildRevision; + } + + /** + * Walk the given file tree and add all previously unknown files to the + * in-memory index (except for files starting with "." or ending with ".tmp"). 
+ * + * @param path File tree to walk. + */ + private void addNewFilesToIndex(Path path) throws IOException { + if (!Files.exists(path)) { + return; } - for (File fileOrDirectory : fileList) { - if (fileOrDirectory.getName().startsWith(".") - || fileOrDirectory.getName().endsWith(".tmp")) { - continue; + Files.walkFileTree(path, new SimpleFileVisitor<Path>() { + @Override + public FileVisitResult visitFile(Path filePath, + BasicFileAttributes basicFileAttributes) { + if (!filePath.toString().startsWith(".") + && !filePath.toString().endsWith(".tmp")) { + index.putIfAbsent(filePath, new FileNode()); + } + return FileVisitResult.CONTINUE; + } + }); + } + + /** + * Walk the file tree of the {@code htdocs/} directory and add all previously + * unknown links to the in-memory index to ensure their deletion when they're + * known to be deleted from their original directories. + */ + private void addOldLinksToIndex() throws IOException { + Path htdocsIndexPath = this.indexJsonPath.getParent(); + Files.walkFileTree(this.htdocsPath, new SimpleFileVisitor<Path>() { + @Override + public FileVisitResult visitFile(Path linkPath, + BasicFileAttributes basicFileAttributes) { + if (!linkPath.startsWith(htdocsIndexPath)) { + Path filePath = indexedPath.resolve(htdocsPath.relativize(linkPath)); + index.putIfAbsent(filePath, new FileNode()); + } + return FileVisitResult.CONTINUE; } - if (fileOrDirectory.isFile()) { - fileNodes.add(indexFile(fileOrDirectory)); + }); + } + + /** + * Go through the index, schedule tasks to index files, and update links. + * + * @throws IOException Thrown if an I/O exception occurs while creating or + * deleting links. 
+ */ + private void scheduleTasksAndUpdateLinks(Instant now) throws IOException { + int queuedIndexerTasks = 0; + Map<Path, FileNode> indexingResults = new HashMap<>(); + SortedSet<Path> filesToIndex = new TreeSet<>(); + Map<Path, Path> linksToCreate = new HashMap<>(); + Set<FileNode> linksToMarkForDeletion = new HashSet<>(); + Map<Path, Path> linksToDelete = new HashMap<>(); + for (Map.Entry<Path, FileNode> e : this.index.entrySet()) { + Path filePath = e.getKey(); + Path linkPath = this.htdocsPath + .resolve(this.indexedPath.relativize(filePath)); + FileNode fileNode = e.getValue(); + if (Files.exists(filePath)) { + if (null != fileNode.indexerResult) { + if (!fileNode.indexerResult.isDone()) { + /* This file is currently being indexed, so we should just skip it + * and wait until the indexer is done. */ + queuedIndexerTasks++; + continue; + } + try { + /* Indexing is done, obtain index results. */ + fileNode = fileNode.indexerResult.get(); + indexingResults.put(filePath, fileNode); + } catch (InterruptedException | ExecutionException ex) { + /* Clear index result, so that we can give this file another try + * next time. */ + fileNode.indexerResult = null; + } + } + String originalLastModified = dateTimeFormatter + .format(Files.getLastModifiedTime(filePath).toInstant()); + if (!originalLastModified.equals(fileNode.lastModified)) { + /* We either don't have any index results for this file, or we only + * have index results for an older version of this file. */ + filesToIndex.add(filePath); + } else if (!Files.exists(linkPath)) { + /* We do have index results, but we don't have a link yet, so we're + * going to create a link. */ + linksToCreate.put(linkPath, filePath); + } else { + String linkLastModified = dateTimeFormatter + .format(Files.getLastModifiedTime(linkPath).toInstant()); + if (!linkLastModified.equals(fileNode.lastModified)) { + /* We do have index results plus a link to an older version of this + * file, so we'll have to update the link. 
*/ + linksToCreate.put(linkPath, filePath); + } + } } else { - DirectoryNode dn = indexDirectory(fileOrDirectory); - if (null != dn) { - directoryNodes.add(dn); + if (null == fileNode.markedForDeletion) { + /* We're noticing just now that the file doesn't exist anymore, so + * we're going to mark it for deletion but not deleting the link right + * away. */ + linksToMarkForDeletion.add(fileNode); + } else if (fileNode.markedForDeletion + .isBefore(now.minus(deletionDelay))) { + /* The file doesn't exist anymore, and we found out long enough ago, + * so we can now go ahead and delete the link. */ + linksToDelete.put(linkPath, filePath); + } + } + } + if (queuedIndexerTasks > 0) { + logger.info("Counting {} file(s) being currently indexed or in the " + + "queue.", queuedIndexerTasks); + } + this.updateIndex(indexingResults); + this.scheduleTasks(filesToIndex); + this.createLinks(linksToCreate); + this.markForDeletion(linksToMarkForDeletion, now); + this.deleteLinks(linksToDelete); + } + + /** + * Update index with index results. + */ + private void updateIndex(Map<Path, FileNode> indexResults) { + if (!indexResults.isEmpty()) { + logger.info("Updating {} index entries with index results.", + indexResults.size()); + this.index.putAll(indexResults); + } + } + + /** + * Schedule indexing the given set of descriptor files, using different queues + * for tarballs and flat files. + * + * @param filesToIndex Paths to descriptor files to index. 
+ */ + private void scheduleTasks(SortedSet<Path> filesToIndex) { + if (!filesToIndex.isEmpty()) { + logger.info("Scheduling {} indexer task(s).", filesToIndex.size()); + for (Path fileToIndex : filesToIndex) { + IndexerTask indexerTask = this.createIndexerTask(fileToIndex); + if (fileToIndex.getFileName().toString().endsWith(".tar.xz")) { + this.index.get(fileToIndex).indexerResult + = this.tarballsExecutor.submit(indexerTask); + } else { + this.index.get(fileToIndex).indexerResult + = this.flatFilesExecutor.submit(indexerTask); } } } - return new DirectoryNode( - directory.getName(), fileNodes.isEmpty() ? null : fileNodes, - directoryNodes.isEmpty() ? null : directoryNodes); - } - - private FileNode indexFile(File file) { - return new FileNode(file.getName(), file.length(), - dateTimeFormat.format(file.lastModified())); - } - - private void writeIndex(IndexNode indexNode) throws Exception { - indexJsonFile.getParentFile().mkdirs(); - String indexNodeString = IndexNode.makeJsonString(indexNode); - for (String filename : new String[] {indexJsonFile.toString(), - indexJsonFile + ".gz", indexJsonFile + ".xz", indexJsonFile + ".bz2"}) { - FileType type = FileType.valueOf( - filename.substring(filename.lastIndexOf(".") + 1).toUpperCase()); - try (BufferedWriter bufferedWriter - = new BufferedWriter(new OutputStreamWriter(type.outputStream( - new FileOutputStream(filename))))) { - bufferedWriter.write(indexNodeString); + } + + /** + * Create an indexer task for indexing the given file. + * + * <p>The reason why this is a separate method is that it can be overriden by + * tests that don't actually want to index files but instead provide their own + * index results.</p> + * + * @param fileToIndex File to index. + * @return Indexer task. + */ + protected IndexerTask createIndexerTask(Path fileToIndex) { + return new IndexerTask(fileToIndex); + } + + /** + * Create links in {@code htdocs/}, including all necessary parent + * directories. 
+ * + * @param linksToCreate Map of links to be created with keys being link paths + * and values being original file paths. + * @throws IOException Thrown if an I/O error occurs. + */ + private void createLinks(Map<Path, Path> linksToCreate) throws IOException { + if (!linksToCreate.isEmpty()) { + logger.info("Creating {} new link(s).", linksToCreate.size()); + for (Map.Entry<Path, Path> e : linksToCreate.entrySet()) { + Path linkPath = e.getKey(); + Path originalPath = e.getValue(); + Files.createDirectories(linkPath.getParent()); + Files.deleteIfExists(linkPath); + Files.createLink(linkPath, originalPath); + } + } + } + + /** + * Mark the given links for deletion in the in-memory index. + * + * @param linksToMarkForDeletion Files to be marked for deletion. + */ + private void markForDeletion(Set<FileNode> linksToMarkForDeletion, + Instant now) { + if (!linksToMarkForDeletion.isEmpty()) { + logger.info("Marking {} old link(s) for deletion.", + linksToMarkForDeletion.size()); + for (FileNode fileNode : linksToMarkForDeletion) { + fileNode.markedForDeletion = now; + } + } + } + + /** + * Delete the given links from {@code htdocs/}. + * + * @param linksToDelete Map of links to be deleted with keys being link paths + * and values being original file paths. + * @throws IOException Thrown if an I/O error occurs. + */ + private void deleteLinks(Map<Path, Path> linksToDelete) throws IOException { + if (!linksToDelete.isEmpty()) { + logger.info("Deleting {} old link(s).", linksToDelete.size()); + for (Map.Entry<Path, Path> e : linksToDelete.entrySet()) { + Path linkPath = e.getKey(); + Path originalPath = e.getValue(); + Files.deleteIfExists(linkPath); + index.remove(originalPath); } } }
+ /** + * Write the in-memory index to {@code index.json} and its compressed + * variants, but exclude files that have not yet been indexed or that are + * marked for deletion. + * + * @throws IOException Thrown if an I/O error occurs while writing files. + */ + private void writeIndex(SortedMap<Path, FileNode> index, + Instant now) throws IOException { + IndexNode indexNode = new IndexNode(); + indexNode.indexCreated = dateTimeFormatter.format(now); + indexNode.buildRevision = this.buildRevisionString; + indexNode.path = this.basePathString; + SortedMap<Path, DirectoryNode> directoryNodes = new TreeMap<>(); + for (Map.Entry<Path, FileNode> indexEntry : index.entrySet()) { + Path filePath = this.indexedPath.relativize(indexEntry.getKey()); + FileNode fileNode = indexEntry.getValue(); + if (null == fileNode.lastModified || null != fileNode.markedForDeletion) { + /* Skip unindexed or deleted files. */ + continue; + } + Path directoryPath = null; + DirectoryNode parentDirectoryNode = indexNode; + if (null != filePath.getParent()) { + for (Path pathPart : filePath.getParent()) { + directoryPath = null == directoryPath ? 
pathPart + : directoryPath.resolve(pathPart); + DirectoryNode directoryNode = directoryNodes.get(directoryPath); + if (null == directoryNode) { + directoryNode = new DirectoryNode(); + directoryNode.path = pathPart.toString(); + if (null == parentDirectoryNode.directories) { + parentDirectoryNode.directories = new ArrayList<>(); + } + parentDirectoryNode.directories.add(directoryNode); + directoryNodes.put(directoryPath, directoryNode); + } + parentDirectoryNode = directoryNode; + } + } + if (null == parentDirectoryNode.files) { + parentDirectoryNode.files = new ArrayList<>(); + } + parentDirectoryNode.files.add(fileNode); + } + Path htdocsIndexPath = this.indexJsonPath.getParent(); + try (OutputStream uncompressed + = Files.newOutputStream(htdocsIndexPath.resolve(".index.json.tmp")); + OutputStream bz2Compressed = new BZip2CompressorOutputStream( + Files.newOutputStream(htdocsIndexPath.resolve("index.json.bz2"))); + OutputStream gzCompressed = new GzipCompressorOutputStream( + Files.newOutputStream(htdocsIndexPath.resolve("index.json.gz"))); + OutputStream xzCompressed = new XZCompressorOutputStream( + Files.newOutputStream(htdocsIndexPath.resolve("index.json.xz")))) { + objectMapper.writeValue(uncompressed, indexNode); + objectMapper.writeValue(bz2Compressed, indexNode); + objectMapper.writeValue(gzCompressed, indexNode); + objectMapper.writeValue(xzCompressed, indexNode); + } + Files.move(htdocsIndexPath.resolve(".index.json.tmp"), this.indexJsonPath, + StandardCopyOption.REPLACE_EXISTING); + } }
diff --git a/src/main/java/org/torproject/metrics/collector/indexer/DirectoryNode.java b/src/main/java/org/torproject/metrics/collector/indexer/DirectoryNode.java new file mode 100644 index 0000000..a369d08 --- /dev/null +++ b/src/main/java/org/torproject/metrics/collector/indexer/DirectoryNode.java @@ -0,0 +1,36 @@ +/* Copyright 2019 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.collector.indexer; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonPropertyOrder; + +import java.util.List; + +/** + * Directory node in {@code index.json} which is discarded after reading and + * re-created before writing that file. + */ +@JsonPropertyOrder({ "path", "files", "directories" }) +class DirectoryNode { + + /** + * Relative path of the directory. + */ + @JsonProperty("path") + String path; + + /** + * List of file objects of files available from this directory. + */ + @JsonProperty("files") + List<FileNode> files; + + /** + * List of directory objects of directories available from this directory. 
+ */ + @JsonProperty("directories") + List<DirectoryNode> directories; +} + diff --git a/src/main/java/org/torproject/metrics/collector/indexer/FileNode.java b/src/main/java/org/torproject/metrics/collector/indexer/FileNode.java new file mode 100644 index 0000000..c007196 --- /dev/null +++ b/src/main/java/org/torproject/metrics/collector/indexer/FileNode.java @@ -0,0 +1,125 @@ +/* Copyright 2019 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.collector.indexer; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonPropertyOrder; + +import java.time.Instant; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.concurrent.Future; + +/** + * File node in {@code index.json}, also used for storing volatile metadata + * like whether a descriptor file is currently being indexed or whether its + * link in {@code htdocs/} is marked for deletion. + */ +@JsonPropertyOrder({ "path", "size", "last_modified", "types", + "first_published", "last_published", "sha256" }) +class FileNode { + + /** + * Relative path of the file. + */ + @JsonProperty("path") + String path; + + /** + * Size of the file in bytes. + */ + @JsonProperty("size") + Long size; + + /** + * Timestamp when the file was last modified using pattern + * {@code "YYYY-MM-DD HH:MM"} in the UTC timezone. + */ + @JsonProperty("last_modified") + String lastModified; + + /** + * Descriptor types as found in {@code @type} annotations of contained + * descriptors. + */ + @JsonProperty("types") + SortedSet<String> types; + + /** + * Earliest publication timestamp of contained descriptors using pattern + * {@code "YYYY-MM-DD HH:MM"} in the UTC timezone. + */ + @JsonProperty("first_published") + String firstPublished; + + /** + * Latest publication timestamp of contained descriptors using pattern + * {@code "YYYY-MM-DD HH:MM"} in the UTC timezone. 
+ */ + @JsonProperty("last_published") + String lastPublished; + + /** + * SHA-256 digest of this file. + */ + @JsonProperty("sha256") + String sha256; + + /** + * Indexer result that will be available as soon as the indexer has completed + * its task. + */ + @JsonIgnore + Future<FileNode> indexerResult; + + /** + * Timestamp when this file was first not found anymore in {@code indexed/}, + * used to keep the link in {@code htdocs/} around for another 2 hours before + * deleting it, too. + * + * <p>This field is ignored when writing {@code index.json}, because it's an + * internal detail that nobody else cares about. The effect is that links + * might be around for longer than 2 hours in case of a restart, which seems + * acceptable.</p> + */ + @JsonIgnore + Instant markedForDeletion; + + /** + * Create and return a {@link FileNode} instance with the given values. + * + * @param path Relative path of the file. + * @param size Size of the file in bytes. + * @param lastModified Timestamp when the file was last modified using pattern + * {@code "YYYY-MM-DD HH:MM"} in the UTC timezone. + * @param types Descriptor types as found in {@code @type} annotations of + * contained descriptors. + * @param firstPublished Earliest publication timestamp of contained + * descriptors using pattern {@code "YYYY-MM-DD HH:MM"} in the UTC + * timezone. + * @param lastPublished Latest publication timestamp of contained descriptors + * using pattern {@code "YYYY-MM-DD HH:MM"} in the UTC timezone. + * @param sha256 SHA-256 digest of this file. + * + * @return {@link FileNode} instance with the given values. 
+ */ + static FileNode of(String path, Long size, String lastModified, + Iterable<String> types, String firstPublished, String lastPublished, + String sha256) { + FileNode fileNode = new FileNode(); + fileNode.path = path; + fileNode.size = size; + fileNode.lastModified = lastModified; + fileNode.types = new TreeSet<>(); + for (String type : types) { + fileNode.types.add(type); + } + fileNode.firstPublished = firstPublished; + fileNode.lastPublished = lastPublished; + fileNode.sha256 = sha256; + return fileNode; + } +} + diff --git a/src/main/java/org/torproject/metrics/collector/indexer/IndexNode.java b/src/main/java/org/torproject/metrics/collector/indexer/IndexNode.java new file mode 100644 index 0000000..8b7a46b --- /dev/null +++ b/src/main/java/org/torproject/metrics/collector/indexer/IndexNode.java @@ -0,0 +1,30 @@ +/* Copyright 2019 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.collector.indexer; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonPropertyOrder; + +/** + * Root node in {@code index.json} containing additional information about index + * creation time or Git revision used for creating it. + */ +@JsonPropertyOrder({ "index_created", "build_revision", "path", "files", + "directories" }) +class IndexNode extends DirectoryNode { + + /** + * Timestamp when this index was created using pattern + * {@code "YYYY-MM-DD HH:MM"} in the UTC timezone. + */ + @JsonProperty("index_created") + String indexCreated; + + /** + * Git revision of this software. 
+ */ + @JsonProperty("build_revision") + String buildRevision; +} + diff --git a/src/main/java/org/torproject/metrics/collector/indexer/IndexerTask.java b/src/main/java/org/torproject/metrics/collector/indexer/IndexerTask.java new file mode 100644 index 0000000..03c750b --- /dev/null +++ b/src/main/java/org/torproject/metrics/collector/indexer/IndexerTask.java @@ -0,0 +1,225 @@ +/* Copyright 2019 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.collector.indexer; + +import org.torproject.descriptor.BandwidthFile; +import org.torproject.descriptor.BridgeNetworkStatus; +import org.torproject.descriptor.BridgePoolAssignment; +import org.torproject.descriptor.BridgedbMetrics; +import org.torproject.descriptor.Descriptor; +import org.torproject.descriptor.DescriptorSourceFactory; +import org.torproject.descriptor.DirectoryKeyCertificate; +import org.torproject.descriptor.ExitList; +import org.torproject.descriptor.ExtraInfoDescriptor; +import org.torproject.descriptor.Microdescriptor; +import org.torproject.descriptor.RelayDirectory; +import org.torproject.descriptor.RelayNetworkStatus; +import org.torproject.descriptor.RelayNetworkStatusConsensus; +import org.torproject.descriptor.RelayNetworkStatusVote; +import org.torproject.descriptor.ServerDescriptor; +import org.torproject.descriptor.SnowflakeStats; +import org.torproject.descriptor.TorperfResult; +import org.torproject.descriptor.UnparseableDescriptor; +import org.torproject.descriptor.WebServerAccessLog; + +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.SortedSet; +import java.util.TreeSet; 
+import java.util.concurrent.Callable; + +/** + * Callable task that indexes a given descriptor file. + */ +class IndexerTask implements Callable<FileNode> { + + /** + * Class logger. + */ + private static final Logger logger + = LoggerFactory.getLogger(IndexerTask.class); + + /** + * Formatter for all timestamps found in {@code index.json}. + */ + private static DateTimeFormatter dateTimeFormatter = DateTimeFormatter + .ofPattern("uuuu-MM-dd HH:mm").withZone(ZoneOffset.UTC); + + /** + * Path to the descriptor file to index. + */ + private Path path; + + /** + * Index results object, which starts out empty and gets populated as indexing + * proceeds. + */ + private FileNode indexResult; + + /** + * Create a new instance to parse the given descriptor file, but don't start + * parsing just yet. + * + * @param path Descriptor file to index. + */ + IndexerTask(Path path) { + this.path = path; + } + + /** + * Index the given file and return index results when done. + * + * @return Index results. + * @throws IOException Thrown if an I/O error occurs. + */ + @Override + public FileNode call() throws IOException { + this.indexResult = new FileNode(); + this.requestBasicFileAttributes(); + this.computeFileDigest(); + this.parseDescriptorFile(); + return this.indexResult; + } + + /** + * Request and store basic file attributes like file name, last-modified time, + * and size. + * + * @throws IOException Thrown if an I/O error occurs. + */ + private void requestBasicFileAttributes() throws IOException { + this.indexResult.path = this.path.getFileName().toString(); + this.indexResult.lastModified = dateTimeFormatter + .format(Files.getLastModifiedTime(this.path).toInstant()); + this.indexResult.size = Files.size(this.path); + } + + /** + * Compute and store the file's SHA-256 digest. + * + * @throws IOException Thrown if an I/O error occurs. 
+ */ + private void computeFileDigest() throws IOException { + try (InputStream stream = Files.newInputStream(this.path)) { + this.indexResult.sha256 + = Base64.encodeBase64String(DigestUtils.sha256(stream)); + } + } + + /** + * Parse the descriptor file to extract contained descriptor types and first + * and last published time. + */ + private void parseDescriptorFile() { + Long firstPublishedMillis = null; + Long lastPublishedMillis = null; + this.indexResult.types = new TreeSet<>(); + SortedSet<String> unknownDescriptorSubclasses = new TreeSet<>(); + for (Descriptor descriptor : DescriptorSourceFactory + .createDescriptorReader().readDescriptors(this.path.toFile())) { + if (descriptor instanceof UnparseableDescriptor) { + /* Skip unparseable descriptor. */ + continue; + } + for (String annotation : descriptor.getAnnotations()) { + if (annotation.startsWith("@type ")) { + this.indexResult.types.add(annotation.substring(6)); + } + } + Long publishedMillis; + if (descriptor instanceof BandwidthFile) { + BandwidthFile bandwidthFile = (BandwidthFile) descriptor; + LocalDateTime fileCreatedOrTimestamp + = bandwidthFile.fileCreated().isPresent() + ? 
bandwidthFile.fileCreated().get() + : bandwidthFile.timestamp(); + publishedMillis = fileCreatedOrTimestamp + .toInstant(ZoneOffset.UTC).toEpochMilli(); + } else if (descriptor instanceof BridgeNetworkStatus) { + publishedMillis = ((BridgeNetworkStatus) descriptor) + .getPublishedMillis(); + } else if (descriptor instanceof BridgePoolAssignment) { + publishedMillis = ((BridgePoolAssignment) descriptor) + .getPublishedMillis(); + } else if (descriptor instanceof BridgedbMetrics) { + publishedMillis = ((BridgedbMetrics) descriptor) + .bridgedbMetricsEnd().toInstant(ZoneOffset.UTC).toEpochMilli(); + } else if (descriptor instanceof DirectoryKeyCertificate) { + publishedMillis = ((DirectoryKeyCertificate) descriptor) + .getDirKeyPublishedMillis(); + } else if (descriptor instanceof ExitList) { + publishedMillis = ((ExitList) descriptor) + .getDownloadedMillis(); + } else if (descriptor instanceof ExtraInfoDescriptor) { + publishedMillis = ((ExtraInfoDescriptor) descriptor) + .getPublishedMillis(); + } else if (descriptor instanceof Microdescriptor) { + /* Microdescriptors don't contain useful timestamps for this purpose, + * but we already knew that, so there's no need to log a warning + * further down below. 
*/ + continue; + } else if (descriptor instanceof RelayDirectory) { + publishedMillis = ((RelayDirectory) descriptor) + .getPublishedMillis(); + } else if (descriptor instanceof RelayNetworkStatus) { + publishedMillis = ((RelayNetworkStatus) descriptor) + .getPublishedMillis(); + } else if (descriptor instanceof RelayNetworkStatusConsensus) { + publishedMillis = ((RelayNetworkStatusConsensus) descriptor) + .getValidAfterMillis(); + } else if (descriptor instanceof RelayNetworkStatusVote) { + publishedMillis = ((RelayNetworkStatusVote) descriptor) + .getValidAfterMillis(); + } else if (descriptor instanceof ServerDescriptor) { + publishedMillis = ((ServerDescriptor) descriptor) + .getPublishedMillis(); + } else if (descriptor instanceof SnowflakeStats) { + publishedMillis = ((SnowflakeStats) descriptor) + .snowflakeStatsEnd().toInstant(ZoneOffset.UTC).toEpochMilli(); + } else if (descriptor instanceof TorperfResult) { + publishedMillis = ((TorperfResult) descriptor) + .getStartMillis(); + } else if (descriptor instanceof WebServerAccessLog) { + publishedMillis = ((WebServerAccessLog) descriptor) + .getLogDate().atStartOfDay(ZoneOffset.UTC) + .toInstant().toEpochMilli(); + } else { + /* Skip published timestamp if descriptor type is unknown or doesn't + * contain such a timestamp. */ + unknownDescriptorSubclasses.add( + descriptor.getClass().getSimpleName()); + continue; + } + if (null == firstPublishedMillis + || publishedMillis < firstPublishedMillis) { + firstPublishedMillis = publishedMillis; + } + if (null == lastPublishedMillis + || publishedMillis > lastPublishedMillis) { + lastPublishedMillis = publishedMillis; + } + } + if (!unknownDescriptorSubclasses.isEmpty()) { + logger.warn("Ran into unknown/unexpected Descriptor subclass(es) in " + + "{}: {}. Ignoring for index.json, but maybe worth looking into.", + this.path, unknownDescriptorSubclasses); + } + this.indexResult.firstPublished = null == firstPublishedMillis ? 
null + : dateTimeFormatter.format(Instant.ofEpochMilli(firstPublishedMillis)); + this.indexResult.lastPublished = null == lastPublishedMillis ? null + : dateTimeFormatter.format(Instant.ofEpochMilli(lastPublishedMillis)); + } +} + diff --git a/src/main/resources/collector.properties b/src/main/resources/collector.properties index e7cadf7..65e0f99 100644 --- a/src/main/resources/collector.properties +++ b/src/main/resources/collector.properties @@ -76,16 +76,11 @@ BridgedbMetricsOffsetMinutes = 340 # The URL of this instance. This will be the base URL # written to index.json, i.e. please change this to the mirrors url! InstanceBaseUrl = https://collector.torproject.org -# The target location for index.json and its compressed -# versions index.json.gz, index.json.bz2, and index.json.xz -IndexPath = index # The top-level directory for archived descriptors. -ArchivePath = archive -# The top-level directory for third party data. -ContribPath = contrib +IndexedPath = indexed # The top-level directory for the recent descriptors that were # published in the last 72 hours. -RecentPath = recent +RecentPath = indexed/recent # The top-level directory for the retrieved descriptors that will # be archived. OutputPath = out @@ -93,6 +88,11 @@ OutputPath = out StatsPath = stats # Path for descriptors downloaded from other instances SyncPath = sync +# Directory served via an external web server and managed by us which contains +# (hard) links to files in ArchivePath and RecentPath and which therefore must +# be located on the same file system. Also contains index.json and its +# compressed versions index.json.gz, index.json.bz2, and index.json.xz. 
+HtdocsPath = htdocs ######## Relay descriptors ######## # ## Define descriptor sources diff --git a/src/main/resources/create-tarballs.sh b/src/main/resources/create-tarballs.sh index 5802020..695bb24 100755 --- a/src/main/resources/create-tarballs.sh +++ b/src/main/resources/create-tarballs.sh @@ -10,7 +10,7 @@ # Configuration section: # The following path should be adjusted, if the CollecTor server layout differs. # All paths should be given absolute. -ARCHIVEDIR="/srv/collector.torproject.org/collector/archive" +ARCHIVEDIR="/srv/collector.torproject.org/collector/indexed/archive" WORKDIR="/srv/collector.torproject.org/collector/tarballs" OUTDIR="/srv/collector.torproject.org/collector/out" TARBALLTARGETDIR="/srv/collector.torproject.org/collector/data" diff --git a/src/test/java/org/torproject/metrics/collector/conf/ConfigurationTest.java b/src/test/java/org/torproject/metrics/collector/conf/ConfigurationTest.java index 7e9ea28..887f3ae 100644 --- a/src/test/java/org/torproject/metrics/collector/conf/ConfigurationTest.java +++ b/src/test/java/org/torproject/metrics/collector/conf/ConfigurationTest.java @@ -39,7 +39,7 @@ public class ConfigurationTest { public void testKeyCount() { assertEquals("The number of properties keys in enum Key changed." + "\n This test class should be adapted.", - 71, Key.values().length); + 70, Key.values().length); }
@Test() diff --git a/src/test/java/org/torproject/metrics/collector/indexer/CreateIndexJsonTest.java b/src/test/java/org/torproject/metrics/collector/indexer/CreateIndexJsonTest.java new file mode 100644 index 0000000..db00032 --- /dev/null +++ b/src/test/java/org/torproject/metrics/collector/indexer/CreateIndexJsonTest.java @@ -0,0 +1,522 @@ +/* Copyright 2019 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.collector.indexer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.torproject.metrics.collector.conf.Configuration; +import org.torproject.metrics.collector.conf.Key; + +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.attribute.FileTime; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Test class for {@link CreateIndexJson}. + */ +public class CreateIndexJsonTest { + + /** + * Mocked indexer task that does not actually index a file but instead waits + * for the test class to set index results. + */ + static class MockedIndexerTask extends IndexerTask { + + /** + * Index result, to be set by the test. + */ + private FileNode result; + + /** + * Create a new mocked indexer task for the given path. + * + * @param path Path to index. + */ + MockedIndexerTask(Path path) { + super(path); + } + + /** + * Set index results. + * + * @param result Index results. + */ + synchronized void setResult(FileNode result) { + this.result = result; + this.notifyAll(); + } + + /** + * Execute the task by waiting for the test to set index results. + * + * @return Index results provided by the test. 
+ */ + @Override + public FileNode call() { + synchronized (this) { + while (null == result) { + try { + wait(); + } catch (InterruptedException e) { + /* Don't care about being interrupted, just keep waiting. */ + } + } + return this.result; + } + } + } + + /** + * List of mocked indexer tasks in the order of creation. + */ + private List<MockedIndexerTask> indexerTasks = new ArrayList<>(); + + /** + * Testable version of the class under test. + */ + class TestableCreateIndexJson extends CreateIndexJson { + + /** + * Create a new instance with the given configuration. + * + * @param configuration Configuration for this test. + */ + TestableCreateIndexJson(Configuration configuration) { + super(configuration); + } + + /** + * Create an indexer task that doesn't actually index a file but that can + * be controlled by the test, and add that task to the list of tasks. + * + * @param fileToIndex File to index. + * @return Created (mocked) indexer task. + */ + @Override + protected IndexerTask createIndexerTask(Path fileToIndex) { + MockedIndexerTask indexerTask = new MockedIndexerTask(fileToIndex); + indexerTasks.add(indexerTask); + return indexerTask; + } + + /** + * Return {@code null} as build revision string to make it easier to compare + * written {@code index.json} files in tests. + * + * @return Always {@code null}. + */ + @Override + protected String obtainBuildRevision() { + return null; + } + } + + /** + * Temporary folder containing all files for this test. + */ + @Rule + public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + /** + * Path to recent exit list file in {@code indexed/recent/}. + */ + private Path recentExitListFilePath; + + /** + * Path to archived exit list file in {@code indexed/archive/}. + */ + private Path archiveExitListFilePath; + + /** + * Path to exit list link in {@code htdocs/recent/}. + */ + private Path recentExitListLinkPath; + + /** + * Path to {@code index.json} file in {@code htdocs/index/}. 
+ */ + private Path indexJsonPath; + + /** + * Class under test. + */ + private CreateIndexJson cij; + + /** + * Prepares the temporary folder and configuration for this test. + * + * @throws IOException Thrown if an I/O error occurs. + */ + @Before + public void prepareDirectoriesAndConfiguration() throws IOException { + Path indexedPath = this.temporaryFolder.newFolder("indexed").toPath(); + this.recentExitListFilePath = indexedPath.resolve( + Paths.get("recent", "exit-lists", "2016-09-20-13-02-00")); + this.archiveExitListFilePath = indexedPath.resolve( + Paths.get("archive", "exit-lists", "exit-list-2016-09.tar.xz")); + Path htdocsPath = this.temporaryFolder.newFolder("htdocs").toPath(); + this.recentExitListLinkPath = htdocsPath.resolve( + Paths.get("recent", "exit-lists", "2016-09-20-13-02-00")); + this.indexJsonPath = htdocsPath.resolve( + Paths.get("index", "index.json")); + Configuration configuration = new Configuration(); + configuration.setProperty(Key.IndexedPath.name(), + indexedPath.toAbsolutePath().toString()); + configuration.setProperty(Key.HtdocsPath.name(), + htdocsPath.toAbsolutePath().toString()); + configuration.setProperty(Key.InstanceBaseUrl.name(), + "https://collector.torproject.org"); + this.cij = new TestableCreateIndexJson(configuration); + } + + /** + * First execution time. + */ + private static final Instant firstExecution + = Instant.parse("2016-09-20T13:04:00Z"); + + /** + * Second execution time, two minutes after the first execution time, which is + * the default rate for executing this module. + */ + private static final Instant secondExecution + = Instant.parse("2016-09-20T13:06:00Z"); + + /** + * Third execution, three hours later than the second execution time, to see + * if links to files that have been marked for deletion are actually deleted. + */ + private static final Instant thirdExecution + = Instant.parse("2016-09-20T16:06:00Z"); + + /** + * Index result from indexing recent exit list. 
+ */ + private FileNode recentExitListFileNode = FileNode.of( + "2016-09-20-13-02-00", 177_090L, "2016-09-20 13:02", + Collections.singletonList("tordnsel 1.0"), "2016-09-20 13:02", + "2016-09-20 13:02", "4aXdw+jQ5O33AS8n+fUOwD5ZzHCICnwzvxkK8fWDhdw="); + + /** + * Index result from indexing archived exit list. + */ + private FileNode archiveExitListFileNode = FileNode.of( + "exit-list-2016-09.tar.xz", 1_008_748L, "2016-10-04 03:31", + Collections.singletonList("tordnsel 1.0"), "2016-09-01 00:02", + "2016-09-30 23:02", "P4zUKVOJFtKzxOXpN3NLU0UBZTBqCAM95yDPJ5JH62g="); + + /** + * Index result from indexing <i>updated</i> archived exit list. + */ + private FileNode updatedArchiveExitListFileNode = FileNode.of( + "exit-list-2016-09.tar.xz", 1_008_748L, "2016-10-07 03:31", + Collections.singletonList("tordnsel 1.0"), "2016-09-01 00:02", + "2016-09-30 23:02", "P4zUKVOJFtKzxOXpN3NLU0UBZTBqCAM95yDPJ5JH62g="); + + /** + * Finish the oldest indexer task by providing the given file node as index + * result. + * + * @param fileNode Index result. + */ + private void finishIndexing(FileNode fileNode) { + assertFalse(this.indexerTasks.isEmpty()); + this.indexerTasks.remove(0).setResult(fileNode); + } + + /** + * (Almost) empty {@code index.json} file. + */ + private static final String emptyIndexJsonString + = "{\"index_created\":\"2016-09-20 13:06\"," + + "\"path\":\"https://collector.torproject.org\"}"; + + /** + * {@code index.json} file containing a single recent exit list. 
+ */ + private static final String recentExitListIndexJsonString + = "{\"index_created\":\"2016-09-20 13:06\"," + + "\"path\":\"https://collector.torproject.org\",\"directories\":[{" + + "\"path\":\"recent\",\"directories\":[{" + + "\"path\":\"exit-lists\",\"files\":[{" + + "\"path\":\"2016-09-20-13-02-00\",\"size\":177090," + + "\"last_modified\":\"2016-09-20 13:02\"," + + "\"types\":[\"tordnsel 1.0\"]," + + "\"first_published\":\"2016-09-20 13:02\"," + + "\"last_published\":\"2016-09-20 13:02\"," + + "\"sha256\":\"4aXdw+jQ5O33AS8n+fUOwD5ZzHCICnwzvxkK8fWDhdw=\"}]}]}]}"; + + /** + * {@code index.json} file containing a single archived exit list with a + * placeholder for the last-modified time. + */ + private static final String archiveExitListIndexJsonString + = "{\"index_created\":\"2016-09-20 13:06\"," + + "\"path\":\"https://collector.torproject.org\",\"directories\":[{" + + "\"path\":\"archive\",\"directories\":[{" + + "\"path\":\"exit-lists\",\"files\":[{" + + "\"path\":\"exit-list-2016-09.tar.xz\",\"size\":1008748," + + "\"last_modified\":\"%s\"," + + "\"types\":[\"tordnsel 1.0\"]," + + "\"first_published\":\"2016-09-01 00:02\"," + + "\"last_published\":\"2016-09-30 23:02\"," + + "\"sha256\":\"P4zUKVOJFtKzxOXpN3NLU0UBZTBqCAM95yDPJ5JH62g=\"}]}]}]}"; + + /** + * Delete the given file. + * + * @param fileToDelete Path to file to delete. + */ + private static void deleteFile(Path fileToDelete) { + try { + Files.delete(fileToDelete); + } catch (IOException e) { + fail(String.format("I/O error while deleting %s.", fileToDelete)); + } + } + + /** + * Create the given file. + * + * @param fileToCreate Path to file to create. + * @param lastModified Last-modified time of file to create. 
+ */ + private static void createFile(Path fileToCreate, Instant lastModified) { + try { + Files.createDirectories(fileToCreate.getParent()); + Files.createFile(fileToCreate); + Files.setLastModifiedTime(fileToCreate, FileTime.from(lastModified)); + } catch (IOException e) { + fail(String.format("I/O error while creating %s.", fileToCreate)); + } + } + + /** + * Return whether the given file exists. + * + * @param fileToCheck Path to file to check. + * @return Whether the file exists. + */ + private boolean fileExists(Path fileToCheck) { + return Files.exists(fileToCheck); + } + + /** + * Change last-modified time of the given file. + * + * @param fileToChange File to change. + * @param lastModified New last-modified time. + */ + private void changeLastModified(Path fileToChange, Instant lastModified) { + try { + Files.setLastModifiedTime(fileToChange, FileTime.from(lastModified)); + } catch (IOException e) { + fail(String.format("I/O error while changing last-modified time of %s.", + fileToChange)); + } + } + + /** + * Write the given string to the {@code index.json} file. + * + * @param indexJsonString String to write. + * @param formatArguments Optional format arguments. + */ + private void writeIndexJson(String indexJsonString, + Object ... formatArguments) { + try { + Files.createDirectories(indexJsonPath.getParent()); + Files.write(indexJsonPath, + String.format(indexJsonString, formatArguments).getBytes()); + } catch (IOException e) { + fail("I/O error while writing index.json file."); + } + } + + /** + * Read and return the first line from the {@code index.json} file. + * + * @return First line from the {@code index.json} file. + */ + private String readIndexJson() { + try { + return Files.readAllLines(indexJsonPath).get(0); + } catch (IOException e) { + fail("I/O error while reading index.json file."); + return null; + } + } + + /** + * Run the module with the given system time. + * + * @param now Time when running the module. 
+ */ + private void startProcessing(Instant now) { + this.cij.startProcessing(now); + } + + /** + * Test whether two executions on an empty {@code indexed/} directory produce + * an {@code index.json} file without any files or directories. + */ + @Test + public void testEmptyDirs() { + startProcessing(firstExecution); + startProcessing(secondExecution); + assertEquals(emptyIndexJsonString, readIndexJson()); + } + + /** + * Test whether a new exit list in {@code indexed/recent/} gets indexed and + * then included in {@code index.json}. + */ + @Test + public void testNewRecentExitList() { + createFile(recentExitListFilePath, Instant.parse("2016-09-20T13:02:00Z")); + startProcessing(firstExecution); + finishIndexing(this.recentExitListFileNode); + startProcessing(secondExecution); + assertEquals(recentExitListIndexJsonString, readIndexJson()); + } + + /** + * Test whether an existing exit list in {@code indexed/recent/} that is + * already contained in {@code index.json} gets ignored by the indexers. + */ + @Test + public void testExistingRecentExitList() { + createFile(recentExitListFilePath, Instant.parse("2016-09-20T13:02:00Z")); + writeIndexJson(recentExitListIndexJsonString); + startProcessing(firstExecution); + startProcessing(secondExecution); + assertEquals(recentExitListIndexJsonString, readIndexJson()); + } + + /** + * Test whether a deleted exit list in {@code indexed/recent/} is first + * removed from {@code index.json} and later deleted from + * {@code htdocs/recent/}. 
+ */ + @Test + public void testDeletedRecentExitList() { + createFile(recentExitListFilePath, Instant.parse("2016-09-20T13:02:00Z")); + writeIndexJson(recentExitListIndexJsonString); + startProcessing(firstExecution); + assertTrue(fileExists(recentExitListLinkPath)); + deleteFile(recentExitListFilePath); + startProcessing(secondExecution); + assertEquals(emptyIndexJsonString, readIndexJson()); + fileExists(recentExitListLinkPath); + assertTrue(fileExists(recentExitListLinkPath)); + startProcessing(thirdExecution); + assertFalse(fileExists(recentExitListLinkPath)); + } + + /** + * Test whether a link in {@code htdocs/recent/} for which no corresponding + * file in {@code indexed/recent/} exists is eventually deleted. + */ + @Test + public void testDeletedLink() { + createFile(recentExitListLinkPath, Instant.parse("2016-09-20T13:02:00Z")); + startProcessing(firstExecution); + assertTrue(Files.exists(recentExitListLinkPath)); + startProcessing(secondExecution); + assertTrue(Files.exists(recentExitListLinkPath)); + startProcessing(thirdExecution); + assertFalse(Files.exists(recentExitListLinkPath)); + } + + /** + * Test whether a tarball that gets deleted while being indexed is not + * included in {@code index.json} even after indexing is completed. + */ + @Test + public void testIndexingDisappearingTarball() { + createFile(recentExitListFilePath, Instant.parse("2016-09-20T13:02:00Z")); + startProcessing(firstExecution); + deleteFile(recentExitListFilePath); + finishIndexing(recentExitListFileNode); + startProcessing(secondExecution); + assertEquals(emptyIndexJsonString, readIndexJson()); + } + + /** + * Test whether a tarball that gets updated in {@code indexed/archive/} gets + * re-indexed and updated in {@code index.json}. 
+ */ + @Test + public void testUpdatedFile() { + writeIndexJson(archiveExitListIndexJsonString, "2016-10-04 03:31"); + createFile(archiveExitListFilePath, Instant.parse("2016-10-07T03:31:00Z")); + startProcessing(firstExecution); + finishIndexing(updatedArchiveExitListFileNode); + startProcessing(secondExecution); + assertEquals(String.format(archiveExitListIndexJsonString, + "2016-10-07 03:31"), readIndexJson()); + } + + /** + * Test whether a tarball that gets updated while being indexed is not + * included in {@code index.json} even after indexing is completed. + */ + @Test + public void testUpdateFileWhileIndexing() { + createFile(archiveExitListFilePath, Instant.parse("2016-10-07T03:31:00Z")); + startProcessing(firstExecution); + changeLastModified(archiveExitListFilePath, + Instant.parse("2016-10-07T03:31:00Z")); + finishIndexing(archiveExitListFileNode); + startProcessing(secondExecution); + assertEquals(String.format(archiveExitListIndexJsonString, + "2016-10-04 03:31"), readIndexJson()); + } + + /** + * Test whether a tarball that gets updated after being indexed but before + * being included in {@code index.json} is not being updated in + * {@code index.json} until the updated file is being indexed. */ + @Test + public void testUpdateFileAfterIndexing() { + createFile(archiveExitListFilePath, Instant.parse("2016-10-04T03:31:00Z")); + startProcessing(firstExecution); + finishIndexing(archiveExitListFileNode); + changeLastModified(archiveExitListFilePath, + Instant.parse("2016-10-07T03:31:00Z")); + startProcessing(secondExecution); + assertEquals(String.format(archiveExitListIndexJsonString, + "2016-10-04 03:31"), readIndexJson()); + } + + /** + * Test whether a long-running indexer task is being given the time to finish, + * rather than starting another task for the same file. 
+ */ + @Test + public void testLongRunningIndexerTask() { + createFile(archiveExitListFilePath, Instant.parse("2016-10-04T03:31:00Z")); + startProcessing(firstExecution); + startProcessing(secondExecution); + assertEquals(emptyIndexJsonString, readIndexJson()); + finishIndexing(archiveExitListFileNode); + startProcessing(thirdExecution); + assertTrue(this.indexerTasks.isEmpty()); + } +} + diff --git a/src/test/java/org/torproject/metrics/collector/indexer/IndexerTaskTest.java b/src/test/java/org/torproject/metrics/collector/indexer/IndexerTaskTest.java new file mode 100644 index 0000000..8e5e6f4 --- /dev/null +++ b/src/test/java/org/torproject/metrics/collector/indexer/IndexerTaskTest.java @@ -0,0 +1,226 @@ +/* Copyright 2019 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.collector.indexer; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.FileTime; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.util.Arrays; +import java.util.Collection; +import java.util.SortedSet; +import java.util.TreeSet; + +/** + * Test class for {@link IndexerTask}. 
+ */
+@RunWith(Parameterized.class)
+public class IndexerTaskTest {
+
+  @Parameterized.Parameter
+  public String path;
+
+  @Parameterized.Parameter(1)
+  public Long size;
+
+  @Parameterized.Parameter(2)
+  public String lastModified;
+
+  @Parameterized.Parameter(3)
+  public String[] types;
+
+  @Parameterized.Parameter(4)
+  public String firstPublished;
+
+  @Parameterized.Parameter(5)
+  public String lastPublished;
+
+  @Parameterized.Parameter(6)
+  public String sha256;
+
+  /**
+   * Initialize test parameters.
+   *
+   * @return Test parameters.
+   */
+  @Parameterized.Parameters
+  public static Collection<Object[]> pathFilename() {
+    return Arrays.asList(new Object[][]{
+
+        {"2016-09-20-13-00-00-consensus", /* Path in src/test/resources/ */
+            1_618_103L, /* Size in bytes */
+            "2017-09-07 12:13", /* Last-modified time */
+            new String[] { "network-status-consensus-3 1.17" }, /* Types */
+            "2016-09-20 13:00", /* First published */
+            "2016-09-20 13:00", /* Last published */
+            "3mLpDZmP/NSgOgmuPDyljxh0Lup1L6FtD16266ZCGAw="}, /* SHA-256 */
+
+        {"2016-09-20-13-00-00-vote-49015F787433103580E3B66A1707A00E60F2D15B-"
+            + "60ADC6BEC262AE921A1037D54C8A3976367DBE87",
+            3_882_514L,
+            "2017-09-07 12:13",
+            new String[] { "network-status-vote-3 1.17" },
+            "2016-09-20 13:00",
+            "2016-09-20 13:00",
+            "UCnSSrvdm26dJOriFgEQNQVrBLpVKbH/fF0VPRX3TGc="},
+
+        {"2016-09-20-13-02-00",
+            177_090L,
+            "2017-01-13 16:55",
+            new String[] { "tordnsel 1.0" },
+            "2016-09-20 13:02",
+            "2016-09-20 13:02",
+            "4aXdw+jQ5O33AS8n+fUOwD5ZzHCICnwzvxkK8fWDhdw="},
+
+        {"2016-10-01-16-00-00-vote-0232AF901C31A04EE9848595AF9BB7620D4C5B2E-"
+            + "FEE63B4AB7CE5A6BDD09E9A5C4F01BD61EB7E4F1",
+            3_226_152L,
+            "2017-01-13 16:55",
+            new String[] { "network-status-vote-3 1.0" },
+            "2016-10-01 16:00",
+            "2016-10-01 16:00",
+            "bilv6zEXr0Y9f5o24RMN0lUujsJJiSQAn9LkG0XJrZE="},
+
+        {"2016-10-02-17-00-00-consensus-microdesc",
+            1_431_627L,
+            "2017-09-07 12:13",
+            new String[] { "network-status-microdesc-consensus-3 1.17" },
+            "2016-10-02 17:00",
+            "2016-10-02 17:00",
+            "rrkxuLahYENLExX99Jio587/kUz9NtOoaYyKXxvX5EA="},
+
+        {"20160920-063816-1D8F3A91C37C5D1C4C19B1AD1D0CFBE8BF72D8E1",
+            339_256L,
+            "2017-09-07 12:13",
+            new String[] { "bridge-network-status 1.17" },
+            "2016-09-20 06:38",
+            "2016-09-20 06:38",
+            "sMAcyFrZ2rxj50b6iGe3icCNMC4gBSA1y9ZH4EWTa8s="},
+
+        {"bridge-2016-10-02-08-09-00-extra-infos",
+            11_561L,
+            "2017-09-07 12:13",
+            new String[] { "bridge-extra-info 1.3" },
+            "2016-10-02 06:09",
+            "2016-10-02 06:09",
+            "hat+vbyE04eH9JBQa0s6ezB6sLaStUUhvUj8CZ1aoEY="},
+
+        {"bridge-2016-10-02-16-09-00-server-descriptors",
+            5_336L,
+            "2017-01-13 16:55",
+            new String[] { "bridge-server-descriptor 1.2" },
+            "2016-10-02 14:09",
+            "2016-10-02 14:09",
+            "6CtHdo+eRFOi5xBjJcOVszC1hibC5gTB+YWvn1VmIIc="},
+
+        {"moria-1048576-2016-10-05.tpf",
+            20_405L,
+            "2017-09-07 12:13",
+            new String[0],
+            null,
+            null,
+            "DZyk6c0lQQ7OVZo1cmA+SuxPA+1thmuiooVifQPPOiA="},
+
+        {"op-nl-1048576-2017-04-11.tpf",
+            4_220L,
+            "2017-09-20 12:14",
+            new String[] { "torperf 1.1" },
+            "2017-04-11 06:24",
+            "2017-04-11 15:54",
+            "Gwex5yN3+s2PrhekjA68XmPg+UorOfx7mUa4prd7Dt8="},
+
+        {"relay-2016-10-02-08-05-00-extra-infos",
+            20_541L,
+            "2017-01-13 16:55",
+            new String[] { "extra-info 1.0" },
+            "2016-10-02 07:01",
+            "2016-10-02 07:01",
+            "3ZSO3+9ed9OwMVPx2LcVIiJfC+O30eEXEdbz64Hrp0w="},
+
+        {"relay-2016-10-02-16-05-00-server-descriptors",
+            17_404L,
+            "2017-01-13 16:55",
+            new String[] { "server-descriptor 1.0" },
+            "2016-10-02 14:58",
+            "2016-10-02 15:01",
+            "uWKHHzq4+oVNdOGh0mfkLUSjwGrBlLtEtN2DtF5qcLU="},
+
+        {"siv-1048576-2016-10-03.tpf",
+            39_193L,
+            "2017-01-13 16:55",
+            new String[] { "torperf 1.0" },
+            "2016-10-03 00:02",
+            "2016-10-03 23:32",
+            "paaFPI6BVuIDQ32aIuHYNCuKmBvFxsDvVCCwp+oM0GE="},
+
+        {"torperf-51200-2016-10-02.tpf",
+            233_763L,
+            "2017-01-13 16:55",
+            new String[] { "torperf 1.0" },
+            "2016-10-02 00:00",
+            "2016-10-02 23:55",
+            "fqeVAXamvB4yQ/8UlZAxhJx0+1Y7IfipqIpOUqQ57rE="}
+    });
+  }
+
+  /**
+   * Formatter for all timestamps found in {@code index.json}.
+   */
+  private static DateTimeFormatter dateTimeFormatter = DateTimeFormatter
+      .ofPattern("uuuu-MM-dd HH:mm").withZone(ZoneOffset.UTC);
+
+  /**
+   * Temporary folder containing all files for this test.
+   */
+  @Rule
+  public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+  /**
+   * Test indexing a file.
+   *
+   * @throws IOException Thrown if an I/O error occurs.
+   */
+  @Test
+  public void testIndexFile() throws IOException {
+    Path indexedDirectory = this.temporaryFolder.newFolder().toPath();
+    Path temporaryFile = indexedDirectory.resolve(this.path);
+    try (InputStream is = getClass()
+        .getClassLoader().getResourceAsStream(this.path)) {
+      if (null == is) {
+        fail(String.format("Unable to read test resource %s.", this.path));
+        return; // unreachable after fail(); kept for readability
+      }
+      Files.copy(is, temporaryFile);
+    }
+    Files.setLastModifiedTime(temporaryFile,
+        FileTime.from(LocalDateTime.parse(this.lastModified, dateTimeFormatter)
+        .toInstant(ZoneOffset.UTC)));
+    assertTrue(Files.exists(temporaryFile));
+    IndexerTask indexerTask = new IndexerTask(temporaryFile);
+    FileNode indexResult = indexerTask.call();
+    assertEquals(this.path, indexResult.path);
+    assertEquals(this.size, indexResult.size);
+    assertEquals(this.lastModified, indexResult.lastModified);
+    SortedSet<String> expectedTypes = new TreeSet<>(Arrays.asList(this.types));
+    assertEquals(expectedTypes, indexResult.types);
+    assertEquals(this.firstPublished, indexResult.firstPublished);
+    assertEquals(this.lastPublished, indexResult.lastPublished);
+    assertEquals(this.sha256, indexResult.sha256);
+  }
+}
+