commit 97e577ae73ec631ac5d7448cb9f525594baa0c8a Author: iwakeh <iwakeh@torproject.org> Date: Mon Oct 9 12:23:53 2017 +0000
Add webstats module with sync and local import functionality.
Implements task-22428. --- CHANGELOG.md | 6 +- build.xml | 2 +- src/main/java/org/torproject/collector/Main.java | 2 + .../torproject/collector/conf/Configuration.java | 3 +- .../java/org/torproject/collector/conf/Key.java | 9 +- .../collector/persist/DescriptorPersistence.java | 2 + .../persist/WebServerAccessLogPersistence.java | 73 ++++++++ .../torproject/collector/sync/SyncPersistence.java | 7 + .../torproject/collector/webstats/LogFileMap.java | 115 ++++++++++++ .../torproject/collector/webstats/LogMetadata.java | 87 +++++++++ .../collector/webstats/SanitizeWeblogs.java | 198 +++++++++++++++++++++ src/main/resources/collector.properties | 20 ++- .../collector/conf/ConfigurationTest.java | 2 +- .../collector/cron/CollecTorMainTest.java | 1 + .../collector/sync/SyncPersistenceTest.java | 68 +++---- .../collector/webstats/LogFileMapTest.java | 33 ++++ .../collector/webstats/LogMetadataTest.java | 82 +++++++++ ...eotrichon.torproject.org_access.log_20151007.xz | Bin 0 -> 4056 bytes ...meronense.torproject.org_access.log_20170531.gz | Bin 0 -> 388 bytes 19 files changed, 671 insertions(+), 39 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f4cd21..a0b5d1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ -# Changes in version 1.?.? - 201?-??-?? +# Changes in version 1.5.0 - 2018-01-31 + + * Major changes + - Update to metrics-lib 2.2.0. + - Add new module for processing and sanitizing Tor web server logs.
* Minor changes - Exclude lastModifiedMillis in index.json. diff --git a/build.xml b/build.xml index f004f29..48f6e33 100644 --- a/build.xml +++ b/build.xml @@ -11,7 +11,7 @@ <property name="release.version" value="1.4.1-dev" /> <property name="project-main-class" value="org.torproject.collector.Main" /> <property name="name" value="collector"/> - <property name="metricslibversion" value="2.1.1" /> + <property name="metricslibversion" value="2.2.0" /> <property name="jarincludes" value="collector.properties logback.xml" />
<patternset id="runtime" > diff --git a/src/main/java/org/torproject/collector/Main.java b/src/main/java/org/torproject/collector/Main.java index 50cc8be..70cdbfa 100644 --- a/src/main/java/org/torproject/collector/Main.java +++ b/src/main/java/org/torproject/collector/Main.java @@ -14,6 +14,7 @@ import org.torproject.collector.exitlists.ExitListDownloader; import org.torproject.collector.index.CreateIndexJson; import org.torproject.collector.onionperf.OnionPerfDownloader; import org.torproject.collector.relaydescs.ArchiveWriter; +import org.torproject.collector.webstats.SanitizeWeblogs;
import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,6 +52,7 @@ public class Main { collecTorMains.put(Key.UpdateindexActivated, CreateIndexJson.class); collecTorMains.put(Key.RelaydescsActivated, ArchiveWriter.class); collecTorMains.put(Key.OnionPerfActivated, OnionPerfDownloader.class); + collecTorMains.put(Key.WebstatsActivated, SanitizeWeblogs.class); }
private static Configuration conf = new Configuration(); diff --git a/src/main/java/org/torproject/collector/conf/Configuration.java b/src/main/java/org/torproject/collector/conf/Configuration.java index 57f9731..72bd5fc 100644 --- a/src/main/java/org/torproject/collector/conf/Configuration.java +++ b/src/main/java/org/torproject/collector/conf/Configuration.java @@ -92,7 +92,8 @@ public class Configuration extends Observable implements Cloneable { || this.getBool(Key.BridgedescsActivated) || this.getBool(Key.ExitlistsActivated) || this.getBool(Key.UpdateindexActivated) - || this.getBool(Key.OnionPerfActivated))) { + || this.getBool(Key.OnionPerfActivated) + || this.getBool(Key.WebstatsActivated))) { throw new ConfigurationException("Nothing is activated!\n" + "Please edit collector.properties. Exiting."); } diff --git a/src/main/java/org/torproject/collector/conf/Key.java b/src/main/java/org/torproject/collector/conf/Key.java index e0a20a7..6454009 100644 --- a/src/main/java/org/torproject/collector/conf/Key.java +++ b/src/main/java/org/torproject/collector/conf/Key.java @@ -28,6 +28,7 @@ public enum Key { BridgeSources(SourceType[].class), ExitlistSources(SourceType[].class), OnionPerfSources(SourceType[].class), + WebstatsSources(SourceType[].class), RelayCacheOrigins(String[].class), RelayLocalOrigins(Path.class), RelaySyncOrigins(URL[].class), @@ -35,6 +36,8 @@ public enum Key { BridgeLocalOrigins(Path.class), ExitlistSyncOrigins(URL[].class), OnionPerfSyncOrigins(URL[].class), + WebstatsSyncOrigins(URL[].class), + WebstatsLocalOrigins(Path.class), BridgedescsActivated(Boolean.class), BridgedescsOffsetMinutes(Integer.class), BridgedescsPeriodMinutes(Integer.class), @@ -58,7 +61,11 @@ public enum Key { KeepDirectoryArchiveImportHistory(Boolean.class), ReplaceIpAddressesWithHashes(Boolean.class), BridgeDescriptorMappingsLimit(Integer.class), - OnionPerfHosts(URL[].class); + OnionPerfHosts(URL[].class), + WebstatsActivated(Boolean.class), + 
WebstatsLimits(Boolean.class), + WebstatsOffsetMinutes(Integer.class), + WebstatsPeriodMinutes(Integer.class);
private Class clazz; private static Set<String> keys; diff --git a/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java b/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java index 3e464fe..01c9fad 100644 --- a/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java +++ b/src/main/java/org/torproject/collector/persist/DescriptorPersistence.java @@ -19,6 +19,7 @@ public abstract class DescriptorPersistence<T extends Descriptor> {
protected static final String BRIDGEDESCS = "bridge-descriptors"; protected static final String DASH = "-"; + protected static final String DOT = "."; protected static final String MICRODESC = "microdesc"; protected static final String MICRODESCS = "microdescs"; protected static final String RELAYDESCS = "relay-descriptors"; @@ -26,6 +27,7 @@ public abstract class DescriptorPersistence<T extends Descriptor> { protected static final String EXTRA_INFOS = "extra-infos"; protected static final String SERVERDESC = "server-descriptor"; protected static final String SERVERDESCS = "server-descriptors"; + protected static final String WEBSTATS = "webstats";
protected final T desc; protected final byte[] annotation; diff --git a/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java b/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java new file mode 100644 index 0000000..792d3a9 --- /dev/null +++ b/src/main/java/org/torproject/collector/persist/WebServerAccessLogPersistence.java @@ -0,0 +1,73 @@ +/* Copyright 2016--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.collector.persist; + +import org.torproject.descriptor.WebServerAccessLog; +import org.torproject.descriptor.internal.FileType; +import org.torproject.descriptor.log.InternalLogDescriptor; +import org.torproject.descriptor.log.InternalWebServerAccessLog; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.time.format.DateTimeFormatter; + +public class WebServerAccessLogPersistence + extends DescriptorPersistence<WebServerAccessLog> { + + public static final String SEP = InternalWebServerAccessLog.SEP; + public static final FileType COMPRESSION = FileType.XZ; + private static final Logger log + = LoggerFactory.getLogger(WebServerAccessLogPersistence.class); + + private DateTimeFormatter yearPattern = DateTimeFormatter.ofPattern("yyyy"); + private DateTimeFormatter monthPattern = DateTimeFormatter.ofPattern("MM"); + private DateTimeFormatter dayPattern = DateTimeFormatter.ofPattern("dd"); + + /** Prepare storing the given descriptor. */ + public WebServerAccessLogPersistence(WebServerAccessLog desc) { + super(desc, new byte[0]); + byte[] compressedBytes = null; + try { // The descriptor bytes have to be stored compressed. + compressedBytes = COMPRESSION.compress(desc.getRawDescriptorBytes()); + ((InternalLogDescriptor)desc).setRawDescriptorBytes(compressedBytes); + } catch (Exception ex) { + log.warn("Cannot compress ’{}’. 
Storing uncompressed.", ex); + } + calculatePaths(); + } + + private void calculatePaths() { + String name = + this.desc.getVirtualHost() + SEP + this.desc.getPhysicalHost() + + SEP + "access.log" + + SEP + this.desc.getLogDate().format(DateTimeFormatter.BASIC_ISO_DATE) + + DOT + COMPRESSION.name().toLowerCase(); + this.recentPath = Paths.get(WEBSTATS, name).toString(); + this.storagePath = Paths.get( + WEBSTATS, + this.desc.getVirtualHost(), + this.desc.getLogDate().format(yearPattern), // year + this.desc.getLogDate().format(monthPattern), // month + this.desc.getLogDate().format(dayPattern), // day + name).toString(); + } + + /** Logs are not appended. */ + @Override + public boolean storeAll(String recentRoot, String outRoot) { + return storeAll(recentRoot, outRoot, StandardOpenOption.CREATE_NEW, + StandardOpenOption.CREATE_NEW); + } + + /** Logs are not appended. */ + @Override + public boolean storeRecent(String recentRoot) { + return storeRecent(recentRoot, StandardOpenOption.CREATE_NEW); + } + +} + diff --git a/src/main/java/org/torproject/collector/sync/SyncPersistence.java b/src/main/java/org/torproject/collector/sync/SyncPersistence.java index e230fca..142be7a 100644 --- a/src/main/java/org/torproject/collector/sync/SyncPersistence.java +++ b/src/main/java/org/torproject/collector/sync/SyncPersistence.java @@ -18,6 +18,7 @@ import org.torproject.collector.persist.PersistenceUtils; import org.torproject.collector.persist.ServerDescriptorPersistence; import org.torproject.collector.persist.StatusPersistence; import org.torproject.collector.persist.VotePersistence; +import org.torproject.collector.persist.WebServerAccessLogPersistence; import org.torproject.descriptor.BridgeExtraInfoDescriptor; import org.torproject.descriptor.BridgeNetworkStatus; import org.torproject.descriptor.BridgeServerDescriptor; @@ -28,6 +29,7 @@ import org.torproject.descriptor.RelayNetworkStatusConsensus; import org.torproject.descriptor.RelayNetworkStatusVote; import 
org.torproject.descriptor.RelayServerDescriptor; import org.torproject.descriptor.TorperfResult; +import org.torproject.descriptor.WebServerAccessLog;
import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -132,6 +134,10 @@ public class SyncPersistence { case "TorperfResult": descPersist = new OnionPerfPersistence((TorperfResult) desc); break; + case "WebServerAccessLog": + descPersist = new WebServerAccessLogPersistence( + (WebServerAccessLog) desc); + break; default: log.trace("Invalid descriptor type {} for sync-merge.", clazz.getName()); @@ -149,3 +155,4 @@ public class SyncPersistence { } } } + diff --git a/src/main/java/org/torproject/collector/webstats/LogFileMap.java b/src/main/java/org/torproject/collector/webstats/LogFileMap.java new file mode 100644 index 0000000..c1a6802 --- /dev/null +++ b/src/main/java/org/torproject/collector/webstats/LogFileMap.java @@ -0,0 +1,115 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.collector.webstats; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.time.LocalDate; +import java.util.Optional; +import java.util.TreeMap; +import java.util.stream.Stream; + +/** + * Processes the given path and stores metadata for log files. + */ +public class LogFileMap + extends TreeMap<String, TreeMap<String, TreeMap<LocalDate, LogMetadata>>> { + + private static final Logger log = LoggerFactory.getLogger(LogFileMap.class); + + /** + * The map to keep track of the logfiles by virtual host, + * physical host, and date. 
+ */ + public LogFileMap(Path startDir) { + collectFiles(this, startDir); + } + + private void collectFiles(final LogFileMap logFileMap, Path startDir) { + try { + Files.walkFileTree(startDir, new SimpleFileVisitor<Path>() { + @Override + public FileVisitResult visitFile(Path path, BasicFileAttributes att) + throws IOException { + Optional<LogMetadata> optionalMetadata = LogMetadata.create(path); + if (optionalMetadata.isPresent()) { + logFileMap.add(optionalMetadata.get()); + } + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult visitFileFailed(Path path, IOException ex) + throws IOException { + return logIfError(path, ex); + } + + @Override + public FileVisitResult postVisitDirectory(Path path, IOException ex) + throws IOException { + return logIfError(path, ex); + } + + private FileVisitResult logIfError(Path path, IOException ex) { + if (null != ex) { + log.warn("Cannot process '{}'.", path, ex); + } + return FileVisitResult.CONTINUE; + } + }); + } catch (IOException ex) { + log.error("Cannot read directory '{}'.", startDir, ex); + } + } + + /** Add log metadata to the map structure. */ + public void add(LogMetadata metadata) { + TreeMap<String, TreeMap<LocalDate, LogMetadata>> virtualHosts + = this.get(metadata.virtualHost); + if (null == virtualHosts) { + virtualHosts = new TreeMap<String, TreeMap<LocalDate, LogMetadata>>(); + this.put(metadata.virtualHost, virtualHosts); + } + TreeMap<LocalDate, LogMetadata> physicalHosts + = virtualHosts.get(metadata.physicalHost); + if (null == physicalHosts) { + physicalHosts = new TreeMap<LocalDate, LogMetadata>(); + virtualHosts.put(metadata.physicalHost, physicalHosts); + } + physicalHosts.put(metadata.date, metadata); + } + + /** + * Takes the given metadata and returns the LogMetadata for the entry + * of the next day. 
+ */ + public Optional<LogMetadata> nextDayLogFor(LogMetadata metadata) { + TreeMap<String, TreeMap<LocalDate, LogMetadata>> virtualHosts + = this.get(metadata.virtualHost); + if (null == virtualHosts) { + return Optional.empty(); + } + TreeMap<LocalDate, LogMetadata> physicalHosts + = virtualHosts.get(metadata.physicalHost); + if (null == physicalHosts) { + return Optional.empty(); + } + return Optional.ofNullable(physicalHosts.get(metadata.date.plusDays(1))); + } + + /** Returns a stream of all contained log metadata. */ + public Stream<LogMetadata> metadataStream() { + return this.values().stream() + .flatMap((virtualHosts) -> virtualHosts.values().stream()) + .flatMap((physicalHosts) -> physicalHosts.values().stream()); + } +} + diff --git a/src/main/java/org/torproject/collector/webstats/LogMetadata.java b/src/main/java/org/torproject/collector/webstats/LogMetadata.java new file mode 100644 index 0000000..ee0db1a --- /dev/null +++ b/src/main/java/org/torproject/collector/webstats/LogMetadata.java @@ -0,0 +1,87 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.collector.webstats; + +import static org.torproject.descriptor.log.WebServerAccessLogImpl.MARKER; + +import org.torproject.descriptor.internal.FileType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class LogMetadata { + + private static final Logger log + = LoggerFactory.getLogger(LogMetadata.class); + + /** The mandatory web server log descriptor file name pattern. */ + public static final Pattern filenamePattern + = Pattern.compile("(\\S*)-" + MARKER + + "-(\\d{8})(?:\\.?)([a-zA-Z0-9]+)$"); + + /** The path of the log file to be imported. */ + public final Path path; + + /** The date the log entries were created.
*/ + public final LocalDate date; + + /** The log's compression type. */ + public final FileType fileType; + + /** The name of the physical host. */ + public final String physicalHost; + + /** The name of the virtual host. */ + public final String virtualHost; + + private LogMetadata(Path logPath, String physicalHost, String virtualHost, + LocalDate logDate, FileType fileType) { + this.path = logPath; + this.date = logDate; + this.fileType = fileType; + this.physicalHost = physicalHost; + this.virtualHost = virtualHost; + } + + /** + * Only way to create a LogMetadata object from a given log path. + */ + public static Optional<LogMetadata> create(Path logPath) { + LogMetadata metadata = null; + try { + Path parentPath = logPath.getName(logPath.getNameCount() - 2); + Path file = logPath.getFileName(); + if (null != parentPath && null != file) { + String physicalHost = parentPath.toString(); + Matcher mat = filenamePattern.matcher(file.toString()); + if (mat.find()) { + String virtualHost = mat.group(1); + // verify date given + LocalDate logDate + = LocalDate.parse(mat.group(2), DateTimeFormatter.BASIC_ISO_DATE); + if (null == virtualHost || null == physicalHost || null == logDate + || virtualHost.isEmpty() || physicalHost.isEmpty()) { + log.debug("Non-matching file encountered: '{}/{}'.", + parentPath, file); + } else { + metadata = new LogMetadata(logPath, physicalHost, virtualHost, + logDate, FileType.findType(mat.group(3))); + } + } + } + } catch (Throwable ex) { + metadata = null; + log.debug("Problem parsing path '{}'.", logPath, ex); + } + return Optional.ofNullable(metadata); + } +} + diff --git a/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java b/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java new file mode 100644 index 0000000..88d62fa --- /dev/null +++ b/src/main/java/org/torproject/collector/webstats/SanitizeWeblogs.java @@ -0,0 +1,198 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing 
information */ + +package org.torproject.collector.webstats; + +import org.torproject.collector.conf.Configuration; +import org.torproject.collector.conf.ConfigurationException; +import org.torproject.collector.conf.Key; +import org.torproject.collector.conf.SourceType; +import org.torproject.collector.cron.CollecTorMain; + +import org.torproject.collector.persist.PersistenceUtils; +import org.torproject.collector.persist.WebServerAccessLogPersistence; +import org.torproject.descriptor.DescriptorParseException; +import org.torproject.descriptor.WebServerAccessLog; +import org.torproject.descriptor.log.InternalLogDescriptor; +import org.torproject.descriptor.log.InternalWebServerAccessLog; +import org.torproject.descriptor.log.WebServerAccessLogImpl; +import org.torproject.descriptor.log.WebServerAccessLogLine; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.SortedSet; +import java.util.StringJoiner; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * This module processes web-logs for CollecTor according to the weblog + * sanitation specification published on metrics.torproject.org. + */ +public class SanitizeWeblogs extends CollecTorMain { + + private static final Logger log = + LoggerFactory.getLogger(SanitizeWeblogs.class); + private static final int LIMIT = 2; + + private static final String WEBSTATS = "webstats"; + private String outputPathName; + private String recentPathName; + private boolean limits; + + /** + * Possibly privacy impacting data is replaced by dummy data producing a + * log-file (or files) that
conform(s) to Apache's Combined Log Format. + */ + public SanitizeWeblogs(Configuration conf) { + super(conf); + this.mapPathDescriptors.put("recent/webstats", WebServerAccessLog.class); + } + + @Override + public String module() { + return WEBSTATS; + } + + @Override + protected String syncMarker() { + return "Webstats"; + } + + @Override + protected void startProcessing() throws ConfigurationException { + try { + Files.createDirectories(this.config.getPath(Key.OutputPath)); + Files.createDirectories(this.config.getPath(Key.RecentPath)); + this.outputPathName = this.config.getPath(Key.OutputPath).toString(); + this.recentPathName = this.config.getPath(Key.RecentPath).toString(); + this.limits = this.config.getBool(Key.WebstatsLimits); + Set<SourceType> sources = this.config.getSourceTypeSet( + Key.WebstatsSources); + if (sources.contains(SourceType.Local)) { + findCleanWrite(this.config.getPath(Key.WebstatsLocalOrigins)); + PersistenceUtils.cleanDirectory(this.config.getPath(Key.RecentPath)); + } + } catch (Exception e) { + log.error("Cannot sanitize web-logs: " + e.getMessage(), e); + throw new RuntimeException(e); + } + } + + private void findCleanWrite(Path dir) { + LogFileMap fileMapIn = new LogFileMap(dir); + log.info("Found log files for {} virtual hosts.", fileMapIn.size()); + for (Map.Entry<String,TreeMap<String,TreeMap<LocalDate,LogMetadata>>> + virtualEntry : fileMapIn.entrySet()) { + String virtualHost = virtualEntry.getKey(); + for (Map.Entry<String, TreeMap<LocalDate, LogMetadata>> physicalEntry + : virtualEntry.getValue().entrySet()) { + String physicalHost = physicalEntry.getKey(); + log.info("Processing logs for {} on {}.", virtualHost, physicalHost); + Map<LocalDate, List<WebServerAccessLogLine>> linesByDate + = physicalEntry.getValue().values().stream().parallel() + .flatMap((LogMetadata metadata) + -> lineStream(metadata).filter((line) -> line.isValid())) + .collect(Collectors.groupingBy(WebServerAccessLogLine::getDate, + Collectors.toList()));
+ LocalDate[] interval = determineInterval(linesByDate.keySet()); + linesByDate.entrySet().stream() + .filter((entry) -> entry.getKey().isAfter(interval[0]) + && entry.getKey().isBefore(interval[1])) + .forEach((entry) -> storeSanitized(virtualHost, physicalHost, + entry.getKey(), entry.getValue())); + } + } + } + + private void storeSanitized(String virtualHost, String physicalHost, + LocalDate date, List<WebServerAccessLogLine> lines) { + String name = new StringJoiner(InternalLogDescriptor.SEP) + .add(virtualHost).add(physicalHost) + .add(InternalWebServerAccessLog.MARKER) + .add(date.format(DateTimeFormatter.BASIC_ISO_DATE)).toString(); + log.debug("Sanitizing {}.", name); + List<String> retainedLines = lines + .stream().map((line) -> sanitize(line, date)) + .filter((line) -> line.isPresent()).map((line) -> line.get()) + .collect(Collectors.toList()); + retainedLines.sort(null); + try { + WebServerAccessLogPersistence walp + = new WebServerAccessLogPersistence( + new WebServerAccessLogImpl(retainedLines, name)); + log.debug("Storing {}.", name); + walp.storeOut(this.outputPathName); + walp.storeRecent(this.recentPathName); + } catch (DescriptorParseException dpe) { + log.error("Cannot store log desriptor {}.", name, dpe); + } + } + + static Optional<String> sanitize(WebServerAccessLogLine logLine, + LocalDate date) { + if (!logLine.isValid() + || !("GET".equals(logLine.getMethod()) + || "HEAD".equals(logLine.getMethod())) + || !logLine.getProtocol().startsWith("HTTP") + || 400 == logLine.getResponse() || 404 == logLine.getResponse()) { + return Optional.empty(); + } + if (!logLine.getIp().startsWith("0.0.0.")) { + logLine.setIp("0.0.0.0"); + } + int queryStart = logLine.getRequest().indexOf("?"); + if (queryStart > 0) { + logLine.setRequest(logLine.getRequest().substring(0, queryStart)); + } + return Optional.of(logLine.toLogString()); + } + + LocalDate[] determineInterval(Set<LocalDate> dates) { + SortedSet<LocalDate> sorted = new TreeSet<>(); + 
sorted.addAll(dates); + if (this.limits) { + for (int i = 0; i < LIMIT - 1; i++) { + sorted.remove(sorted.last()); + } + } + if (sorted.isEmpty()) { + return new LocalDate[]{LocalDate.MAX, LocalDate.MIN}; + } + if (!this.limits) { + sorted.add(sorted.first().minusDays(1)); + sorted.add(sorted.last().plusDays(1)); + } + return new LocalDate[]{sorted.first(), sorted.last()}; + } + + private Stream<WebServerAccessLogLine> lineStream(LogMetadata metadata) { + log.debug("Processing file {}.", metadata.path); + try (BufferedReader br + = new BufferedReader(new InputStreamReader(new ByteArrayInputStream( + metadata.fileType.decompress(Files.readAllBytes(metadata.path)))))) { + return br.lines() + .map((String line) -> WebServerAccessLogLine.makeLine(line)) + .collect(Collectors.toList()).stream(); + } catch (Exception ex) { + log.debug("Skipping log-file {}.", metadata.path, ex); + } + return Stream.empty(); + } + +} + diff --git a/src/main/resources/collector.properties b/src/main/resources/collector.properties index 0a9f932..30dda2a 100644 --- a/src/main/resources/collector.properties +++ b/src/main/resources/collector.properties @@ -41,6 +41,12 @@ UpdateindexActivated = false UpdateindexPeriodMinutes = 2 # offset in minutes since the epoch and UpdateindexOffsetMinutes = 0 +# the following defines, if this module is activated +WebstatsActivated = false +# period in minutes +WebstatsPeriodMinutes = 360 +# offset in minutes since the epoch and +WebstatsOffsetMinutes = 31 ########################################## ## All below can be changed at runtime. ##### @@ -154,4 +160,16 @@ OnionPerfSyncOrigins = https://collector.torproject.org ## the second, etc.: ## OnionPerfHosts = http://first.torproject.org/, http://second.torproject.org/ OnionPerfHosts = https://op-us.onionperf.torproject.net/ - +######## Tor Weblogs ######## +# +## Define descriptor sources +# possible values: Local, Sync +WebstatsSources = Local +# Retrieve files from the following CollecTor instances. 
+# List of URLs separated by comma. +WebstatsSyncOrigins = https://collector.torproject.org +## Relative path to directory to import logfiles from. +WebstatsLocalOrigins = in/webstats +# Default 'true' behaves as stated in section 4 of +# https://metrics.torproject.org/web-server-logs.html +WebstatsLimits = true diff --git a/src/test/java/org/torproject/collector/conf/ConfigurationTest.java b/src/test/java/org/torproject/collector/conf/ConfigurationTest.java index dfb06b2..fcaa71f 100644 --- a/src/test/java/org/torproject/collector/conf/ConfigurationTest.java +++ b/src/test/java/org/torproject/collector/conf/ConfigurationTest.java @@ -40,7 +40,7 @@ public class ConfigurationTest { public void testKeyCount() throws Exception { assertEquals("The number of properties keys in enum Key changed." + "\n This test class should be adapted.", - 45, Key.values().length); + 52, Key.values().length); }
@Test() diff --git a/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java b/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java index 79c1bd7..025f96c 100644 --- a/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java +++ b/src/test/java/org/torproject/collector/cron/CollecTorMainTest.java @@ -71,6 +71,7 @@ public class CollecTorMainTest { case "Bridge": case "Exitlist": case "OnionPerf": + case "Webstats": assertNotNull("Property '" + key + "' not specified in " + Main.CONF_FILE + ".", props.getProperty(key)); diff --git a/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java b/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java index 2774c8d..489a413 100644 --- a/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java +++ b/src/test/java/org/torproject/collector/sync/SyncPersistenceTest.java @@ -28,6 +28,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.stream.Collectors;
@RunWith(Parameterized.class) public class SyncPersistenceTest { @@ -49,6 +50,26 @@ public class SyncPersistenceTest { Integer.valueOf(1), Integer.valueOf(1)},
+ {"webstats/archive.torproject.org_" + + "archeotrichon.torproject.org_access.log_20151007.xz", + new String[]{"webstats/archive.torproject.org/2015/10/07/" + + "archive.torproject.org_archeotrichon.torproject.org" + + "_access.log_20151007.xz"}, + "archeotrichon.torproject.org/archive.torproject.org_" + + "archeotrichon.torproject.org_access.log_20151007.xz", + Integer.valueOf(1), + Integer.valueOf(1)}, + + {"webstats/metrics.torproject.org_" + + "meronense.torproject.org_access.log_20170531.xz", + new String[]{"webstats/metrics.torproject.org/2017/05/31/" + + "metrics.torproject.org_meronense.torproject.org" + + "_access.log_20170531.xz"}, + "meronense.torproject.org/metrics.torproject.org_" + + "meronense.torproject.org_access.log_20170531.gz", + Integer.valueOf(1), + Integer.valueOf(1)}, + {"relay-descriptors/server-descriptors/" + "2016-10-05-19-06-17-server-descriptors", new String[]{"relay-descriptors/server-descriptor/2016/10/e/3/" @@ -266,6 +287,9 @@ public class SyncPersistenceTest {
@Test() public void testRecentFileContent() throws Exception { + if (this.filename.contains(".log")) { + return; // Skip this test, because logs are compressed and sanitized. + } conf = new Configuration(); makeTemporaryFolders(); DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser(); @@ -292,6 +316,9 @@ public class SyncPersistenceTest {
@Test() public void testOutFileContent() throws Exception { + if (this.filename.contains(".log")) { + return; // Skip this test, because logs are compressed and sanitized. + } conf = new Configuration(); makeTemporaryFolders(); DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser(); @@ -305,9 +332,8 @@ public class SyncPersistenceTest { List<String> expContent = linesFromResource(filename); int expContentSize = expContent.size(); for (File file : outputList) { - List<String> content = Files.readAllLines(file.toPath(), - StandardCharsets.UTF_8); - for (String line : content) { + for (String line : Files.readAllLines(file.toPath(), + StandardCharsets.UTF_8)) { assertTrue("Couldn't remove " + line + ".", expContent.remove(line)); assertEquals(--expContentSize, expContent.size()); } @@ -325,49 +351,25 @@ public class SyncPersistenceTest { }
private byte[] bytesFromResource() throws Exception { - StringBuilder sb = new StringBuilder(); - BufferedReader br = new BufferedReader(new InputStreamReader(getClass() - .getClassLoader().getResourceAsStream(filename))); - String line = br.readLine(); - while (null != line) { - sb.append(line).append('\n'); - line = br.readLine(); - } - return sb.toString().getBytes(); + return Files.readAllBytes((new File(getClass() + .getClassLoader().getResource(filename).toURI())).toPath()); }
private String stringFromResource() throws Exception { - StringBuilder sb = new StringBuilder(); BufferedReader br = new BufferedReader(new InputStreamReader(getClass() .getClassLoader().getResourceAsStream(filename))); - String line = br.readLine(); - while (null != line) { - sb.append(line).append('\n'); - line = br.readLine(); - } - return sb.toString(); + return br.lines().collect(Collectors.joining("\n", "", "\n")); }
private String stringFromFile(File file) throws Exception { - StringBuilder sb = new StringBuilder(); - List<String> lines = Files.readAllLines(file.toPath(), - StandardCharsets.UTF_8); - for (String line : lines) { - sb.append(line).append('\n'); - } - return sb.toString(); + return Files.lines(file.toPath(), StandardCharsets.UTF_8) + .collect(Collectors.joining("\n", "", "\n")); }
private List<String> linesFromResource(String filename) throws Exception { - List<String> res = new ArrayList<>(); BufferedReader br = new BufferedReader(new InputStreamReader(getClass() .getClassLoader().getResourceAsStream(filename))); - String line = br.readLine(); - while (null != line) { - res.add(line); - line = br.readLine(); - } - return res; + return br.lines().collect(Collectors.toList()); }
} diff --git a/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java b/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java new file mode 100644 index 0000000..d55ba40 --- /dev/null +++ b/src/test/java/org/torproject/collector/webstats/LogFileMapTest.java @@ -0,0 +1,33 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.collector.webstats; + +import static org.junit.Assert.assertTrue; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.nio.file.Paths; +import java.util.Optional; + +public class LogFileMapTest { + + @Rule + public TemporaryFolder tmpf = new TemporaryFolder(); + + @Test + public void makeLogFileMap() throws Exception { + LogFileMap lfm = new LogFileMap(tmpf.newFolder().toPath()); + for (String path : new String[] {"in/ph1/vh1-access.log-20170901.gz", + "in/ph1/vh1-access.log-20170902.xz"}) { + Optional<LogMetadata> element + = LogMetadata.create(Paths.get(path)); + assertTrue(element.isPresent()); + lfm.add(element.get()); + } + } + +} + diff --git a/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java b/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java new file mode 100644 index 0000000..6121e8d --- /dev/null +++ b/src/test/java/org/torproject/collector/webstats/LogMetadataTest.java @@ -0,0 +1,82 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.collector.webstats; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.Arrays; +import java.util.Collection; +import java.util.Optional; + +@RunWith(Parameterized.class) 
+public class LogMetadataTest { + + /** Path and expected values of LogMetadata. */ + @Parameters + public static Collection<Object[]> pathResult() { + return Arrays.asList(new Object[][] { + {Paths.get("in", "ph1", "vh1-error.log-20170902.xz"), + "10001010", Boolean.FALSE, + "Non-access logs should be discarded."}, + {Paths.get("in", "ph1", "vh1-access.log-2017.xz"), + "10001010", Boolean.FALSE, + "Log file should be discarded, because of wrong date format."}, + {Paths.get("in", "ph1", "vh1-access.log.xz"), + "10001010", Boolean.FALSE, + "Log file should be discarded, because of the missing date."}, + {Paths.get("vh1-access.log-20170901.gz"), + "10001010", Boolean.FALSE, + "Should be discarded because of missing physical host information."}, + {Paths.get("in", "ph1", "vh1-access.log-20170901.gz"), + "20170901", Boolean.TRUE, + "Should have been accepted."}, + {Paths.get("", "vh1-access.log-20170901.gz"), + "20170901", Boolean.FALSE, + "Should not result in metadata."}, + {Paths.get("x", "vh1-access.log-.gz"), + "20170901", Boolean.FALSE, + "Should not result in metadata."}, + {Paths.get("/collection/download/in/ph2", "vh2-access.log-20180901.xz"), + "20180901", Boolean.TRUE, + "Should have been accepted."} + }); + } + + private Path path; + private LocalDate date; + private boolean valid; + private String failureMessage; + + /** Set all test values. 
*/ + public LogMetadataTest(Path path, String dateString, boolean valid, + String message) { + this.path = path; + this.date = LocalDate.parse(dateString, DateTimeFormatter.BASIC_ISO_DATE); + this.valid = valid; + this.failureMessage = message; + } + + @Test + public void testCreate() throws Exception { + Optional<LogMetadata> element = LogMetadata.create(this.path); + assertEquals(this.failureMessage, this.valid, element.isPresent()); + if (!this.valid) { + return; + } + LogMetadata lmd = element.get(); + assertEquals(this.date, lmd.date); + assertEquals(this.path, lmd.path); + } + +} + diff --git a/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz new file mode 100644 index 0000000..b459742 Binary files /dev/null and b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz differ diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz new file mode 100644 index 0000000..8c2333b Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz differ