commit 3cd814d8481c87ee3609783d66ae4e2eec81d290 Author: iwakeh iwakeh@torproject.org Date: Fri Sep 15 14:07:08 2017 +0000
Add new descriptor type for web server access logs.
Implements task-22983 and is based on the log-descriptor specification. --- CHANGELOG.md | 7 + .../org/torproject/descriptor/LogDescriptor.java | 47 ++++++ .../torproject/descriptor/WebServerAccessLog.java | 65 ++++++++ .../descriptor/impl/DescriptorParserImpl.java | 12 +- .../torproject/descriptor/index/package-info.java | 6 +- .../torproject/descriptor/internal/FileType.java | 53 +++++- .../descriptor/internal/package-info.java | 10 +- .../descriptor/log/InternalLogDescriptor.java | 63 ++++++++ .../descriptor/log/InternalWebServerAccessLog.java | 17 ++ .../descriptor/log/LogDescriptorImpl.java | 163 +++++++++++++++++++ .../descriptor/log/WebServerAccessLogImpl.java | 119 ++++++++++++++ .../descriptor/log/WebServerAccessLogLine.java | 135 ++++++++++++++++ .../torproject/descriptor/log/package-info.java | 14 ++ .../org/torproject/descriptor/package-info.java | 6 +- .../descriptor/log/LogDescriptorTest.java | 178 +++++++++++++++++++++ .../descriptor/log/WebServerAccessLogLineTest.java | 140 ++++++++++++++++ .../descriptor/log/WebServerAccessLogTest.java | 94 +++++++++++ .../descriptor/log/WebServerModuleTest.java | 113 +++++++++++++ ...eotrichon.torproject.org_access.log_20151007.xz | Bin 0 -> 4056 bytes ...rver.org_dummy.host.net_access.log_20111111.bz2 | Bin 0 -> 76 bytes ...meronense.torproject.org_access.log_20170530.gz | Bin 0 -> 388 bytes ...meronense.torproject.org_access.log_20170531.gz | Bin 0 -> 388 bytes ...eronense.torproject.org_access.log_20170530.log | 26 +++ ...eronense.torproject.org_access.log_20170607.log | 26 +++ 24 files changed, 1280 insertions(+), 14 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md index cd0dc6a..42e0e09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# Changes in version 2.2.0 - 2018-01-?? + + * Major changes + - Add new descriptor type WebServerAccessLog to parse web server + access logs. + + # Changes in version 2.1.1 - 2017-10-09
* Minor changes diff --git a/src/main/java/org/torproject/descriptor/LogDescriptor.java b/src/main/java/org/torproject/descriptor/LogDescriptor.java new file mode 100644 index 0000000..ff02cae --- /dev/null +++ b/src/main/java/org/torproject/descriptor/LogDescriptor.java @@ -0,0 +1,47 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor; + +import java.util.List; + +/** + * Contains a log file. + * + * <p>Unlike other descriptors, logs can get very large and are typically stored + * on disk in compressed form. However, all access to log contents through this + * interface and its subinterfaces is made available in uncompressed form.</p> + * + * @since 2.2.0 + */ +public interface LogDescriptor extends Descriptor { + + /** + * Returns the decompressed raw descriptor bytes of the log. + * + * @since 2.2.0 + */ + @Override + public byte[] getRawDescriptorBytes(); + + /** + * Returns annotations found in the log file, which may be an empty List if a + * log format does not support adding annotations. + * + * @since 2.2.0 + */ + @Override + public List<String> getAnnotations(); + + /** + * Returns unrecognized lines encountered while parsing the log, which may be + * an empty list or a fixed-size list with only a few entries, depending on + * the log type. 
+ * + * @since 2.2.0 + */ + @Override + public List<String> getUnrecognizedLines(); + +} + diff --git a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java new file mode 100644 index 0000000..b94bc30 --- /dev/null +++ b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java @@ -0,0 +1,65 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor; + +import java.time.LocalDate; +import java.util.List; + +/** + * Contains a sanitized web server access log file from a {@code torproject.org} + * web server. + * + * <p>Parsing non-sanitized web server access logs from {@code torproject.org} + * web servers or other web servers is not explicitly supported, but may work + * anyway.</p> + * + * @since 2.2.0 + */ +public interface WebServerAccessLog extends LogDescriptor { + + /** + * Returns the date when requests contained in the log have been started, + * which is parsed from the log file path. + * + * <p>Typical web server access logs may contain date information in their + * file path, too, but that would be the date when the log file was rotated, + * which is not necessarily the same date as the date in contained request + * lines.</p> + * + * @since 2.2.0 + */ + public LocalDate getLogDate(); + + /** + * Returns the hostname of the physical host writing this log file, which is + * parsed from the log file path. + * + * <p>A physical host can serve multiple virtual hosts, and a virtual host can + * be served by multiple physical hosts.</p> + * + * @since 2.2.0 + */ + public String getPhysicalHost(); + + /** + * Returns the hostname of the virtual host that this log file was written + * for, which is parsed from the log file path.
+ * + * <p>A physical host can serve multiple virtual hosts, and a virtual host can + * be served by multiple physical hosts.</p> + * + * @since 2.2.0 + */ + public String getVirtualHost(); + + /** + * Returns at most three unrecognized lines encountered while parsing the log. + * + * @since 2.2.0 + */ + @Override + public List<String> getUnrecognizedLines(); + +} + diff --git a/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java b/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java index d32c031..f244abb 100644 --- a/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java +++ b/src/main/java/org/torproject/descriptor/impl/DescriptorParserImpl.java @@ -9,6 +9,10 @@ import static org.torproject.descriptor.impl.DescriptorImpl.SP; import org.torproject.descriptor.Descriptor; import org.torproject.descriptor.DescriptorParseException; import org.torproject.descriptor.DescriptorParser; +import org.torproject.descriptor.log.LogDescriptorImpl; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory;
import java.io.File; import java.lang.reflect.Constructor; @@ -19,6 +23,9 @@ import java.util.List;
public class DescriptorParserImpl implements DescriptorParser {
+ private static final Logger log + = LoggerFactory.getLogger(DescriptorParserImpl.class); + @Override public Iterable<Descriptor> parseDescriptors(byte[] rawDescriptorBytes, File descriptorFile, String fileName) { @@ -26,8 +33,7 @@ public class DescriptorParserImpl implements DescriptorParser { return this.detectTypeAndParseDescriptors(rawDescriptorBytes, descriptorFile, fileName); } catch (DescriptorParseException e) { - /* Looks like we attempted to parse the whole raw descriptor bytes at once - * below and ran into a parse issue. */ + log.debug("Cannot parse descriptor file ’{}’.", descriptorFile, e); List<Descriptor> parsedDescriptors = new ArrayList<>(); parsedDescriptors.add(new UnparseableDescriptorImpl(rawDescriptorBytes, new int[] { 0, rawDescriptorBytes.length }, descriptorFile, e)); @@ -124,6 +130,8 @@ public class DescriptorParserImpl implements DescriptorParser { } else if (firstLines.startsWith("@type torperf 1.")) { return TorperfResultImpl.parseTorperfResults(rawDescriptorBytes, descriptorFile); + } else if (descriptorFile.getName().contains(LogDescriptorImpl.MARKER)) { + return LogDescriptorImpl.parse(rawDescriptorBytes, descriptorFile); } else { throw new DescriptorParseException("Could not detect descriptor " + "type in descriptor starting with '" + firstLines + "'."); diff --git a/src/main/java/org/torproject/descriptor/index/package-info.java b/src/main/java/org/torproject/descriptor/index/package-info.java index c685c63..021cbea 100644 --- a/src/main/java/org/torproject/descriptor/index/package-info.java +++ b/src/main/java/org/torproject/descriptor/index/package-info.java @@ -2,14 +2,12 @@ * See LICENSE for licensing information */
/** - * <h1>This package is still in alpha stage.</h1> - * <p>The public interface might still change in unexpected ways.</p> + * <h1>This package is part of the implementation not the public API.</h1> + * <p>The public interface might change in unexpected ways.</p> * * <p>Interfaces and essential classes for obtaining and processing * CollecTor's index.json file.</p> * - * <p>Interfaces and classes make the content of index.json available.</p> - * * * @since 1.4.0 */ diff --git a/src/main/java/org/torproject/descriptor/internal/FileType.java b/src/main/java/org/torproject/descriptor/internal/FileType.java index 36b5df8..353f0bb 100644 --- a/src/main/java/org/torproject/descriptor/internal/FileType.java +++ b/src/main/java/org/torproject/descriptor/internal/FileType.java @@ -12,6 +12,8 @@ import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
import java.io.BufferedInputStream; import java.io.BufferedOutputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.io.OutputStream;
@@ -43,6 +45,8 @@ public enum FileType { /** * Returns <code>valueOf</code> or the default enum {@link #PLAIN}, i.e., * this method doesn't throw any exceptions and allways returns a valid enum. + * + * @since 2.1.0 */ public static FileType findType(String ext) { FileType res = null; @@ -54,16 +58,61 @@ public enum FileType { } }
- /** Return the appropriate input stream. */ + /** + * Return the appropriate input stream. + * + * @since 1.4.0 + */ public InputStream inputStream(InputStream is) throws Exception { return this.inClass.getConstructor(new Class[]{InputStream.class}) .newInstance(is); }
- /** Return the appropriate output stream. */ + /** + * Return the appropriate output stream. + * + * @since 1.4.0 + */ public OutputStream outputStream(OutputStream os) throws Exception { return this.outClass.getConstructor(new Class[]{OutputStream.class}) .newInstance(os); } + + /** + * Compresses the given bytes in memory and returns the compressed bytes. + * + * @since 2.2.0 + */ + public byte[] compress(byte[] bytes) throws Exception { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (OutputStream os = this.outputStream(baos)) { + os.write(bytes); + os.flush(); + } + return baos.toByteArray(); + } + + /** + * Decompresses the given bytes in memory and returns the decompressed bytes. + * + * @since 2.2.0 + */ + public byte[] decompress(byte[] bytes) throws Exception { + if (0 == bytes.length) { + return bytes; + } + try (InputStream is + = this.inputStream(new ByteArrayInputStream(bytes)); + ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + int readByte = is.read(); + while (readByte >= 0) { /* -1 signals end of stream; 0 is data */ + baos.write(readByte); + readByte = is.read(); + } + baos.flush(); + return baos.toByteArray(); + } + } + }
diff --git a/src/main/java/org/torproject/descriptor/internal/package-info.java b/src/main/java/org/torproject/descriptor/internal/package-info.java index 5bc7bcb..b845921 100644 --- a/src/main/java/org/torproject/descriptor/internal/package-info.java +++ b/src/main/java/org/torproject/descriptor/internal/package-info.java @@ -2,11 +2,13 @@ * See LICENSE for licensing information */
/** - * <h1>This package is part of the implementation not the public API.</h1> - * <p>The public interface might change in unexpected ways.</p> + * Interfaces and essential classes for obtaining and processing + * descriptors. + * + * <p><strong>This package is part of the implementation not the + * public API.</strong></p> * - * <p>Interfaces and essential classes for obtaining and processing - * descriptors.</p> + * <p>The public interface might change in unexpected ways.</p> * * @since 2.1.0 */ diff --git a/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java b/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java new file mode 100644 index 0000000..3c0039b --- /dev/null +++ b/src/main/java/org/torproject/descriptor/log/InternalLogDescriptor.java @@ -0,0 +1,63 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import org.torproject.descriptor.DescriptorParseException; +import org.torproject.descriptor.LogDescriptor; + +/** + * This interface provides methods for internal use only. + * + * @since 2.2.0 + */ +public interface InternalLogDescriptor extends LogDescriptor { + + /** Logfile name parts separator. */ + public static final String SEP = "_"; + + /** + * Validate log lines. + * + * @since 2.2.0 + */ + public void validate() throws DescriptorParseException; + + /** + * Set the <code>Validator</code> that will perform the validation on log + * lines. + * + * <p>Usually set by the implementing class.</p> + * + * @since 2.2.0 + */ + public void setValidator(Validator validator); + + /** + * Set the descriptor's bytes. + * + * @since 2.2.0 + */ + public void setRawDescriptorBytes(byte[] bytes); + + /** Return the descriptor's preferred compression. */ + public String getCompressionType(); + + /** + * Provides a single function for validating a single log line. 
+ * + * @since 2.2.0 + */ + public interface Validator { + + /** + * Verifies a log line. + * + * @since 2.2.0 + */ + public boolean validate(String line); + + } + +} + diff --git a/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java b/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java new file mode 100644 index 0000000..540f25d --- /dev/null +++ b/src/main/java/org/torproject/descriptor/log/InternalWebServerAccessLog.java @@ -0,0 +1,17 @@ +/* Copyright 2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +/** + * This interface provides methods for internal use only. + * + * @since 2.2.0 + */ +public interface InternalWebServerAccessLog extends InternalLogDescriptor { + + /** The log's name should include this string. */ + public static final String MARKER = "access.log"; + +} + diff --git a/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java b/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java new file mode 100644 index 0000000..97854e4 --- /dev/null +++ b/src/main/java/org/torproject/descriptor/log/LogDescriptorImpl.java @@ -0,0 +1,163 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import org.torproject.descriptor.Descriptor; +import org.torproject.descriptor.DescriptorParseException; +import org.torproject.descriptor.LogDescriptor; +import org.torproject.descriptor.internal.FileType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Base class for log 
descriptors. + * + * @since 2.2.0 + */ +public abstract class LogDescriptorImpl + implements LogDescriptor, InternalLogDescriptor { + + /** The log's file name should contain this string. */ + public static final String MARKER = ".log"; + + private static final int unrecognizedLinesLimit = 3; + + private static final Logger log + = LoggerFactory.getLogger(LogDescriptorImpl.class); + + private static Pattern filenamePattern = Pattern.compile( + "(?:\S*)" + MARKER + SEP + "(?:[0-9a-zA-Z]*)(?:\.?)([a-zA-Z2]*)"); + + private final File descriptorFile; + + /** Byte array for plain, i.e. uncompressed, log data. */ + private byte[] logBytes; + + private FileType fileType; + + private List<String> unrecognizedLines = new ArrayList<>(); + + private Validator validator = (String line) -> true; + + /** + * This constructor performs basic operations on the given bytes. + * + * <p>An unknown compression type (see {@link #getCompressionType}) + * is interpreted as missing compression. In this case the bytes + * will be compressed to the given compression type.</p> + * + * @since 2.2.0 + */ + protected LogDescriptorImpl(byte[] logBytes, File descriptorFile, + FileType defaultCompression) throws DescriptorParseException { + this.logBytes = logBytes; + this.descriptorFile = descriptorFile; + try { + Matcher mat = filenamePattern.matcher(descriptorFile.getName()); + if (!mat.find()) { + throw new DescriptorParseException( + "Log file name doesn't comply to standard: " + descriptorFile); + } + this.fileType = FileType.findType(mat.group(1).toUpperCase()); + if (FileType.PLAIN == this.fileType) { + this.fileType = defaultCompression; + } else { + this.logBytes = this.fileType.decompress(this.logBytes); + } + } catch (Exception ex) { + throw new DescriptorParseException("Cannot parse file " + + descriptorFile.getName(), ex); + } + } + + @Override + public void validate() throws DescriptorParseException { + try (BufferedReader br + = new BufferedReader(new InputStreamReader(new 
ByteArrayInputStream( + this.logBytes)))) { + this.unrecognizedLines.addAll(br.lines().parallel().filter((line) + -> null != line && !line.isEmpty() && !validator.validate(line)) + .limit(unrecognizedLinesLimit).collect(Collectors.toList())); + } catch (Exception ex) { + throw new DescriptorParseException("Cannot validate log lines.", ex); + } + } + + /** + * Assemble a LogDescriptor. + * + * @since 2.2.0 + */ + public static List<Descriptor> parse(byte[] logBytes, + File descriptorFile) throws DescriptorParseException { + if (descriptorFile.getName().contains(InternalWebServerAccessLog.MARKER)) { + return Arrays.asList(new Descriptor[]{ + new WebServerAccessLogImpl(logBytes, descriptorFile)}); + } else { + throw new DescriptorParseException("Cannot parse file " + + descriptorFile.getName()); + } + } + + public static byte[] collectionToBytes(Collection<String> lines) { + return lines.stream().collect(Collectors.joining("\n", "", "\n")) + .getBytes(); + } + + @Override + public void setValidator(Validator validator) { + this.validator = validator; + } + + @Override + public String getCompressionType() { + return this.fileType.name().toLowerCase(); + } + + @Override + public byte[] getRawDescriptorBytes() { + return this.logBytes; + } + + @Override + public void setRawDescriptorBytes(byte[] bytes) { + this.logBytes = bytes; + } + + @Override + public int getRawDescriptorLength() { + return this.logBytes.length; + } + + @Override + public List<String> getAnnotations() { + return Collections.emptyList(); + } + + @Override + public List<String> getUnrecognizedLines() { + return this.unrecognizedLines; + } + + @Override + public File getDescriptorFile() { + return descriptorFile; + } + +} + diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java new file mode 100644 index 0000000..6708c3a --- /dev/null +++ 
b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java @@ -0,0 +1,119 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import org.torproject.descriptor.DescriptorParseException; +import org.torproject.descriptor.WebServerAccessLog; +import org.torproject.descriptor.internal.FileType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.util.Collection; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Implementation of web server access log descriptors. + * + * <p>Defines sanitization and validation for web server access logs.</p> + * + * @since 2.2.0 + */ +public class WebServerAccessLogImpl extends LogDescriptorImpl + implements InternalWebServerAccessLog, WebServerAccessLog { + + private static final Logger log + = LoggerFactory.getLogger(WebServerAccessLogImpl.class); + + /** The log's name should include this string. */ + public static final String MARKER = InternalWebServerAccessLog.MARKER; + + /** The mandatory web server log descriptor file name pattern. */ + public static final Pattern filenamePattern + = Pattern.compile("(\S*)" + SEP + "(\S*)" + SEP + "" + MARKER + + SEP + "(\d*)(?:\.?)([a-zA-Z]*)"); + + private final String physicalHost; + + private final String virtualHost; + + private final LocalDate logDate; + + /** + * Creates a WebServerAccessLog from the given bytes and filename. + * + * <p>The given bytes are read, whereas the file is not read.</p> + * + * <p>The path of the given file has to be compliant to the following + * naming pattern + * {@code + * <virtualHost>_<physicalHost>_access.log_<yyyymmdd>.<compression>}, + * where an unknown compression type (see {@link #getCompressionType}) + * is interpreted as missing compression.
In this case the bytes + * will be compressed to the default compression type. + * The immediate parent name is taken to be the physical host collecting the + * logs.</p> + */ + protected WebServerAccessLogImpl(byte[] logBytes, File file) + throws DescriptorParseException { + this(logBytes, file, FileType.XZ); + } + + /** For internal use only. */ + public WebServerAccessLogImpl(Collection<String> lines, String filename) + throws DescriptorParseException { + this(LogDescriptorImpl.collectionToBytes(lines), new File(filename)); + } + + private WebServerAccessLogImpl(byte[] logBytes, File file, + FileType defaultCompression) throws DescriptorParseException { + super(logBytes, file, defaultCompression); + try { + String fn = file.toPath().getFileName().toString(); + Matcher mat = filenamePattern.matcher(fn); + if (!mat.find()) { + throw new DescriptorParseException( + "WebServerAccessLog file name doesn't comply to standard: " + fn); + } + this.virtualHost = mat.group(1); + this.physicalHost = mat.group(2); + if (null == this.virtualHost || null == this.physicalHost + || this.virtualHost.isEmpty() || this.physicalHost.isEmpty()) { + throw new DescriptorParseException( + "WebServerAccessLog file name doesn't comply to standard: " + fn); + } + String ymd = mat.group(3); + this.logDate = LocalDate.parse(ymd, DateTimeFormatter.BASIC_ISO_DATE); + this.setValidator((line) + -> WebServerAccessLogLine.makeLine(line).isValid()); + this.validate(); + } catch (DescriptorParseException dpe) { + throw dpe; // escalate + } catch (Exception pe) { + throw new DescriptorParseException( + "Cannot parse WebServerAccessLog file: " + file, pe); + } + } + + @Override + public String getPhysicalHost() { + return this.physicalHost; + } + + @Override + public String getVirtualHost() { + return this.virtualHost; + } + + @Override + public LocalDate getLogDate() { + return this.logDate; + } + +} + diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java 
b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java new file mode 100644 index 0000000..ab20dd2 --- /dev/null +++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogLine.java @@ -0,0 +1,135 @@ +/* Copyright 2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.LocalDate; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class WebServerAccessLogLine { + + private static final Logger log = LoggerFactory + .getLogger(WebServerAccessLogLine.class); + + private static final String DATE_PATTERN = "dd/MMM/yyyy"; + + private static final DateTimeFormatter dateTimeFormatter + = DateTimeFormatter.ofPattern(DATE_PATTERN + ":HH:mm:ss xxxx"); + + private static Pattern logLinePattern = Pattern.compile( + "^((?:\d{1,3}\.){3}\d{1,3}) (\S+) (\S+) " + + "\[([\w/]+)([\w:]+)(\s[+\-]\d{4})\] " + + ""([A-Z]+) ([^"]+) ([A-Z]+/\d\.\d)" " + + "(\d{3}) (\d+|-)(.*)"); + + private String ip; + private int response; + private String request; + private String method; + private String dateString; + private LocalDate date; + private String protocol; + private Optional<Integer> size; + private boolean valid = false; + private String type; + + /** Returns a log line string. Possibly empty. */ + public String toLogString() { + if (!this.valid) { + return ""; + } + return toString(); + } + + @Override + public String toString() { + return String.format("%s - - [%s:00:00:00 +0000] "%s %s %s" %d %s", + this.ip, this.dateString, this.method, this.request, this.type, + this.response, this.size.isPresent() ? this.size.get() : "-"); + } + + /** Returns the string of the date using 'dd/MMM/yyyy' format.
*/ + public String getDateString() { + return dateString; + } + + /** Returns a string containing the ip. */ + public String getIp() { + return this.ip; + } + + /** Only used internally during sanitization. */ + public void setIp(String ip) { + this.ip = ip; + } + + public String getMethod() { + return this.method; + } + + public String getProtocol() { + return this.protocol; + } + + public String getRequest() { + return this.request; + } + + public int getResponse() { + return this.response; + } + + /** Only used internally during sanitization. */ + public void setRequest(String request) { + this.request = request; + } + + public LocalDate getDate() { + return this.date; + } + + public boolean isValid() { + return this.valid; + } + + /** Creates a Line from a string. */ + public static WebServerAccessLogLine makeLine(String line) { + WebServerAccessLogLine res = new WebServerAccessLogLine(); + try { + Matcher mat = logLinePattern.matcher(line); + if (mat.find()) { + res.response = Integer.valueOf(mat.group(10)); + res.method = mat.group(7); + res.protocol = mat.group(9); + String dateTimeString = mat.group(4) + mat.group(5) + mat.group(6); + res.date = ZonedDateTime.parse(dateTimeString, + dateTimeFormatter).withZoneSameInstant(ZoneOffset.UTC) + .toLocalDate(); + res.dateString = res.date + .format(DateTimeFormatter.ofPattern(DATE_PATTERN)); + res.ip = mat.group(1); + res.request = mat.group(8); + res.type = mat.group(9); + if ("-".equals(mat.group(11))) { + res.size = Optional.empty(); + } else { + res.size = Optional.of(Integer.valueOf(mat.group(11))); + } + res.valid = true; + } + } catch (Throwable th) { + log.debug("Unmatchable line: '{}'.", line, th); + return new WebServerAccessLogLine(); + } + return res; + } + +} diff --git a/src/main/java/org/torproject/descriptor/log/package-info.java b/src/main/java/org/torproject/descriptor/log/package-info.java new file mode 100644 index 0000000..68bcfa1 --- /dev/null +++ 
b/src/main/java/org/torproject/descriptor/log/package-info.java @@ -0,0 +1,14 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +/** + * <h1>This package is part of the implementation not the public API.</h1> + * <p>The public interface might change in unexpected ways.</p> + * + * <p>Interfaces and essential classes for obtaining and processing + * log descriptors.</p> + * + * @since 2.2.0 + */ +package org.torproject.descriptor.log; + diff --git a/src/main/java/org/torproject/descriptor/package-info.java b/src/main/java/org/torproject/descriptor/package-info.java index 0410bac..d844d40 100644 --- a/src/main/java/org/torproject/descriptor/package-info.java +++ b/src/main/java/org/torproject/descriptor/package-info.java @@ -65,9 +65,11 @@ * connected to the Tor network rather than by the Tor software. This * group comprises descriptors by the bridge distribution service BridgeDB * ({@link org.torproject.descriptor.BridgePoolAssignment}), the exit list - * service TorDNSEL ({@link org.torproject.descriptor.ExitList}), and the + * service TorDNSEL ({@link org.torproject.descriptor.ExitList}), the * performance measurement service Torperf - * ({@link org.torproject.descriptor.TorperfResult}).</li> + * ({@link org.torproject.descriptor.TorperfResult}), and sanitized access logs + * of Tor's web servers + * ({@link org.torproject.descriptor.WebServerAccessLog}).</li> * </ol> * * @since 1.0.0 diff --git a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java new file mode 100644 index 0000000..b12cfc0 --- /dev/null +++ b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java @@ -0,0 +1,178 @@ + +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static 
org.junit.Assert.assertTrue; + +import org.torproject.descriptor.Descriptor; +import org.torproject.descriptor.DescriptorParser; +import org.torproject.descriptor.DescriptorReader; +import org.torproject.descriptor.DescriptorSourceFactory; +import org.torproject.descriptor.LogDescriptor; +import org.torproject.descriptor.UnparseableDescriptor; +import org.torproject.descriptor.WebServerAccessLog; + +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +@RunWith(Parameterized.class) +public class LogDescriptorTest { + + /** Temporary folder containing all files for this test. */ + @Rule + public TemporaryFolder temp = new TemporaryFolder(); + + /** Directory containing two input descriptor files. */ + protected File indir; + + /** Descriptor reader used in this test. */ + protected DescriptorReader reader + = DescriptorSourceFactory.createDescriptorReader(); + + protected int size; + protected String[] pan; + protected Class<LogDescriptor> type; + protected boolean isDecompressionTest; + + /** All types of data that can be encountered during sync. 
*/ + @Parameters + public static Collection<Object[]> pathAndName() { + return Arrays.asList(new Object[][] { + {Boolean.TRUE, 1878, new String[]{"meronense.torproject.org", + "metrics.torproject.org_meronense.torproject.org_access.log" + + "_20170530.gz", + "metrics.torproject.org", "20170530", "gz"}, + WebServerAccessLog.class}, + {Boolean.FALSE, 1878, new String[]{"meronense.torproject.org", + "xy.host.org_meronense.torproject.org_access.log_20170530.log", + "metrics.torproject.org", "20170530", "xz"}, + WebServerAccessLog.class}, + {Boolean.TRUE, 70730, new String[]{"archeotrichon.torproject.org", + "archive.torproject.org_archeotrichon.torproject.org_access.log_" + + "20151007.xz", + "archive.torproject.org", "20151007", "xz"}, + WebServerAccessLog.class}, + {Boolean.TRUE, 0, new String[]{"dummy.host.net", + "nix.server.org_dummy.host.net_access.log_20111111.bz2", + "nix.server.org", "20111111", "bz2"}, + WebServerAccessLog.class}}); + } + + /** This constructor receives the above defined data for each run. */ + public LogDescriptorTest(boolean decompression, int size, String[] pan, + Class<LogDescriptor> type) { + this.pan = pan; + this.size = size; + this.type = type; + this.isDecompressionTest = decompression; + } + + /** Prepares the temporary folder and writes files to it for this test. */ + private void createTemporaryFolderAndContents() throws IOException { + this.indir = this.temp.newFolder(); + String path = this.pan[0]; + String name = this.pan[1]; + File logdir = new File(indir, path); + logdir.mkdir(); + File accessLogFile = new File(logdir, name); + Files.copy(getClass().getClassLoader().getResource(path + "/" + name) + .openStream(), accessLogFile.toPath()); + } + + /** Read the test files. 
*/ + @Before + public void readAll() throws IOException { + createTemporaryFolderAndContents(); + Iterator<Descriptor> descs = this.reader + .readDescriptors(this.indir).iterator(); + while (descs.hasNext()) { + descs.next(); + } + } + + protected List<Descriptor> retrieve() throws Exception { + assertEquals(1, this.reader.getParsedFiles().size()); + File logFile = new File(this.reader.getParsedFiles().firstKey()); + byte[] raw = Files.readAllBytes(logFile.toPath()); + DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser(); + List<Descriptor> descs = new ArrayList<>(); + for (Descriptor desc + : dp.parseDescriptors(raw, logFile, logFile.getName())) { + descs.add(desc); + } + return descs; + } + + @Test + public void testParsing() throws Exception { + List<Descriptor> descs = retrieve(); + assertTrue("Wrong type. " + dataUsed(), + (descs.get(0) instanceof LogDescriptor)); + InternalLogDescriptor ld = (InternalLogDescriptor) descs.get(0); + assertEquals("Wrong compression type string. 
" + dataUsed(), + pan[4], ld.getCompressionType()); + } + + private String dataUsed() { + return "Used data: " + Arrays.toString(pan); + } + + @Test + public void testUnknownLogType() throws Exception { + assertEquals(dataUsed(), 1, this.reader.getParsedFiles().size()); + File logFile = new File(this.reader.getParsedFiles().firstKey()); + DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser(); + File invalidFile = new File(this.reader.getParsedFiles().firstKey() + .replace("access", "-")); + List<Descriptor> descs = new ArrayList<>(); + for (Descriptor desc // note: only 'invalidFile' is used by LogDescriptor + : dp.parseDescriptors(new byte[]{}, invalidFile, logFile.getName())) { + descs.add(desc); + } + assertTrue(dataUsed() + "\nWrong type: " + + Arrays.toString(descs.get(0).getClass().getInterfaces()), + (descs.get(0) instanceof UnparseableDescriptor)); + } + + @Test + public void testCompressionInvalid() throws Exception { + if (!isDecompressionTest) { + return; + } + assertEquals(1, this.reader.getParsedFiles().size()); + File logFile = new File(this.reader.getParsedFiles().firstKey()); + byte[] raw = Files.readAllBytes(logFile.toPath()); + for (int i = 0; i < 3; i++) { + raw[0] = (byte) i; + } + DescriptorParser dp = DescriptorSourceFactory.createDescriptorParser(); + List<Descriptor> descs = new ArrayList<>(); + for (Descriptor desc + : dp.parseDescriptors(raw, logFile, logFile.getName())) { + descs.add(desc); + } + assertTrue(dataUsed() + "\nWrong type: " + + Arrays.toString(descs.get(0).getClass().getInterfaces()), + (descs.get(0) instanceof UnparseableDescriptor)); + assertArrayEquals(dataUsed(), raw, descs.get(0).getRawDescriptorBytes()); + } +} + diff --git a/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java new file mode 100644 index 0000000..ec23b61 --- /dev/null +++ 
b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogLineTest.java @@ -0,0 +1,140 @@ +/* Copyright 2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; +import org.junit.runners.Parameterized.Parameters; + +import java.util.Arrays; +import java.util.Collection; + +@RunWith(Parameterized.class) +public class WebServerAccessLogLineTest { + + /** Test data structure: + * reference date, real log line, cleaned line, is valid. + */ + @Parameters + public static Collection<Object[]> logData() { + return Arrays.asList(new Object[][] { + { "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] "GET " + + "/collector/archive HTTP/1.1" 301 -", + "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] "GET " + + "/collector/archive HTTP/1.1" 301 -", Boolean.TRUE}, + { "0.0.0.0 - - [22/Jan/2018:00:00:00 +0000] "GET " + + "/collector/archive HTTP/1.1" 301 X "ccc"", + "", Boolean.FALSE}, + { "123.98.100.23 xyz xyz [22/Jan/2018:01:20:03 +0000] "GET " + + "/collector/archive HTTP/1.1" 301 - xyz abc xxxXXXXXXXX", + "123.98.100.23 - - [22/Jan/2018:00:00:00 +0000] "GET " + + "/collector/archive HTTP/1.1" 301 -", Boolean.TRUE}, + { "127.0.0.1 abc xyz [03/May/2017:06:07:08 +0000] " + + ""GET /server-status HTTP/1.1" 303 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "127.0.0.1 - - [03/May/2017:00:00:00 +0000] "GET /server-status" + + " HTTP/1.1" 303 294", Boolean.TRUE}, + { "127.0.0.1 abc xyz [03/May/2017:06:07:08 +0000] " + + ""GET /server-status?auto HTTP/1.1" 303 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "127.0.0.1 - - [03/May/2017:00:00:00 +0000] "GET /server-status" + + "?auto HTTP/1.1" 303 294", Boolean.TRUE}, + { "42.41.40.39 - - [04/May/2017:06:07:08 +0000] " + + ""HEAD /server-status?auto HTTP/1.1" 200 294 " 
+ + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "42.41.40.39 - - [04/May/2017:00:00:00 +0000] "HEAD /server-status" + + "?auto HTTP/1.1" 200 294", Boolean.TRUE}, + { "42.41.39 - - [04/May/2017:06:07:08 +0000] " + + ""HEAD /server-status?auto HTTP/1.1" 200 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "", Boolean.FALSE}, + { "42.41.40.1039 - - [04/May/2017:06:07:08 +0000] " + + ""HEAD /server-status?auto HTTP/1.1" 200 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "", Boolean.FALSE}, + { "42.41.40_039 - - [04/May/2017:06:07:08 +0000] " + + ""HEAD /server-status?auto HTTP/1.1" 200 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "", Boolean.FALSE}, + { "0.0.0.2 - - [05/May/2017:15:16:17 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "404 536 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;" + + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;"", + "0.0.0.2 - - [05/May/2017:00:00:00 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "404 536", + Boolean.TRUE}, + { "0.0.0.99 - - [05/June/2017:15:16:17 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico FTP/1.0" " + + "300 536 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;" + + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;"", + "", Boolean.FALSE}, + { "0.0.0.99 - - [05/Jun/2017:15:16:17 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico FTP/1.0" " + + "300 536 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;" + + " SLCC1; .NET CLR 2.0.50; Media Center PC 5.0; .NET CLR 3.5.2;"", + "0.0.0.99 - - [05/Jun/2017:00:00:00 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico FTP/1.0" 300 536", + Boolean.TRUE}, + { "0.0.0.7 - - [06/May/2017:00:16:17 +0100] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "333 536 "-" "Mozilla/4.0 (compatible; Opera 7.0; Windows 6.0;" + + " funky values ; "", + "0.0.0.7 - - 
[05/May/2017:00:00:00 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "333 536", Boolean.TRUE}, + { "0.0.0.1 - - [07/Dec/2016:20:16:18 -1000] " + + ""GET http://t3.torproject.org/?query=what HTTP/1.1" " + + "200 777 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows 10;" + + " SLCC1; .NET CLR 2.0; Media Center PC 5.0; .NET CLR 3.5.2)"", + "0.0.0.1 - - [08/Dec/2016:00:00:00 +0000] " + + ""GET http://t3.torproject.org/?query=what HTTP/1.1" 200 777", + Boolean.TRUE}, + { "abcdefghijklmnop1234567890", "", Boolean.FALSE}, + { "", "", Boolean.FALSE}, + { "0.0.0.7 - - [06/May/2017:00:16:17 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "333 536 "-" "Mozilla/4.0 (compatible; Opera 7.0; Windows 8.0;", + "0.0.0.7 - - [06/May/2017:00:00:00 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "333 536", Boolean.TRUE}, + { "0.0.0.7 - - [06/May/2017:00:16:17 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "333 536 "-" "Mozilla/4.0 (compatible; Opera 7.0; Windows XT;", + "0.0.0.7 - - [06/May/2017:00:00:00 +0000] " + + ""GET http://metrics.torproject.org/favicon.ico HTTP/1.1" " + + "333 536", Boolean.TRUE}, + { "0.0.0.0 - - [08/May/2017:00:00:00 +0000] " + + ""GET /server-status HTTP/1.1" 200 1294", + "0.0.0.0 - - [08/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1294", Boolean.TRUE} + }); + } + + @Parameter(0) + public String real; + + @Parameter(1) + public String clean; + + @Parameter(2) + public boolean valid; + + @Test + public void testValidation() { + WebServerAccessLogLine line = WebServerAccessLogLine.makeLine(real); + assertEquals("Failed on line: " + real, valid, line.isValid()); + assertEquals("Failed on line: " + real, clean, line.toLogString()); + if (valid && !"".equals(clean)) { // A cleaned, accepted line is valid. 
+ assertEquals("Failed on line: " + clean, clean, + WebServerAccessLogLine.makeLine(clean).toLogString()); + } + } + +} + diff --git a/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java new file mode 100644 index 0000000..3e98f13 --- /dev/null +++ b/src/test/java/org/torproject/descriptor/log/WebServerAccessLogTest.java @@ -0,0 +1,94 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import static org.junit.Assert.assertEquals; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import java.io.File; +import java.util.Arrays; +import java.util.Collection; + +@RunWith(Parameterized.class) +public class WebServerAccessLogTest { + + /** Test data structure: given line, cleaned line, valid, filename. */ + @Parameters + public static Collection<Object[]> logData() { + return Arrays.asList(new Object[][] { + { "0.0.0.0 - - [20/Sep/2017:00:00:00 +0000] " + + ""GET /fonts/WOFF/OTF/SourceSansPro-It.otf.woff HTTP/1.1" " + + "200 50556 "https://metrics.torproject.org/%5C" "-" -", + "0.0.0.0 - - [20/Sep/2017:00:00:00 +0000] "GET " + + "/fonts/WOFF/OTF/SourceSansPro-It.otf.woff HTTP/1.1" 200 50556\n", + Boolean.TRUE, "virt.host0_phys.host1a_access.log_20170920"}, + { "127.0.0.1 qwer 123 [30/May/2017:06:07:08 +0000] " + + ""GET /server-status?auto HTTP/1.1" 333 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET /server-status" + + " HTTP/1.1" 333 294\n", Boolean.TRUE, + "virt.host1_phys.host2a_access.log_20170530"}, + { "0.0.0.3 abc 567 [30/May/2017:06:07:08 +0000] " + + ""GET /server-status?auto HTTP/1.1" 333 294 " + + ""-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)"", + "0.0.0.3 - - [30/May/2017:00:00:00 +0000] "GET /server-status" + + " 
HTTP/1.1" 333 294\n", Boolean.TRUE, + "virt-host1_phys.host2a_access.log_20170530"}, + { "11.22.33.44 - - [30/Jul/2017:15:16:17 +0000] " + + ""GET http://www.torproject.org/favicon.ico HTTP/1.1" " + + "100 536 "-" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0;" + + " SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.5.2; " + + ".NET CLR 3.5.30729; .NET CLR 3.0.30618)"", + "0.0.0.0 - - [30/Jul/2017:00:00:00 +0000] " + + ""GET http://www.torproject.org/favicon.ico HTTP/1.1" " + + "100 536\n", Boolean.TRUE, + "virt.host1_phys.host2b_access.log_20170730"}, + { "abcdefghijklmnop1234567890", "", Boolean.FALSE, + "vhost1_phys.host2c_access.log_20170731.log"}, + { "", "", Boolean.FALSE, "host2d_host1_access.log_20170731.log"}, + { "0.0.0.0 - - [30/May/2017:00:00:00 +0000] " + + ""GET /server-status HTTP/1.1" 200 1294 "-" "-" -", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1294\n", Boolean.TRUE, + "some/other/path/virtual_physical_access.log_20170530.log"} + }); + } + + private String real; + private String clean; + private int count; + private boolean valid; + private String fn; + private File file; + + /** Set the above test data. 
*/ + public WebServerAccessLogTest(String in, String out, boolean valid, + String filename) { + this.real = in; + this.clean = out; + this.valid = valid; + this.fn = filename; + this.file = new File(filename); + } + + @Test + public void testValidation() throws Exception { + WebServerAccessLogImpl wsal + = new WebServerAccessLogImpl(real.getBytes(), file); + wsal.validate(); + if (valid) { + assertEquals(0, wsal.getUnrecognizedLines().size()); + } else { + if (!real.isEmpty()) { + assertEquals(real, wsal.getUnrecognizedLines().get(0)); + } + } + } + +} + diff --git a/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java b/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java new file mode 100644 index 0000000..a11bc30 --- /dev/null +++ b/src/test/java/org/torproject/descriptor/log/WebServerModuleTest.java @@ -0,0 +1,113 @@ +/* Copyright 2017--2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.descriptor.log; + +import static org.junit.Assert.assertEquals; + +import org.torproject.descriptor.DescriptorParseException; + +import org.hamcrest.Matchers; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.io.File; +import java.util.Arrays; +import java.util.stream.Collectors; + +/** This class contains various tests for the webstats module. 
*/ +public class WebServerModuleTest { + + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void testWrongFormat() throws Exception { + String filename = "h1_phys1_access.log_no-date.log"; + thrown.expect(DescriptorParseException.class); + thrown.expectMessage(Matchers + .containsString("Cannot parse WebServerAccessLog file: " + + filename)); + new WebServerAccessLogImpl(new byte[0], + new File(filename)); + } + + @Test + public void testDateFormat() throws Exception { + String filename = "h2_phys2_access.log_05001713"; + thrown.expect(DescriptorParseException.class); + thrown.expectMessage(Matchers + .containsString("Cannot parse WebServerAccessLog file: " + + filename)); + new WebServerAccessLogImpl(new byte[0], + new File(filename)); + } + + @Test + public void testNoParentPathRoot() throws Exception { + String filename = "h3_access.log_05001213"; + thrown.expect(DescriptorParseException.class); + thrown.expectMessage(Matchers + .containsString("WebServerAccessLog " + + "file name doesn't comply to standard: " + filename)); + new WebServerAccessLogImpl(new byte[0], + new File(filename)); + } + + @Test + public void testNoParentPathThis() throws Exception { + String filename = "_h3_access.log_05001213"; + thrown.expect(DescriptorParseException.class); + thrown.expectMessage(Matchers + .containsString("WebServerAccessLog " + + "file name doesn't comply to standard: " + filename)); + new WebServerAccessLogImpl(new byte[0], + new File(filename)); + } + + @Test + public void testNoParentPathParent() throws Exception { + String filename = "h3__access.log_05001213"; + thrown.expect(DescriptorParseException.class); + thrown.expectMessage(Matchers + .containsString("WebServerAccessLog " + + "file name doesn't comply to standard: " + filename)); + new WebServerAccessLogImpl(new byte[0], + new File(filename)); + } + + private static String[] logLines = { + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 
200 1205", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1203", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1207", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1204", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1202", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1206", + "0.0.0.0 - - [30/May/2017:00:00:00 +0000] "GET " + + "/server-status HTTP/1.1" 200 1201" + }; + + private static String logText = Arrays.asList(logLines).stream() + .map((String line) -> line + (" some content")) + .collect(Collectors.joining("\n")); + + @Test + public void testBasics() throws Exception { + WebServerAccessLogImpl wsal = new WebServerAccessLogImpl(logText.getBytes(), + new File("vhost_host7_access.log_20170530")); + assertEquals(wsal.getAnnotations().size(), 0); + assertEquals(logText, new String(wsal.getRawDescriptorBytes())); + assertEquals("host7", wsal.getPhysicalHost()); + assertEquals("vhost", wsal.getVirtualHost()); + } + +} + diff --git a/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz new file mode 100644 index 0000000..b459742 Binary files /dev/null and b/src/test/resources/archeotrichon.torproject.org/archive.torproject.org_archeotrichon.torproject.org_access.log_20151007.xz differ diff --git a/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 b/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 new file mode 100644 index 0000000..17f335d Binary files /dev/null and b/src/test/resources/dummy.host.net/nix.server.org_dummy.host.net_access.log_20111111.bz2 differ diff --git 
a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz new file mode 100644 index 0000000..8c2333b Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170530.gz differ diff --git a/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz new file mode 100644 index 0000000..8c2333b Binary files /dev/null and b/src/test/resources/meronense.torproject.org/metrics.torproject.org_meronense.torproject.org_access.log_20170531.gz differ diff --git a/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log new file mode 100644 index 0000000..eee478b --- /dev/null +++ b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170530.log @@ -0,0 +1,26 @@ +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2873 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:37:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2877 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - 
[30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:39:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)" +123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:42:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:49:50 +0000] "GET / HTTP/1.1" 200 868 "-" "Wget/1.15 (linux-gnu)" +123.456.789.0 - - [30/May/2017:06:52:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2872 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" 
+123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:54:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)" +123.456.789.0 - - [30/May/2017:06:56:54 +0000] "-" 408 0 "-" "-" +123.456.789.0 - - [30/May/2017:06:57:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2876 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" + diff --git a/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log new file mode 100644 index 0000000..eee478b --- /dev/null +++ b/src/test/resources/meronense.torproject.org/xy.host.org_meronense.torproject.org_access.log_20170607.log @@ -0,0 +1,26 @@ +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2875 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:32:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2873 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:37:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2877 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - 
[30/May/2017:06:37:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:39:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)" +123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:42:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:42:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2795 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2878 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:47:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2880 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:49:50 +0000] "GET / HTTP/1.1" 200 868 "-" "Wget/1.15 (linux-gnu)" +123.456.789.0 - - [30/May/2017:06:52:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2872 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "apache-munin/1.0libwww-perl/6.08" +123.456.789.0 - - [30/May/2017:06:52:40 +0000] "GET /server-status?auto HTTP/1.1" 200 2874 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" 
+123.456.789.0 - - [30/May/2017:06:54:44 +0000] "GET / HTTP/1.1" 200 868 "-" "check_http/v2.1.1 (monitoring-plugins 2.1.1)" +123.456.789.0 - - [30/May/2017:06:56:54 +0000] "-" 408 0 "-" "-" +123.456.789.0 - - [30/May/2017:06:57:39 +0000] "GET /server-status?auto HTTP/1.1" 200 2876 "-" "munin/2.0.25-1+deb8u3 (libwww-perl/6.08)" +