commit 7be397890d2ff66d7479b52aa245afdd3d487f9d Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Thu Mar 22 21:37:28 2018 +0100
Adapt webstats to read logs from CollecTor.
In this change we: - update metrics-lib to 2.2.0, - start downloading and processing logs from CollecTor rather than from webstats.torproject.org, - change log line counts from int to long, - remove webstats tests which are now contained in metrics-lib, - update the CollecTor page, and - take out the beta notice from the Tor web server logs page.
Implements #25520. --- build.xml | 2 +- .../metrics/stats/collectdescs/Main.java | 3 +- .../torproject/metrics/stats/webstats/Main.java | 223 ++++++--------------- src/main/resources/spec/web-server-logs.xml | 4 - src/main/resources/web/jsps/collector.jsp | 38 ++++ src/main/resources/web/jsps/web-server-logs.jsp | 16 +- src/main/sql/webstats/init-webstats.sql | 2 +- .../metrics/stats/webstats/MainTest.java | 110 ---------- 8 files changed, 106 insertions(+), 292 deletions(-)
diff --git a/build.xml b/build.xml index e98757e..57eab68 100644 --- a/build.xml +++ b/build.xml @@ -9,7 +9,7 @@ <property name="javadoc-title" value="MetricsWeb API Documentation"/> <property name="implementation-title" value="metrics-web" /> <property name="release.version" value="1.0.3-dev" /> - <property name="metricslibversion" value="2.1.1" /> + <property name="metricslibversion" value="2.2.0" /> <property name="jetty.version" value="-9.2.21.v20170120" /> <property name="warfile" value="metrics-web-${release.version}.war"/> diff --git a/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java b/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java index 04dc86d..4c64425 100644 --- a/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java +++ b/src/main/java/org/torproject/metrics/stats/collectdescs/Main.java @@ -24,7 +24,8 @@ public class Main { "/recent/relay-descriptors/consensuses/", "/recent/relay-descriptors/extra-infos/", "/recent/relay-descriptors/server-descriptors/", - "/recent/torperf/" + "/recent/torperf/", + "/recent/webstats/" }, 0L, new File("../../shared/in"), true); } } diff --git a/src/main/java/org/torproject/metrics/stats/webstats/Main.java b/src/main/java/org/torproject/metrics/stats/webstats/Main.java index f70963f..5d11114 100644 --- a/src/main/java/org/torproject/metrics/stats/webstats/Main.java +++ b/src/main/java/org/torproject/metrics/stats/webstats/Main.java @@ -3,14 +3,19 @@
package org.torproject.metrics.stats.webstats;
-import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import static java.util.stream.Collectors.counting; +import static java.util.stream.Collectors.groupingByConcurrent; + +import org.torproject.descriptor.Descriptor; +import org.torproject.descriptor.DescriptorParseException; +import org.torproject.descriptor.DescriptorSourceFactory; +import org.torproject.descriptor.WebServerAccessLog; + import org.slf4j.Logger; import org.slf4j.LoggerFactory;
-import java.io.BufferedReader; +import java.io.File; import java.io.IOException; -import java.io.InputStreamReader; -import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -23,19 +28,17 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.text.DateFormat; -import java.text.ParseException; import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Calendar; -import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.SortedSet; import java.util.TimeZone; import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern;
/** Main class of the webstats module that downloads log files from the server, * imports them into a database, and exports aggregate statistics to a CSV @@ -45,26 +48,6 @@ public class Main { /** Logger for this class. */ private static Logger log = LoggerFactory.getLogger(Main.class);
- /** Pattern for links contained in directory listings. */ - static final Pattern URL_STRING_PATTERN = - Pattern.compile(".*<a href=\"([^\"]+)\">.*"); - - static final Pattern LOG_FILE_URL_PATTERN = - Pattern.compile("^.*/([^/]+)/([^/]+)-access.log-(\\d{8}).xz$"); - - private static DateFormat logDateFormat; - - static { - logDateFormat = new SimpleDateFormat("yyyyMMdd"); - logDateFormat.setLenient(false); - logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - } - - static final Pattern LOG_LINE_PATTERN = Pattern.compile( - "^0.0.0.[01] - - \\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] " - + "\"(GET|HEAD) ([^ ]{1,2048}) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" " - + "-$"); - private static final String LOG_DATE = "log_date";
private static final String REQUEST_TYPE = "request_type"; @@ -88,12 +71,10 @@ public class Main { log.info("Starting webstats module."); String dbUrlString = "jdbc:postgresql:webstats"; Connection connection = connectToDatabase(dbUrlString); - SortedSet<String> previouslyImportedLogFileUrls = - queryImportedFiles(connection); - String baseUrl = "https://webstats.torproject.org/out/"; - SortedSet<String> newLogFileUrls = downloadDirectoryListings(baseUrl, - previouslyImportedLogFileUrls); - importLogFiles(connection, newLogFileUrls); + SortedSet<String> skipFiles = queryImportedFileNames(connection); + importLogFiles(connection, skipFiles, + new File("../../shared/in/recent/webstats"), + new File("../../shared/in/archive/webstats")); SortedSet<String> statistics = queryWebstats(connection); writeStatistics(Paths.get("stats", "webstats.csv"), statistics); disconnectFromDatabase(connection); @@ -109,79 +90,55 @@ public class Main { return connection; }
- static SortedSet<String> queryImportedFiles(Connection connection) + static SortedSet<String> queryImportedFileNames(Connection connection) throws SQLException { - log.info("Querying URLs of previously imported log files."); + log.info("Querying previously imported log files."); SortedSet<String> importedLogFileUrls = new TreeSet<>(); Statement st = connection.createStatement(); - String queryString = "SELECT url FROM files"; + String queryString = "SELECT server, site, log_date FROM files"; + DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyyMMdd"); try (ResultSet rs = st.executeQuery(queryString)) { while (rs.next()) { - importedLogFileUrls.add(rs.getString(1)); + importedLogFileUrls.add(String.format("%s_%s_access.log_%s.xz", + rs.getString(1), rs.getString(2), + rs.getDate(3).toLocalDate().format(dateFormat))); } } - log.info("Found {} URLs of previously imported log files.", + log.info("Found {} previously imported log files.", importedLogFileUrls.size()); return importedLogFileUrls; }
- static SortedSet<String> downloadDirectoryListings(String baseUrl, - SortedSet<String> importedLogFileUrls) throws IOException { - log.info("Downloading directory listings from {}.", baseUrl); - List<String> directoryListings = new ArrayList<>(); - directoryListings.add(baseUrl); - SortedSet<String> newLogFileUrls = new TreeSet<>(); - while (!directoryListings.isEmpty()) { - String urlString = directoryListings.remove(0); - if (urlString.endsWith("/")) { - directoryListings.addAll(downloadDirectoryListing(urlString)); - } else if (!urlString.endsWith(".xz")) { - log.debug("Skipping unrecognized URL {}.", urlString); - } else if (!importedLogFileUrls.contains(urlString)) { - newLogFileUrls.add(urlString); + static void importLogFiles(Connection connection, SortedSet<String> skipFiles, + File... inDirectories) { + for (Descriptor descriptor : DescriptorSourceFactory + .createDescriptorReader().readDescriptors(inDirectories)) { + if (!(descriptor instanceof WebServerAccessLog)) { + continue; } - } - log.info("Found {} URLs of log files that have not yet been imported.", - newLogFileUrls.size()); - return newLogFileUrls; - } - - static List<String> downloadDirectoryListing(String urlString) - throws IOException { - log.debug("Downloading directory listing from {}.", urlString); - List<String> urlStrings = new ArrayList<>(); - try (BufferedReader br = new BufferedReader(new InputStreamReader( - new URL(urlString).openStream()))) { - String line; - while ((line = br.readLine()) != null) { - Matcher matcher = URL_STRING_PATTERN.matcher(line); - if (matcher.matches() && !matcher.group(1).startsWith("/")) { - urlStrings.add(urlString + matcher.group(1)); - } + WebServerAccessLog logFile = (WebServerAccessLog) descriptor; + if (skipFiles.contains(logFile.getDescriptorFile().getName())) { + continue; } - } - return urlStrings; - } - - static void importLogFiles(Connection connection, - SortedSet<String> newLogFileUrls) { - log.info("Downloading, parsing, and importing {} 
log files.", - newLogFileUrls.size()); - for (String urlString : newLogFileUrls) { try { - Object[] metaData = parseMetaData(urlString); - if (metaData == null) { - continue; - } - Map<String, Integer> parsedLogLines = downloadAndParseLogFile( - urlString); - importLogLines(connection, urlString, metaData, parsedLogLines); - } catch (IOException | ParseException exc) { - log.warn("Cannot download or parse log file with URL {}. Retrying " - + "in the next run.", urlString, exc); + Map<String, Long> parsedLogLines = logFile.logLines().parallel() + /* The following mapping can be removed with metrics-lib + version > 2.2.0 */ + .map(line -> (WebServerAccessLog.Line) line) + .collect(groupingByConcurrent(line + -> String.format("%s %s %d", line.getMethod().name(), + truncateString(line.getRequest(), 2048), line.getResponse()), + counting())); + importLogLines(connection, logFile.getDescriptorFile().getName(), + logFile.getPhysicalHost(), logFile.getVirtualHost(), + logFile.getLogDate(), parsedLogLines); + } catch (DescriptorParseException exc) { + log.warn("Cannot parse log file with file name {}. Retrying in the " + + "next run.", logFile.getDescriptorFile().getName(), exc); } catch (SQLException exc) { - log.warn("Cannot import log file with URL {} into the database. " - + "Rolling back and retrying in the next run.", urlString, exc); + log.warn("Cannot import log file with file name {} into the database. " + + "Rolling back and retrying in the next run.", + logFile.getDescriptorFile().getName(), exc); try { connection.rollback(); } catch (SQLException exceptionWhileRollingBack) { @@ -191,68 +148,9 @@ public class Main { } }
- private static Object[] parseMetaData(String urlString) - throws ParseException { - log.debug("Importing log file {}.", urlString); - if (urlString.contains("-ssl-access.log-")) { - log.debug("Skipping log file containing SSL requests with URL {}.", - urlString); - return null; - } - Matcher logFileUrlMatcher = LOG_FILE_URL_PATTERN.matcher(urlString); - if (!logFileUrlMatcher.matches()) { - log.debug("Skipping log file with unrecognized URL {}.", urlString); - return null; - } - String server = logFileUrlMatcher.group(1); - String site = logFileUrlMatcher.group(2); - long logDateMillis = logDateFormat.parse(logFileUrlMatcher.group(3)) - .getTime(); - return new Object[] { server, site, logDateMillis }; - } - - static Map<String, Integer> downloadAndParseLogFile(String urlString) - throws IOException { - int skippedLines = 0; - Map<String, Integer> parsedLogLines = new HashMap<>(); - try (BufferedReader br = new BufferedReader(new InputStreamReader( - new XZCompressorInputStream(new URL(urlString).openStream())))) { - String line; - while ((line = br.readLine()) != null) { - if (!parseLogLine(line, parsedLogLines)) { - skippedLines++; - } - } - } - if (skippedLines > 0) { - log.debug("Skipped {} lines while parsing log file {}.", skippedLines, - urlString); - } - return parsedLogLines; - } - - static boolean parseLogLine(String logLine, - Map<String, Integer> parsedLogLines) { - Matcher logLineMatcher = LOG_LINE_PATTERN.matcher(logLine); - if (!logLineMatcher.matches()) { - return false; - } - String method = logLineMatcher.group(1); - String resource = logLineMatcher.group(2); - int responseCode = Integer.parseInt(logLineMatcher.group(3)); - String combined = String.format("%s %s %d", method, resource, - responseCode); - if (!parsedLogLines.containsKey(combined)) { - parsedLogLines.put(combined, 1); - } else { - parsedLogLines.put(combined, parsedLogLines.get(combined) + 1); - } - return true; - } - private static void importLogLines(Connection connection, String 
urlString, - Object[] metaData, Map<String, Integer> parsedLogLines) - throws SQLException { + String server, String site, LocalDate logDate, + Map<String, Long> parsedLogLines) throws SQLException { PreparedStatement psFiles = connection.prepareStatement( "INSERT INTO files (url, server, site, " + LOG_DATE + ") " + "VALUES (?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS); @@ -264,20 +162,17 @@ public class Main { PreparedStatement psRequests = connection.prepareStatement( "INSERT INTO requests (file_id, method, resource_id, response_code, " + COUNT + ") VALUES (?, CAST(? AS method), ?, ?, ?)"); - String server = (String) metaData[0]; - String site = (String) metaData[1]; - long logDateMillis = (long) metaData[2]; - int fileId = insertFile(psFiles, urlString, server, site, logDateMillis); + int fileId = insertFile(psFiles, urlString, server, site, logDate); if (fileId < 0) { log.debug("Skipping previously imported log file {}.", urlString); return; } - for (Map.Entry<String, Integer> requests : parsedLogLines.entrySet()) { + for (Map.Entry<String, Long> requests : parsedLogLines.entrySet()) { String[] keyParts = requests.getKey().split(" "); String method = keyParts[0]; String resource = keyParts[1]; int responseCode = Integer.parseInt(keyParts[2]); - int count = requests.getValue(); + long count = requests.getValue(); int resourceId = insertResource(psResourcesSelect, psResourcesInsert, resource); if (resourceId < 0) { @@ -290,18 +185,18 @@ public class Main { count); } connection.commit(); - log.debug("Finished importing log file with URL {} into database.", + log.debug("Finished importing log file with file name {} into database.", urlString); }
private static int insertFile(PreparedStatement psFiles, String urlString, - String server, String site, long logDateMillis) throws SQLException { + String server, String site, LocalDate logDate) throws SQLException { int fileId = -1; psFiles.clearParameters(); psFiles.setString(1, truncateString(urlString, 2048)); psFiles.setString(2, truncateString(server, 32)); psFiles.setString(3, truncateString(site, 128)); - psFiles.setDate(4, new Date(logDateMillis)); + psFiles.setDate(4, Date.valueOf(logDate)); psFiles.execute(); try (ResultSet rs = psFiles.getGeneratedKeys()) { if (rs.next()) { @@ -312,14 +207,14 @@ public class Main { }
private static void insertRequest(PreparedStatement psRequests, int fileId, - String method, int resourceId, int responseCode, int count) + String method, int resourceId, int responseCode, long count) throws SQLException { psRequests.clearParameters(); psRequests.setInt(1, fileId); psRequests.setString(2, method); psRequests.setInt(3, resourceId); psRequests.setInt(4, responseCode); - psRequests.setInt(5, count); + psRequests.setLong(5, count); psRequests.execute(); }
diff --git a/src/main/resources/spec/web-server-logs.xml b/src/main/resources/spec/web-server-logs.xml index c180f8c..5c2011f 100644 --- a/src/main/resources/spec/web-server-logs.xml +++ b/src/main/resources/spec/web-server-logs.xml @@ -20,10 +20,6 @@ </front> <middle> <section title="Purpose of this document"> - <t>BETA: As of November 14, 2017, this document is still under - discussion and subject to change without prior notice. Feel free - to <eref target="/about.html#contact">contact us</eref> for questions or - concerns regarding this document.</t> <t>Tor's web servers, like most web servers, keep request logs for maintenance and informational purposes.</t> <t>However, unlike most other web servers, Tor's web servers use a diff --git a/src/main/resources/web/jsps/collector.jsp b/src/main/resources/web/jsps/collector.jsp index 33ae7dd..13865ba 100644 --- a/src/main/resources/web/jsps/collector.jsp +++ b/src/main/resources/web/jsps/collector.jsp @@ -168,6 +168,15 @@ <td><a href="/collector/recent/torperf/" class="btn btn-primary btn-xs pull-left"><i class="fa fa-chevron-right" aria-hidden="true"></i> recent</a> <a href="/collector/archive/torperf/" class="btn btn-primary btn-xs pull-right"><i class="fa fa-chevron-right" aria-hidden="true"></i> archive</a></td> </tr> +<tr class="tableHeadline"> + <td colspan="3"><b><a href="#webstats">Tor web server logs</a></b></td> +</tr> +<tr> + <td><a href="#type-webstats">Tor web server logs</a></td> + <td></td> + <td><a href="/collector/recent/webstats/" class="btn btn-primary btn-xs pull-left"><i class="fa fa-chevron-right" aria-hidden="true"></i> recent</a> + <a href="/collector/archive/webstats/" class="btn btn-primary btn-xs pull-right"><i class="fa fa-chevron-right" aria-hidden="true"></i> archive</a></td> +</tr> </tbody> </table>
@@ -694,6 +703,35 @@ measurement; optional.</li> <li><code>SOURCEADDRESS:</code> Public IP address of the OnionPerf host obtained by connecting to well-known servers and finding the IP address in the result, which may be <code>"unknown"</code> if OnionPerf was not able to find this information; optional.</li> </ul>
+ + +<br> +<h2 id="webstats" class="hover">Tor web server logs +<a href="#webstats" class="anchor">#</a> +</h2> + +<p> +Tor's web servers, like most web servers, keep request logs for maintenance and +informational purposes. +However, unlike most other web servers, Tor's web servers use a privacy-aware +log format that avoids logging too sensitive data about their users. +Also unlike most other web server logs, Tor's logs are neither archived nor +analyzed before performing a number of post-processing steps to further reduce +any privacy-sensitive parts. +</p> + +<h3 id="type-webstats" class="hover">Tor web server logs +<a href="/collector/recent/webstats/" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> recent</a> +<a href="/collector/archive/webstats/" class="btn btn-primary btn-xs"><i class="fa fa-chevron-right" aria-hidden="true"></i> archive</a> +<a href="#type-webstats" class="anchor">#</a> +</h3> + +<p> +The data format and sanitizing steps for Tor web server logs are specified in +detail on a separate <a href="web-server-logs.html">page</a>. +</p> + + </div>
<br> diff --git a/src/main/resources/web/jsps/web-server-logs.jsp b/src/main/resources/web/jsps/web-server-logs.jsp index 8832b2a..530b2ab 100644 --- a/src/main/resources/web/jsps/web-server-logs.jsp +++ b/src/main/resources/web/jsps/web-server-logs.jsp @@ -22,37 +22,31 @@ "#rfc.section.1">1.</a> <a href= "#n-purpose-of-this-document">Purpose of this document</a></h2> <div id="rfc.section.1.p.1"> -<p>BETA: As of November 14, 2017, this document is still under -discussion and subject to change without prior notice. Feel free to -<a href="/about.html#contact">contact us</a> for questions or -concerns regarding this document.</p> -</div> -<div id="rfc.section.1.p.2"> <p>Tor's web servers, like most web servers, keep request logs for maintenance and informational purposes.</p> </div> -<div id="rfc.section.1.p.3"> +<div id="rfc.section.1.p.2"> <p>However, unlike most other web servers, Tor's web servers use a privacy-aware log format that avoids logging too sensitive data about their users.</p> </div> -<div id="rfc.section.1.p.4"> +<div id="rfc.section.1.p.3"> <p>Also unlike most other web server logs, Tor's logs are neither archived nor analyzed before performing a number of post-processing steps to further reduce any privacy-sensitive parts.</p> </div> -<div id="rfc.section.1.p.5"> +<div id="rfc.section.1.p.4"> <p>This document describes 1) meta-data contained in log file names written by Tor's web servers, 2) the privacy-aware log format used in these files, and 3) subsequent sanitizing steps that are applied before archiving and analyzing these log files.</p> </div> -<div id="rfc.section.1.p.6"> +<div id="rfc.section.1.p.5"> <p>As a basis for our current implementation this document also describes the naming conventions for the input log files, which is just a description of the current state and subject to change.</p> </div> -<div id="rfc.section.1.p.7"> +<div id="rfc.section.1.p.6"> <p>As a convention for this document, all format strings conform to the format 
strings used by <a href= "http://httpd.apache.org/docs/current/mod/mod_log_config.html">Apache's diff --git a/src/main/sql/webstats/init-webstats.sql b/src/main/sql/webstats/init-webstats.sql index e44205f..1396fa5 100644 --- a/src/main/sql/webstats/init-webstats.sql +++ b/src/main/sql/webstats/init-webstats.sql @@ -22,7 +22,7 @@ CREATE TABLE requests ( method METHOD NOT NULL, resource_id INTEGER REFERENCES resources (resource_id) NOT NULL, response_code SMALLINT NOT NULL, - count INTEGER NOT NULL, + count BIGINT NOT NULL, UNIQUE (file_id, method, resource_id, response_code) );
diff --git a/src/test/java/org/torproject/metrics/stats/webstats/MainTest.java b/src/test/java/org/torproject/metrics/stats/webstats/MainTest.java deleted file mode 100644 index a4e88d1..0000000 --- a/src/test/java/org/torproject/metrics/stats/webstats/MainTest.java +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2017--2018 The Tor Project - * See LICENSE for licensing information */ - -package org.torproject.metrics.stats.webstats; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import org.junit.Test; - -import java.util.regex.Matcher; - -public class MainTest { - - static final String SAMPLE_LOG_FILE_NAME = - "metrics.torproject.org-access.log-20170117.xz"; - - static final String SAMPLE_SUBDIRECTORY_NAME = "meronense.torproject.org/"; - - static final String SAMPLE_LOG_FILE_URL = - "https://webstats.torproject.org/out/meronense.torproject.org/" - + "metrics.torproject.org-access.log-20170117.xz"; - - static final String[] SAMPLE_LOG_LINES = new String[] { - "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] " - + "\"GET / HTTP/1.0\" 200 10532 \"-\" \"-\" -", - "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] " - + "\"HEAD /bubbles.html HTTP/1.1\" 200 - \"-\" \"-\" -" - }; - - @Test - public void testUrlStringPatternComplete() { - Matcher matcher = Main.URL_STRING_PATTERN.matcher( - "<img src=\"/icons/unknown.gif\" alt=\"[ ]\"> " - + "<a href=\"" + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME - + "</a> 2017-01-19 19:43 5.6K "); - assertTrue(matcher.matches()); - assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1)); - } - - @Test - public void testUrlStringPatternOnlyATag() { - Matcher matcher = Main.URL_STRING_PATTERN.matcher("<a href=\"" - + SAMPLE_LOG_FILE_NAME + "\">" + SAMPLE_LOG_FILE_NAME + "</a>"); - assertTrue(matcher.matches()); - assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1)); - } - - @Test - public void testUrlStringPatternSubdirectory() { - Matcher matcher = 
Main.URL_STRING_PATTERN.matcher( - "<a href=\"" + SAMPLE_SUBDIRECTORY_NAME + "\">" - + SAMPLE_SUBDIRECTORY_NAME + "/</a>"); - assertTrue(matcher.matches()); - assertEquals(SAMPLE_SUBDIRECTORY_NAME, matcher.group(1)); - } - - @Test - public void testUrlStringPatternAnythingBetweenDoubleQuotesHtml() { - Matcher matcher = Main.URL_STRING_PATTERN.matcher( - "<a href=\"anything-between-double-quotes.html\">Link/</a>"); - assertTrue(matcher.matches()); - assertEquals("anything-between-double-quotes.html", matcher.group(1)); - } - - @Test - public void testLogFileUrlPatternComplete() { - Matcher matcher = Main.LOG_FILE_URL_PATTERN.matcher(SAMPLE_LOG_FILE_URL); - assertTrue(matcher.matches()); - assertEquals("meronense.torproject.org", matcher.group(1)); - assertEquals("metrics.torproject.org", matcher.group(2)); - assertEquals("20170117", matcher.group(3)); - } - - @Test - public void testLogLinePatternGetSlash() { - Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[0]); - assertTrue(matcher.matches()); - assertEquals("GET", matcher.group(1)); - assertEquals("/", matcher.group(2)); - assertEquals("200", matcher.group(3)); - } - - @Test - public void testLogLinePatternHeadBubbles() { - Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[1]); - assertTrue(matcher.matches()); - assertEquals("HEAD", matcher.group(1)); - assertEquals("/bubbles.html", matcher.group(2)); - assertEquals("200", matcher.group(3)); - } - - @Test - public void testLogLinePatternMaxLength() { - int maxLength = 2048; - String pre = "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] \"GET "; - String post = " HTTP/1.0\" 200 10532 \"-\" \"-\" -"; - StringBuilder sb = new StringBuilder(); - while (sb.length() <= maxLength) { - sb.append("/https://www.torproject.org"); - } - String tooLongLogLine = pre + sb.toString() + post; - assertFalse(Main.LOG_LINE_PATTERN.matcher(tooLongLogLine).matches()); - String notTooLongLogLine = pre + sb.toString().substring(0, maxLength) - + post; - 
assertTrue(Main.LOG_LINE_PATTERN.matcher(notTooLongLogLine).matches()); - } -} -