commit 2f00ddf47bae7e3f6f3d9a2776cfacabadfb58ca Author: Karsten Loesing karsten.loesing@gmx.net Date: Wed Jan 25 14:21:16 2017 +0100
Skip long resource strings.
This patch fixes a bug where two resource strings sharing the same first 2048 characters were treated as distinct internally, while the database considered them identical because it only stores the first 2048 characters. In reality, such strings come from hacking attempts or broken clients, so we may as well discard these lines entirely and not bother any further. --- .../main/java/org/torproject/metrics/webstats/Main.java | 3 ++- .../java/org/torproject/metrics/webstats/MainTest.java | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java index ea5a368..4c02a0f 100644 --- a/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java +++ b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java @@ -62,7 +62,8 @@ public class Main {
static final Pattern LOG_LINE_PATTERN = Pattern.compile( "^0.0.0.[01] - - \[\d{2}/\w{3}/\d{4}:00:00:00 \+0000\] " - + ""(GET|HEAD) ([^ ]+) HTTP[^ ]+" (\d+) (-|\d+) "-" "-" -$"); + + ""(GET|HEAD) ([^ ]{1,2048}) HTTP[^ ]+" (\d+) (-|\d+) "-" "-" " + + "-$");
private static final String LOG_DATE = "log_date";
diff --git a/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java index 1c4f0bc..7b59c54 100644 --- a/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java +++ b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java @@ -4,6 +4,7 @@ package org.torproject.metrics.webstats;
import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue;
import org.junit.Test; @@ -89,5 +90,21 @@ public class MainTest { assertEquals("/bubbles.html", matcher.group(2)); assertEquals("200", matcher.group(3)); } + + @Test + public void testLogLinePatternMaxLength() { + int maxLength = 2048; + String pre = "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "GET "; + String post = " HTTP/1.0" 200 10532 "-" "-" -"; + StringBuilder sb = new StringBuilder(); + while (sb.length() <= maxLength) { + sb.append("/https://www.torproject.org"); + } + String tooLongLogLine = pre + sb.toString() + post; + assertFalse(Main.LOG_LINE_PATTERN.matcher(tooLongLogLine).matches()); + String notTooLongLogLine = pre + sb.toString().substring(0, maxLength) + + post; + assertTrue(Main.LOG_LINE_PATTERN.matcher(notTooLongLogLine).matches()); + } }
tor-commits@lists.torproject.org