commit 35feb816f81f26bcc9dc035a1aaf496c34a86647 Author: iwakeh iwakeh@torproject.org Date: Fri Feb 16 09:05:46 2018 +0000
Accommodate logs with more than Integer.MAX_VALUE lines.
Implements task-23046. --- .../org/torproject/descriptor/LogDescriptor.java | 10 +++++-- .../torproject/descriptor/WebServerAccessLog.java | 6 ++++ .../descriptor/log/WebServerAccessLogImpl.java | 32 ++++++++++++++++++---- .../descriptor/log/LogDescriptorTest.java | 5 +++- 4 files changed, 43 insertions(+), 10 deletions(-)
diff --git a/src/main/java/org/torproject/descriptor/LogDescriptor.java b/src/main/java/org/torproject/descriptor/LogDescriptor.java index 826fcda..8dd8460 100644 --- a/src/main/java/org/torproject/descriptor/LogDescriptor.java +++ b/src/main/java/org/torproject/descriptor/LogDescriptor.java @@ -5,6 +5,7 @@ package org.torproject.descriptor;
import java.io.InputStream; import java.util.List; +import java.util.stream.Stream;
/** * Contains a log file. @@ -64,11 +65,14 @@ public interface LogDescriptor extends Descriptor { public List<String> getUnrecognizedLines();
/** - * Returns a list of all parseable log lines. - * <p>Might require a lot of memory depending on log size.</p> + * Returns a stream of all parseable log lines. + * <p>Depending on log size this might not fit into a collection type.</p> + * + * @since 2.2.0 */ - public List<? extends Line> logLines() throws DescriptorParseException; + public Stream<? extends Line> logLines() throws DescriptorParseException;
+ /** Base interface for accessing log lines. */ public interface Line {
/** Returns a log line string. */ diff --git a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java index b4f1940..5f3ad73 100644 --- a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java +++ b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java @@ -62,6 +62,12 @@ public interface WebServerAccessLog extends LogDescriptor { @Override public List<String> getUnrecognizedLines();
+ /** + * Facilitates access to all log line fields that don't only contain + * default values post sanitization. + * + * @since 2.2.0 + */ public interface Line extends LogDescriptor.Line {
/** Returns the IP address of the requesting host. */ diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java index e48a262..3666d5d 100644 --- a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java +++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java @@ -15,10 +15,11 @@ import java.io.File; import java.io.InputStreamReader; import java.time.LocalDate; import java.time.format.DateTimeFormatter; +import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; +import java.util.stream.Stream;
/** * Implementation of web server access log descriptors. @@ -128,15 +129,34 @@ public class WebServerAccessLogImpl extends LogDescriptorImpl return this.logDate; }
- /** Returns a list of all valid log lines. */ + private static final int LISTLIMIT = Integer.MAX_VALUE / 2; + + /** Returns a stream of all valid log lines. */ @Override - public List<WebServerAccessLog.Line> logLines() + public Stream<WebServerAccessLog.Line> logLines() throws DescriptorParseException { try (BufferedReader br = new BufferedReader(new InputStreamReader( this.decompressedByteStream()))) { - return br.lines().map(line - -> (WebServerAccessLog.Line) WebServerAccessLogLine.makeLine(line)) - .filter(line -> line.isValid()).collect(Collectors.toList()); + List<List<WebServerAccessLogLine>> lists = new ArrayList<>(); + List<WebServerAccessLogLine> currentList = new ArrayList<>(); + lists.add(currentList); + String lineStr = br.readLine(); + int count = 0; + while (null != lineStr) { + WebServerAccessLogLine wsal = WebServerAccessLogLine.makeLine(lineStr); + if (wsal.isValid()) { + currentList.add(wsal); + count++; + } + if (count >= LISTLIMIT) { + currentList = new ArrayList<>(); + lists.add(currentList); + count = 0; + } + lineStr = br.readLine(); + } + br.close(); + return lists.stream().flatMap(list -> list.stream()); } catch (Exception ex) { throw new DescriptorParseException("Cannot retrieve log lines.", ex); } diff --git a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java index 67ba638..0ff3e62 100644 --- a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java +++ b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java @@ -3,6 +3,8 @@
package org.torproject.descriptor.log;
+import static java.util.stream.Collectors.toList; + import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -130,7 +132,8 @@ public class LogDescriptorTest { InternalLogDescriptor ld = (InternalLogDescriptor) descs.get(0); assertEquals("Wrong compression type string. " + dataUsed(), pan[4], ld.getCompressionType()); - List<? extends LogDescriptor.Line> lines = ld.logLines(); + List<? extends LogDescriptor.Line> lines + = ld.logLines().collect(toList()); assertEquals(this.lineCount, lines.size()); }