[tor-commits] [metrics-lib/master] Accommodate logs with more than Integer.MAX_VALUE lines.

karsten at torproject.org karsten at torproject.org
Mon Feb 26 13:09:56 UTC 2018


commit 35feb816f81f26bcc9dc035a1aaf496c34a86647
Author: iwakeh <iwakeh at torproject.org>
Date:   Fri Feb 16 09:05:46 2018 +0000

    Accommodate logs with more than Integer.MAX_VALUE lines.
    
    Implements task-23046.
---
 .../org/torproject/descriptor/LogDescriptor.java   | 10 +++++--
 .../torproject/descriptor/WebServerAccessLog.java  |  6 ++++
 .../descriptor/log/WebServerAccessLogImpl.java     | 32 ++++++++++++++++++----
 .../descriptor/log/LogDescriptorTest.java          |  5 +++-
 4 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/src/main/java/org/torproject/descriptor/LogDescriptor.java b/src/main/java/org/torproject/descriptor/LogDescriptor.java
index 826fcda..8dd8460 100644
--- a/src/main/java/org/torproject/descriptor/LogDescriptor.java
+++ b/src/main/java/org/torproject/descriptor/LogDescriptor.java
@@ -5,6 +5,7 @@ package org.torproject.descriptor;
 
 import java.io.InputStream;
 import java.util.List;
+import java.util.stream.Stream;
 
 /**
  * Contains a log file.
@@ -64,11 +65,14 @@ public interface LogDescriptor extends Descriptor {
   public List<String> getUnrecognizedLines();
 
   /**
-   * Returns a list of all parseable log lines.
-   * <p>Might require a lot of memory depending on log size.</p>
+   * Returns a stream of all parseable log lines.
+   * <p>Depending on log size this might not fit into a collection type.</p>
+   *
+   * @since 2.2.0
    */
-  public List<? extends Line> logLines() throws DescriptorParseException;
+  public Stream<? extends Line> logLines() throws DescriptorParseException;
 
+  /** Base interface for accessing log lines. */
   public interface Line {
 
     /** Returns a log line string. */
diff --git a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
index b4f1940..5f3ad73 100644
--- a/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
+++ b/src/main/java/org/torproject/descriptor/WebServerAccessLog.java
@@ -62,6 +62,12 @@ public interface WebServerAccessLog extends LogDescriptor {
   @Override
   public List<String> getUnrecognizedLines();
 
+  /**
+   * Facilitates access to all log line fields that don't only contain
+   * default values post sanitization.
+   *
+   * @since 2.2.0
+   */
   public interface Line extends LogDescriptor.Line {
 
     /** Returns the IP address of the requesting host. */
diff --git a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
index e48a262..3666d5d 100644
--- a/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
+++ b/src/main/java/org/torproject/descriptor/log/WebServerAccessLogImpl.java
@@ -15,10 +15,11 @@ import java.io.File;
 import java.io.InputStreamReader;
 import java.time.LocalDate;
 import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 /**
  * Implementation of web server access log descriptors.
@@ -128,15 +129,34 @@ public class WebServerAccessLogImpl extends LogDescriptorImpl
     return this.logDate;
   }
 
-  /** Returns a list of all valid log lines. */
+  private static final int LISTLIMIT = Integer.MAX_VALUE / 2;
+
+  /** Returns a stream of all valid log lines. */
   @Override
-  public List<WebServerAccessLog.Line> logLines()
+  public Stream<WebServerAccessLog.Line> logLines()
       throws DescriptorParseException {
     try (BufferedReader br = new BufferedReader(new InputStreamReader(
         this.decompressedByteStream()))) {
-      return br.lines().map(line
-          -> (WebServerAccessLog.Line) WebServerAccessLogLine.makeLine(line))
-        .filter(line -> line.isValid()).collect(Collectors.toList());
+      List<List<WebServerAccessLogLine>> lists = new ArrayList<>();
+      List<WebServerAccessLogLine> currentList = new ArrayList<>();
+      lists.add(currentList);
+      String lineStr = br.readLine();
+      int count = 0;
+      while (null != lineStr) {
+        WebServerAccessLogLine wsal = WebServerAccessLogLine.makeLine(lineStr);
+        if (wsal.isValid()) {
+          currentList.add(wsal);
+          count++;
+        }
+        if (count >= LISTLIMIT) {
+          currentList = new ArrayList<>();
+          lists.add(currentList);
+          count = 0;
+        }
+        lineStr = br.readLine();
+      }
+      br.close();
+      return lists.stream().flatMap(list -> list.stream());
     } catch (Exception ex) {
       throw new DescriptorParseException("Cannot retrieve log lines.", ex);
     }
diff --git a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
index 67ba638..0ff3e62 100644
--- a/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
+++ b/src/test/java/org/torproject/descriptor/log/LogDescriptorTest.java
@@ -3,6 +3,8 @@
 
 package org.torproject.descriptor.log;
 
+import static java.util.stream.Collectors.toList;
+
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -130,7 +132,8 @@ public class LogDescriptorTest {
     InternalLogDescriptor ld = (InternalLogDescriptor) descs.get(0);
     assertEquals("Wrong compression type string. " + dataUsed(),
         pan[4], ld.getCompressionType());
-    List<? extends LogDescriptor.Line> lines = ld.logLines();
+    List<? extends LogDescriptor.Line> lines
+        = ld.logLines().collect(toList());
     assertEquals(this.lineCount, lines.size());
   }
 



More information about the tor-commits mailing list