[tor-commits] [metrics-tasks/master] Add code for web logs analysis (#20008).

karsten at torproject.org karsten at torproject.org
Sun Aug 28 12:38:42 UTC 2016


commit ff63b91cdb0383b4c02ac6c82912a02e195c8b25
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Sun Aug 28 14:19:35 2016 +0200

    Add code for web logs analysis (#20008).
---
 task-20008/.gitignore        |   9 ++
 task-20008/LICENSE           |  30 ++++++
 task-20008/README.md         |  46 ++++++++++
 task-20008/run.sh            |   8 ++
 task-20008/src/Importer.java | 211 +++++++++++++++++++++++++++++++++++++++++++
 task-20008/webstats.sql      |  28 ++++++
 6 files changed, 332 insertions(+)

diff --git a/task-20008/.gitignore b/task-20008/.gitignore
new file mode 100644
index 0000000..fb5bbfa
--- /dev/null
+++ b/task-20008/.gitignore
@@ -0,0 +1,9 @@
+/.classpath
+/.project
+/.vagrant/
+/Vagrantfile
+/bin/
+/lib/
+/log
+/webstats.torproject.org/
+
diff --git a/task-20008/LICENSE b/task-20008/LICENSE
new file mode 100644
index 0000000..9dbb66d
--- /dev/null
+++ b/task-20008/LICENSE
@@ -0,0 +1,30 @@
+Copyright 2016 The Tor Project
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above
+  copyright notice, this list of conditions and the following disclaimer
+  in the documentation and/or other materials provided with the
+  distribution.
+
+* Neither the names of the copyright owners nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/task-20008/README.md b/task-20008/README.md
new file mode 100644
index 0000000..5f7e254
--- /dev/null
+++ b/task-20008/README.md
@@ -0,0 +1,46 @@
+README: Perform an ad-hoc analysis of Tor's sanitized web logs
+==============================================================
+
+Sanitized versions of Tor's Apache web logs are available at
+https://webstats.torproject.org/.  Let's perform an ad-hoc analysis of
+these logs to decide what graphs we'll want to put on Tor Metrics.  And
+let's perform this analysis by throwing everything into a PostgreSQL
+database that we might later want to re-use for Tor Metrics.
+
+Steps to import sanitized web logs into the database:
+
+Create a PostgreSQL database user webstats and a database of the same name:
+
+$ sudo -u postgres createuser -P webstats
+$ sudo -u postgres createdb -O webstats webstats
+
+Import database schema:
+
+$ psql -f webstats.sql webstats
+
+Fetch sanitized web logs and put them under webstats.torproject.org/:
+
+$ wget --recursive --reject "index.html*" --no-parent \
+  --accept "*201608*" https://webstats.torproject.org/
+
+Fetch the following required libraries and put them in the lib/
+folder:
+
+ - lib/commons-compress-1.9.jar
+ - lib/postgresql-jdbc3-9.2.jar
+ - lib/xz-1.5.jar
+
+Run the importer:
+
+$ ./run.sh
+
+Log into the database and run a simple query:
+
+$ psql webstats
+
+webstats=> SELECT log_date, SUM(count) AS hits FROM requests
+  NATURAL JOIN resources NATURAL JOIN files
+  WHERE method = 'GET' AND response_code = 200
+  AND (resource_string = '/' OR resource_string LIKE '/index%')
+  AND site = 'www.torproject.org' GROUP BY log_date ORDER BY log_date;
+
diff --git a/task-20008/run.sh b/task-20008/run.sh
new file mode 100755
index 0000000..4039ee6
--- /dev/null
+++ b/task-20008/run.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Recompile src/Importer.java from scratch and import all sanitized web logs
+# found under webstats.torproject.org/, appending the importer's standard
+# output to the file "log".  The start and end times are printed via date.
+
+# All paths below are relative to this script's directory; change into it so
+# that "rm -rf bin" can never remove a bin/ directory somewhere else.
+cd "$(dirname "$0")" || exit 1
+
+# Libraries needed both for compiling and for running the importer.
+libs=lib/commons-compress-1.9.jar:lib/postgresql-jdbc3-9.2.jar:lib/xz-1.5.jar
+
+date
+rm -rf bin
+mkdir -p bin/
+# Stop here if compilation fails; there would be no class to run.
+javac -d bin/ -cp "$libs" src/Importer.java || exit 1
+java -cp "bin:$libs" Importer webstats.torproject.org >> log
+date
+
diff --git a/task-20008/src/Importer.java b/task-20008/src/Importer.java
new file mode 100644
index 0000000..6834bf0
--- /dev/null
+++ b/task-20008/src/Importer.java
@@ -0,0 +1,261 @@
+/* Copyright 2016 The Tor Project
+ * See LICENSE for licensing information */
+
+import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.sql.Connection;
+import java.sql.Date;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Imports Tor's sanitized web server logs into the webstats database.  Each
+ * log file becomes one row in "files"; its lines are aggregated into rows in
+ * "requests" that point to shared rows in "resources". */
+public class Importer {
+
+  /** Imports all web log files below the directory named by args[0]. */
+  public static void main(String[] args) throws Exception,
+      SQLException {
+    if (args.length < 1) {
+      System.err.println("Usage: java Importer DIRECTORY");
+      System.exit(1);
+    }
+    new Importer().importFiles(new File(args[0]));
+  }
+
+  /** Walks the given directory tree breadth-first and imports every entry
+   * that is not a directory, closing the connection even if an import fails. */
+  private void importFiles(File directory) throws Exception,
+      SQLException {
+    this.connect();
+    try {
+      List<File> files = new ArrayList<>();
+      files.add(directory);
+      while (!files.isEmpty()) {
+        File file = files.remove(0);
+        File[] children = file.listFiles();
+        if (children != null) {
+          files.addAll(Arrays.asList(children));
+        } else if (file.isDirectory()) {
+          /* listFiles() returns null if the directory cannot be read. */
+          System.err.println("Cannot list directory: " + file);
+        } else {
+          this.importFile(file);
+        }
+      }
+    } finally {
+      this.disconnect();
+    }
+  }
+
+  /** Database connection with auto-commit disabled; opened by connect(). */
+  private Connection connection;
+
+  /** Inserts a row into files and returns the generated file_id. */
+  PreparedStatement psFiles;
+
+  /** Looks up the resource_id of an existing resource string. */
+  PreparedStatement psResourcesSelect;
+
+  /** Inserts a new resource string and returns the generated resource_id. */
+  PreparedStatement psResourcesInsert;
+
+  /** Inserts one aggregated row into requests. */
+  PreparedStatement psRequests;
+
+  /** Opens the database connection, turns off auto-commit so that importFile
+   * can commit or roll back whole files, and prepares all statements. */
+  void connect() throws SQLException {
+    this.connection = DriverManager.getConnection(
+        "jdbc:postgresql://localhost/webstats?user=vagrant&password=vagrant");
+    this.connection.setAutoCommit(false);
+    this.psFiles = this.connection.prepareStatement("INSERT INTO files "
+        + "(server, site, log_date) VALUES (?, ?, ?)",
+        Statement.RETURN_GENERATED_KEYS);
+    this.psResourcesSelect = this.connection.prepareStatement(
+        "SELECT resource_id FROM resources WHERE resource_string = ?");
+    this.psResourcesInsert = this.connection.prepareStatement(
+        "INSERT INTO resources (resource_string) VALUES (?)",
+        Statement.RETURN_GENERATED_KEYS);
+    this.psRequests = this.connection.prepareStatement("INSERT INTO requests "
+        + "(file_id, method, resource_id, response_code, count,"
+        + " total_bytes_sent) VALUES (?, CAST(? AS method), ?, ?, ?, ?)");
+  }
+
+  /** Closes the database connection. */
+  void disconnect() throws SQLException {
+    this.connection.close();
+  }
+
+  /** Matches log file names of the form SITE-access.log-YYYYMMDD.xz. */
+  final Pattern fileNamePattern =
+      Pattern.compile("^(.+)-access\\.log-(\\d{8})\\.xz$");
+
+  /** Parses the date part of a log file name. */
+  DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
+
+  /** Matches one sanitized log line and captures the request method, the
+   * resource, the response code, and the bytes sent (a number or "-"). */
+  final Pattern logLinePattern = Pattern.compile("^0\\.0\\.0\\.[01] - - "
+      + "\\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] "
+      + "\"(GET|HEAD) ([^ ]+) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" -$");
+
+  /** Imports one compressed log file in a single transaction.  The file is
+   * read completely and its lines are aggregated by method, resource, and
+   * response code before anything is written; then one files row and one
+   * requests row per aggregate are inserted and committed.  Files with
+   * "-ssl-access.log-" in their name, files with unexpected names, and files
+   * containing an unparseable line are reported and skipped. */
+  private void importFile(File file) throws Exception { // TODO catch exceptions!
+    /* A relative file without directory part has no parent, so resolve it
+     * to an absolute file before asking for the parent directory's name. */
+    String server = file.getAbsoluteFile().getParentFile().getName();
+    if (file.getName().contains("-ssl-access.log-")) {
+      System.out.println("Skipping file: " + file.getAbsolutePath());
+      return;
+    }
+    Matcher fileNameMatcher = this.fileNamePattern.matcher(file.getName());
+    if (!fileNameMatcher.matches()) {
+      System.err.println("Invalid file name: " + file.getAbsolutePath());
+      return;
+    }
+    String site = fileNameMatcher.group(1);
+    long logDateMillis = this.dateFormat.parse(fileNameMatcher.group(2))
+        .getTime();
+    SortedMap<String, long[]> requests = new TreeMap<>();
+    /* try-with-resources closes the streams on every exit path. */
+    try (FileInputStream fis = new FileInputStream(file);
+        BufferedReader br = new BufferedReader(new InputStreamReader(
+            new XZCompressorInputStream(fis)))) {
+      String line;
+      while ((line = br.readLine()) != null) {
+        Matcher logLineMatcher = this.logLinePattern.matcher(line);
+        if (!logLineMatcher.matches()) {
+          /* Nothing has been written for this file yet, so there is nothing
+           * to roll back. */
+          System.err.println("Invalid line: " + line);
+          return; // TODO reconsider
+        }
+        String method = logLineMatcher.group(1);
+        String resource = this.truncateString(logLineMatcher.group(2), 2048);
+        String combined = String.format("%s %s %s", method, resource,
+            logLineMatcher.group(3));
+        long bytesSent = logLineMatcher.group(4).equals("-") ? 0L
+            : Long.parseLong(logLineMatcher.group(4));
+        long[] request = requests.get(combined);
+        if (request == null) {
+          /* Value layout: { number of requests, total bytes sent }. */
+          requests.put(combined, new long[] { 1L, bytesSent });
+        } else {
+          request[0]++;
+          request[1] += bytesSent;
+        }
+      }
+    }
+    try {
+      int fileId = writeFile(server, site, logDateMillis);
+      for (Map.Entry<String, long[]> request : requests.entrySet()) {
+        /* The key parts cannot contain spaces: logLinePattern only accepts
+         * resources without spaces. */
+        String[] keyParts = request.getKey().split(" ");
+        long[] valueParts = request.getValue();
+        this.writeRequest(fileId, keyParts[0], keyParts[1],
+            Integer.parseInt(keyParts[2]), (int) valueParts[0], valueParts[1]);
+      }
+      this.connection.commit();
+    } catch (Exception e) {
+      /* Don't leave a partially imported file in the open transaction. */
+      this.connection.rollback();
+      throw e;
+    }
+  }
+
+  /** Inserts a files row for the given server (truncated to 32 characters),
+   * site (truncated to 128 characters), and log date and returns the
+   * generated file_id.  Throws an Exception if no key is returned. */
+  int writeFile(String server, String site, long logDateMillis)
+      throws Exception {
+    this.psFiles.clearParameters();
+    this.psFiles.setString(1, this.truncateString(server, 32));
+    this.psFiles.setString(2, this.truncateString(site, 128));
+    this.psFiles.setDate(3, new Date(logDateMillis));
+    this.psFiles.execute();
+    try (ResultSet resultSet = this.psFiles.getGeneratedKeys()) {
+      if (!resultSet.next()) {
+        throw new Exception("Could not retrieve auto-generated key for new "
+            + "files entry.");
+      }
+      return resultSet.getInt(1);
+    }
+  }
+
+  /** Writes one aggregated requests row, first looking up or creating the
+   * resources entry for the given resource string. */
+  void writeRequest(int fileId, String method, String resource,
+      int responseCode, int count, long totalBytesSent) throws Exception {
+    System.out.printf("Writing request to database: %d, %s, %s, %d, %d, %d%n",
+        fileId, method, resource, responseCode, count, totalBytesSent);
+    int resourceId;
+    this.psResourcesSelect.clearParameters();
+    this.psResourcesSelect.setString(1, resource);
+    /* Close the lookup's result set as soon as the id is known. */
+    try (ResultSet rs = this.psResourcesSelect.executeQuery()) {
+      resourceId = rs.next() ? rs.getInt(1) : this.insertResource(resource);
+    }
+    this.psRequests.clearParameters();
+    this.psRequests.setInt(1, fileId);
+    this.psRequests.setString(2, method);
+    this.psRequests.setInt(3, resourceId);
+    this.psRequests.setInt(4, responseCode);
+    this.psRequests.setInt(5, count);
+    this.psRequests.setLong(6, totalBytesSent);
+    this.psRequests.execute();
+  }
+
+  /** Inserts a new resources row for the given resource string and returns
+   * the generated resource_id. */
+  private int insertResource(String resource) throws Exception {
+    /* Another importer could insert the same resource between the caller's
+     * SELECT and this INSERT INTO.  The UNIQUE constraint on resource_string
+     * then makes this statement fail instead of creating a duplicate. */
+    this.psResourcesInsert.clearParameters();
+    this.psResourcesInsert.setString(1, resource);
+    this.psResourcesInsert.execute();
+    try (ResultSet resultSet = this.psResourcesInsert.getGeneratedKeys()) {
+      if (!resultSet.next()) {
+        throw new Exception("Could not retrieve auto-generated key for new "
+            + "resources entry."); // TODO better error handling
+      }
+      return resultSet.getInt(1);
+    }
+  }
+
+  /** Returns the given string unchanged if it has at most truncateAfter
+   * characters, or else logs a warning and returns only its first
+   * truncateAfter characters. */
+  private String truncateString(String originalString, int truncateAfter) {
+    if (originalString.length() <= truncateAfter) {
+      return originalString;
+    }
+    System.err.println("String too long, truncating: " + originalString);
+    return originalString.substring(0, truncateAfter);
+  }
+}
+
diff --git a/task-20008/webstats.sql b/task-20008/webstats.sql
new file mode 100644
index 0000000..91a36b6
--- /dev/null
+++ b/task-20008/webstats.sql
@@ -0,0 +1,38 @@
+-- Copyright 2016 The Tor Project
+-- See LICENSE for licensing information
+
+-- Request methods that the importer accepts in sanitized log lines.
+CREATE TYPE method AS ENUM ('GET', 'HEAD');
+
+-- One row per imported log file, identified by the server it was found
+-- under, the site it belongs to, and the log date from its file name.
+CREATE TABLE files (
+  file_id serial PRIMARY KEY,
+  server character varying(32) NOT NULL,
+  site character varying(128) NOT NULL,
+  log_date date NOT NULL,
+  UNIQUE (server, site, log_date)
+);
+
+-- One row per distinct requested resource; the importer truncates resource
+-- strings to 2048 characters before looking them up or inserting them.
+CREATE TABLE resources (
+  resource_id serial PRIMARY KEY,
+  resource_string character varying(2048) UNIQUE NOT NULL
+);
+
+-- Aggregated requests: one row per log file, method, resource, and response
+-- code with the number of matching log lines and the sum of bytes sent.
+-- The two references are NOT NULL because rows with a NULL in either column
+-- would never conflict under the UNIQUE constraint below and would not
+-- belong to any file or resource.
+CREATE TABLE requests (
+  file_id integer NOT NULL REFERENCES files (file_id),
+  method method NOT NULL,
+  resource_id integer NOT NULL REFERENCES resources (resource_id),
+  response_code smallint NOT NULL,
+  count integer NOT NULL,
+  total_bytes_sent bigint NOT NULL,
+  UNIQUE (file_id, method, resource_id, response_code)
+);
+





More information about the tor-commits mailing list