
commit ff63b91cdb0383b4c02ac6c82912a02e195c8b25 Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Sun Aug 28 14:19:35 2016 +0200 Add code for web logs analysis (#20008). --- task-20008/.gitignore | 9 ++ task-20008/LICENSE | 30 ++++++ task-20008/README.md | 46 ++++++++++ task-20008/run.sh | 8 ++ task-20008/src/Importer.java | 211 +++++++++++++++++++++++++++++++++++++++++++ task-20008/webstats.sql | 28 ++++++ 6 files changed, 332 insertions(+) diff --git a/task-20008/.gitignore b/task-20008/.gitignore new file mode 100644 index 0000000..fb5bbfa --- /dev/null +++ b/task-20008/.gitignore @@ -0,0 +1,9 @@ +/.classpath +/.project +/.vagrant/ +/Vagrantfile +/bin/ +/lib/ +/log +/webstats.torproject.org/ + diff --git a/task-20008/LICENSE b/task-20008/LICENSE new file mode 100644 index 0000000..9dbb66d --- /dev/null +++ b/task-20008/LICENSE @@ -0,0 +1,30 @@ +Copyright 2016 The Tor Project + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + +* Neither the names of the copyright owners nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/task-20008/README.md b/task-20008/README.md new file mode 100644 index 0000000..5f7e254 --- /dev/null +++ b/task-20008/README.md @@ -0,0 +1,46 @@ +README: Perform an ad-hoc analysis of Tor's sanitized web logs +============================================================== + +Sanitized versions of Tor's Apache web logs are available at +https://webstats.torproject.org/. Let's perform an ad-hoc analysis of +these logs to decide what graphs we'll want to put on Tor Metrics. And +let's perform this analysis by throwing everything into a PostgreSQL +database that we might later want to re-use for Tor Metrics. + +Steps to import sanitized web logs into the database: + +Create PostgreSQL database user webstats and database of same name: + +$ sudo -u postgres createuser -P webstats +$ sudo -u postgres createdb -O webstats webstats + +Import database schema: + +$ psql -f webstats.sql webstats + +Fetch sanitized web logs and put them under webstats.torproject.org/: + +$ wget --recursive --reject "index.html*" --no-parent \ + --accept "*201608*" https://webstats.torproject.org/ + +Fetch the following required libraries and put them in the lib/ +folder: + + - lib/commons-compress-1.9.jar + - lib/postgresql-jdbc3-9.2.jar + - lib/xz-1.5.jar + +Run the importer: + +$ ./run.sh + +Log into database and run a simple query: + +$ psql webstats + +webstats=> SELECT log_date, SUM(count) AS hits FROM requests + NATURAL JOIN resources NATURAL JOIN files + WHERE method = 'GET' AND response_code = 200 + AND (resource_string = '/' OR resource_string LIKE '/index%') + AND site = 'www.torproject.org' GROUP BY log_date ORDER BY log_date; + diff --git a/task-20008/run.sh b/task-20008/run.sh new file mode 100755 index 0000000..4039ee6 --- /dev/null +++ b/task-20008/run.sh @@ -0,0 +1,8 @@ +#!/bin/bash +date +rm -rf bin +mkdir -p bin/ +javac -d bin/ -cp lib/commons-compress-1.9.jar:lib/postgresql-jdbc3-9.2.jar:lib/xz-1.5.jar src/Importer.java +java -cp bin:lib/commons-compress-1.9.jar:lib/postgresql-jdbc3-9.2.jar:lib/xz-1.5.jar Importer webstats.torproject.org >> log +date + diff --git a/task-20008/src/Importer.java b/task-20008/src/Importer.java new file mode 100644 index 0000000..6834bf0 --- /dev/null +++ b/task-20008/src/Importer.java @@ -0,0 +1,211 @@ +/* Copyright 2016 The Tor Project + * See LICENSE for licensing information */ + +import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.sql.Connection; +import java.sql.Date; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class Importer { + + public static void main(String[] args) throws Exception, + SQLException { + new Importer().importFiles(new File(args[0])); + } + + private void importFiles(File directory) throws Exception, + SQLException { + this.connect(); + List<File> files = new ArrayList<>(); + files.add(directory); + while (!files.isEmpty()) { + File file = files.remove(0); + if (file.isDirectory()) { + files.addAll(Arrays.asList(file.listFiles())); + } else { + this.importFile(file); + } + } + this.disconnect(); + } + + private Connection connection; + + PreparedStatement psFiles; + + PreparedStatement psResourcesSelect; + + PreparedStatement psResourcesInsert; + + PreparedStatement psRequests; + + void connect() throws SQLException { + this.connection = DriverManager.getConnection( + "jdbc:postgresql://localhost/webstats?user=vagrant&password=vagrant"); + this.connection.setAutoCommit(false); + this.psFiles = this.connection.prepareStatement("INSERT INTO files " + + "(server, site, log_date) VALUES (?, ?, ?)", + Statement.RETURN_GENERATED_KEYS); + this.psResourcesSelect = this.connection.prepareStatement( + "SELECT resource_id FROM resources WHERE resource_string = ?"); + this.psResourcesInsert = this.connection.prepareStatement( + "INSERT INTO resources (resource_string) VALUES (?)", + Statement.RETURN_GENERATED_KEYS); + this.psRequests = this.connection.prepareStatement("INSERT INTO requests " + + "(file_id, method, resource_id, response_code, count," + + " total_bytes_sent) VALUES (?, CAST(? AS method), ?, ?, ?, ?)"); + } + + void disconnect() throws SQLException { + this.connection.close(); + } + + final Pattern fileNamePattern = + Pattern.compile("^(.+)-access.log-(\\d{8}).xz$"); + + DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd"); + + final Pattern logLinePattern = Pattern.compile("^0.0.0.[01] - - " + + "\\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] " + + "\"(GET|HEAD) ([^ ]+) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" -$"); + + private void importFile(File file) throws Exception { // TODO catch exceptions! + String server = file.getParentFile().getName(); + if (file.getName().contains("-ssl-access.log-")) { + System.out.println("Skipping file: " + file.getAbsolutePath()); + return; + } + Matcher fileNameMatcher = this.fileNamePattern.matcher(file.getName()); + if (!fileNameMatcher.matches()) { + System.err.println("Invalid file name: " + file.getAbsolutePath()); + return; + } + String site = fileNameMatcher.group(1); + long logDateMillis = this.dateFormat.parse(fileNameMatcher.group(2)) + .getTime(); + int fileId = writeFile(server, site, logDateMillis); + BufferedReader br = new BufferedReader(new InputStreamReader( + new XZCompressorInputStream(new FileInputStream(file)))); + String line; + SortedMap<String, long[]> requests = new TreeMap<>(); + while ((line = br.readLine()) != null) { + Matcher logLineMatcher = this.logLinePattern.matcher(line); + if (!logLineMatcher.matches()) { + System.err.println("Invalid line: " + line); + this.connection.rollback(); // TODO reconsider + br.close(); + return; + } + String method = logLineMatcher.group(1); + String resource = this.truncateString(logLineMatcher.group(2), 2048); + String responseCodeString = logLineMatcher.group(3); + String combined = String.format("%s %s %s", method, resource, + responseCodeString); + long bytesSent = logLineMatcher.group(4).equals("-") ? 0L + : Long.parseLong(logLineMatcher.group(4)); + long[] request = requests.get(combined); + if (request == null) { + request = new long[] { 1L, bytesSent }; + } else { + request[0]++; + request[1] += bytesSent; + } + requests.put(combined, request); + } + for (Map.Entry<String, long[]> request : requests.entrySet()) { + String[] keyParts = request.getKey().split(" "); + String method = keyParts[0]; + String resource = keyParts[1]; + int responseCode = Integer.parseInt(keyParts[2]); + long[] valueParts = request.getValue(); + int count = (int) valueParts[0]; + long totalBytesSent = valueParts[1]; + this.writeRequest(fileId, method, resource, responseCode, count, + totalBytesSent); + } + br.close(); + this.connection.commit(); + } + + int writeFile(String server, String site, long logDateMillis) + throws Exception { + int fileId = -1; + this.psFiles.clearParameters(); + server = this.truncateString(server, 32); + this.psFiles.setString(1, server); + site = this.truncateString(site, 128); + this.psFiles.setString(2, site); + this.psFiles.setDate(3, new Date(logDateMillis)); + this.psFiles.execute(); + ResultSet resultSet = psFiles.getGeneratedKeys(); + if (resultSet.next()) { + fileId = resultSet.getInt(1); + } + resultSet.close(); + return fileId; + } + + void writeRequest(int fileId, String method, String resource, + int responseCode, int count, long totalBytesSent) throws Exception { + System.out.printf("Writing request to database: %d, %s, %s, %d, %d, %d%n", + fileId, method, resource, responseCode, count, totalBytesSent); + int resourceId = -1; + this.psResourcesSelect.clearParameters(); + this.psResourcesSelect.setString(1, resource); + ResultSet rs = this.psResourcesSelect.executeQuery(); + if (rs.next()) { + resourceId = rs.getInt(1); + } else { + /* There's a small potential for a race condition between the previous + * SELECT and this INSERT INTO, but that will be resolved by the UNIQUE + * constraint when committing the transaction. */ + this.psResourcesInsert.clearParameters(); + this.psResourcesInsert.setString(1, resource); + this.psResourcesInsert.execute(); + ResultSet resultSet = psResourcesInsert.getGeneratedKeys(); + if (resultSet.next()) { + resourceId = resultSet.getInt(1); + } else { + throw new Exception("Could not retrieve auto-generated key for new " + + "resources entry."); // TODO better error handling + } + resultSet.close(); + } + this.psRequests.clearParameters(); + this.psRequests.setInt(1, fileId); + this.psRequests.setString(2, method); + this.psRequests.setInt(3, resourceId); + this.psRequests.setInt(4, responseCode); + this.psRequests.setInt(5, count); + this.psRequests.setLong(6, totalBytesSent); + this.psRequests.execute(); + } + + private String truncateString(String originalString, int truncateAfter) { + if (originalString.length() > truncateAfter) { + System.err.println("String too long, truncating: " + originalString); + originalString = originalString.substring(0, truncateAfter); + } + return originalString; + } +} + diff --git a/task-20008/webstats.sql b/task-20008/webstats.sql new file mode 100644 index 0000000..91a36b6 --- /dev/null +++ b/task-20008/webstats.sql @@ -0,0 +1,28 @@ +-- Copyright 2016 The Tor Project +-- See LICENSE for licensing information + +CREATE TYPE method AS ENUM ('GET', 'HEAD'); + +CREATE TABLE files ( + file_id serial PRIMARY KEY, + server character varying(32) NOT NULL, + site character varying(128) NOT NULL, + log_date date NOT NULL, + UNIQUE (server, site, log_date) +); + +CREATE TABLE resources ( + resource_id serial PRIMARY KEY, + resource_string character varying(2048) UNIQUE NOT NULL +); + +CREATE TABLE requests ( + file_id integer REFERENCES files (file_id), + method method NOT NULL, + resource_id integer REFERENCES resources (resource_id), + response_code smallint NOT NULL, + count integer NOT NULL, + total_bytes_sent bigint NOT NULL, + UNIQUE (file_id, method, resource_id, response_code) +); +