commit 1c0ec1e13a507baa9621156645d2dc28d85c8748
Author: Karsten Loesing <karsten.loesing@gmx.net>
Date:   Mon Jan 23 20:09:38 2017 +0100
Add webstats module and webstats-tb graph.
Implements #21236.
---
 modules/webstats/.gitignore                         |   3 +
 modules/webstats/build.xml                          |  27 ++
 .../java/org/torproject/metrics/webstats/Main.java  | 406 +++++++++++++++++++++
 .../webstats/src/main/resources/init-webstats.sql   | 164 +++++++++
 modules/webstats/src/main/resources/write-RData.R   |   9 +
 .../org/torproject/metrics/webstats/MainTest.java   |  93 +++++
 shared/bin/90-run-webstats-stats.sh                 |  10 +
 shared/bin/99-copy-stats-files.sh                   |   2 +
 shared/build-base.xml                               |  42 +++
 shared/build.xml                                    |   2 +-
 website/etc/categories.json                         |   5 +-
 website/etc/metrics.json                            |  14 +
 website/etc/web.xml                                 |   4 +
 website/rserve/graphs.R                             |  31 ++
 .../metrics/web/research/ResearchStatsServlet.java  |   1 +
 website/web/WEB-INF/sources.jsp                     |   6 +
 website/web/WEB-INF/stats.jsp                       |  34 +-
 17 files changed, 849 insertions(+), 4 deletions(-)
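[Editor's note, not part of the commit: for orientation, here is a minimal, self-contained sketch of how the new module's LOG_LINE_PATTERN matches the sanitized Apache log lines and counts them per method, resource, and response code, roughly as Main.parseLogLines() in the diff below does. The two sample lines are the MainTest fixtures further down; the Java string escaping is restored by hand here because the list rendering stripped backslashes and quotes, so treat the exact literals as an approximation.]

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogLineSketch {

  /* Same shape as Main.LOG_LINE_PATTERN in this commit, with string
   * escaping reconstructed. */
  static final Pattern LOG_LINE_PATTERN = Pattern.compile(
      "^0.0.0.[01] - - \\[\\d{2}/\\w{3}/\\d{4}:00:00:00 \\+0000\\] "
      + "\"(GET|HEAD) ([^ ]+) HTTP[^ ]+\" (\\d+) (-|\\d+) \"-\" \"-\" -$");

  public static void main(String[] args) {
    /* Sample sanitized log lines, taken from MainTest below. */
    String[] sampleLines = new String[] {
        "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
            + "\"GET / HTTP/1.0\" 200 10532 \"-\" \"-\" -",
        "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] "
            + "\"HEAD /bubbles.html HTTP/1.1\" 200 - \"-\" \"-\" -" };
    /* Count requests per (method, resource, response code) combination. */
    Map<String, Integer> parsedLogLines = new HashMap<>();
    for (String line : sampleLines) {
      Matcher matcher = LOG_LINE_PATTERN.matcher(line);
      if (!matcher.matches()) {
        continue;
      }
      String combined = String.format("%s %s %s", matcher.group(1),
          matcher.group(2), matcher.group(3));
      if (!parsedLogLines.containsKey(combined)) {
        parsedLogLines.put(combined, 1);
      } else {
        parsedLogLines.put(combined, parsedLogLines.get(combined) + 1);
      }
    }
    /* Prints one entry per combination, here "GET / 200" -> 1 and
     * "HEAD /bubbles.html 200" -> 1 (map order unspecified). */
    System.out.println(parsedLogLines);
  }
}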
diff --git a/modules/webstats/.gitignore b/modules/webstats/.gitignore new file mode 100644 index 0000000..a8e4d02 --- /dev/null +++ b/modules/webstats/.gitignore @@ -0,0 +1,3 @@ +/stats/*.csv +/RData/*.RData + diff --git a/modules/webstats/build.xml b/modules/webstats/build.xml new file mode 100644 index 0000000..bcfe251 --- /dev/null +++ b/modules/webstats/build.xml @@ -0,0 +1,27 @@ +<project default="run" name="webstats" basedir="."> + + <property name="sources" value="src/main/java"/> + <property name="testsources" value="src/test/java"/> + + <include file="../../shared/build-base.xml" as="basetask"/> + <target name="clean" depends="basetask.clean"/> + <target name="compile" depends="basetask.compile"/> + <target name="test" depends="basetask.test"/> + + <path id="classpath"> + <pathelement path="${classes}"/> + <path refid="base.classpath" /> + <fileset dir="${libs}"> + <include name="postgresql-jdbc3-9.2.jar"/> + </fileset> + </path> + + <target name="run" depends="compile"> + <java fork="true" + maxmemory="1g" + classname="org.torproject.metrics.webstats.Main"> + <classpath refid="classpath"/> + </java> + </target> +</project> + diff --git a/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java new file mode 100644 index 0000000..b6e2f96 --- /dev/null +++ b/modules/webstats/src/main/java/org/torproject/metrics/webstats/Main.java @@ -0,0 +1,406 @@ +/* Copyright 2016--2017 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.webstats; + +import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.sql.Connection; +import java.sql.Date; +import java.sql.DriverManager; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.SortedSet; +import java.util.TimeZone; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** Main class of the webstats module that downloads log files from the server, + * imports them into a database, and exports aggregate statistics to a CSV + * file. */ +public class Main { + + /** Logger for this class. */ + private static Logger log = LoggerFactory.getLogger(Main.class); + + /** Pattern for links contained in directory listings. 
*/ + static final Pattern URL_STRING_PATTERN = + Pattern.compile(".*<a href="([^"]+)">.*"); + + static final Pattern LOG_FILE_URL_PATTERN = + Pattern.compile("^.*/([^/]+)/([^/]+)-access.log-(\d{8}).xz$"); + + private static DateFormat logDateFormat; + + static { + logDateFormat = new SimpleDateFormat("yyyyMMdd"); + logDateFormat.setLenient(false); + logDateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + } + + static final Pattern LOG_LINE_PATTERN = Pattern.compile( + "^0.0.0.[01] - - \[\d{2}/\w{3}/\d{4}:00:00:00 \+0000\] " + + ""(GET|HEAD) ([^ ]+) HTTP[^ ]+" (\d+) (-|\d+) "-" "-" -$"); + + private static final String LOG_DATE = "log_date"; + + private static final String REQUEST_TYPE = "request_type"; + + private static final String PLATFORM = "platform"; + + private static final String CHANNEL = "channel"; + + private static final String LOCALE = "locale"; + + private static final String INCREMENTAL = "incremental"; + + private static final String COUNT = "count"; + + private static final String ALL_COLUMNS = LOG_DATE + "," + REQUEST_TYPE + "," + + PLATFORM + "," + CHANNEL + "," + LOCALE + "," + INCREMENTAL + "," + + COUNT; + + /** Executes this data-processing module. */ + public static void main(String[] args) throws Exception { + log.info("Starting webstats module."); + String dbUrlString = "jdbc:postgresql:webstats"; + Connection connection = connectToDatabase(dbUrlString); + SortedSet<String> previouslyImportedLogFileUrls = + queryImportedFiles(connection); + String baseUrl = "https://webstats.torproject.org/out/"; + SortedSet<String> newLogFileUrls = downloadDirectoryListings(baseUrl, + previouslyImportedLogFileUrls); + importLogFiles(connection, newLogFileUrls); + SortedSet<String> statistics = queryWebstats(connection); + writeStatistics(Paths.get("stats", "webstats.csv"), statistics); + disconnectFromDatabase(connection); + log.info("Terminated webstats module."); + } + + private static Connection connectToDatabase(String jdbcString) + throws SQLException { + log.info("Connecting to database."); + Connection connection = DriverManager.getConnection(jdbcString); + connection.setAutoCommit(false); + log.info("Successfully connected to database."); + return connection; + } + + static SortedSet<String> queryImportedFiles(Connection connection) + throws SQLException { + log.info("Querying URLs of previously imported log files."); + SortedSet<String> importedLogFileUrls = new TreeSet<>(); + Statement st = connection.createStatement(); + String queryString = "SELECT url FROM files"; + try (ResultSet rs = st.executeQuery(queryString)) { + while (rs.next()) { + importedLogFileUrls.add(rs.getString(1)); + } + } + log.info("Found {} URLs of previously imported log files.", + importedLogFileUrls.size()); + return importedLogFileUrls; + } + + static SortedSet<String> downloadDirectoryListings(String baseUrl, + SortedSet<String> importedLogFileUrls) throws IOException { + log.info("Downloading directory listings from {}.", baseUrl); + List<String> directoryListings = new ArrayList<>(); + directoryListings.add(baseUrl); + SortedSet<String> newLogFileUrls = new TreeSet<>(); + while (!directoryListings.isEmpty()) { + String urlString = directoryListings.remove(0); + if (urlString.endsWith("/")) { + directoryListings.addAll(downloadDirectoryListing(urlString)); + } else if (!urlString.endsWith(".xz")) { + log.debug("Skipping unrecognized URL {}.", urlString); + } else if (!importedLogFileUrls.contains(urlString)) { + newLogFileUrls.add(urlString); + } + } + log.info("Found {} URLs of log files 
that have not yet been imported.", + newLogFileUrls.size()); + return newLogFileUrls; + } + + static List<String> downloadDirectoryListing(String urlString) + throws IOException { + log.debug("Downloading directory listing from {}.", urlString); + List<String> urlStrings = new ArrayList<>(); + try (BufferedReader br = new BufferedReader(new InputStreamReader( + new URL(urlString).openStream()))) { + String line; + while ((line = br.readLine()) != null) { + Matcher matcher = URL_STRING_PATTERN.matcher(line); + if (matcher.matches() && !matcher.group(1).startsWith("/")) { + urlStrings.add(urlString + matcher.group(1)); + } + } + } + return urlStrings; + } + + static void importLogFiles(Connection connection, + SortedSet<String> newLogFileUrls) { + log.info("Downloading, parsing, and importing {} log files.", + newLogFileUrls.size()); + for (String urlString : newLogFileUrls) { + try { + Object[] metaData = parseMetaData(urlString); + if (metaData == null) { + continue; + } + List<String> downloadedLogLines = downloadLogFile(urlString); + Map<String, Integer> parsedLogLines = parseLogLines(urlString, + downloadedLogLines); + importLogLines(connection, urlString, metaData, parsedLogLines); + } catch (IOException | ParseException exc) { + log.warn("Cannot download or parse log file with URL {}. Retrying " + + "in the next run.", urlString, exc); + } catch (SQLException exc) { + log.warn("Cannot import log file with URL {} into the database. " + + "Rolling back and retrying in the next run.", urlString, exc); + try { + connection.rollback(); + } catch (SQLException exceptionWhileRollingBack) { + /* Ignore. */ + } + } + } + } + + private static Object[] parseMetaData(String urlString) + throws ParseException { + log.debug("Importing log file {}.", urlString); + if (urlString.contains("-ssl-access.log-")) { + log.debug("Skipping log file containing SSL requests with URL {}.", + urlString); + return null; + } + Matcher logFileUrlMatcher = LOG_FILE_URL_PATTERN.matcher(urlString); + if (!logFileUrlMatcher.matches()) { + log.debug("Skipping log file with unrecognized URL {}.", urlString); + return null; + } + String server = logFileUrlMatcher.group(1); + String site = logFileUrlMatcher.group(2); + long logDateMillis = logDateFormat.parse(logFileUrlMatcher.group(3)) + .getTime(); + return new Object[] { server, site, new Long(logDateMillis) }; + } + + static List<String> downloadLogFile(String urlString) throws IOException { + List<String> downloadedLogLines = new ArrayList<>(); + try (BufferedReader br = new BufferedReader(new InputStreamReader( + new XZCompressorInputStream(new URL(urlString).openStream())))) { + String line; + while ((line = br.readLine()) != null) { + downloadedLogLines.add(line); + } + } + return downloadedLogLines; + } + + static Map<String, Integer> parseLogLines(String urlString, + List<String> logLines) { + int skippedLines = 0; + Map<String, Integer> parsedLogLines = new HashMap<>(); + for (String logLine : logLines) { + Matcher logLineMatcher = LOG_LINE_PATTERN.matcher(logLine); + if (!logLineMatcher.matches()) { + skippedLines++; + continue; + } + String method = logLineMatcher.group(1); + String resource = logLineMatcher.group(2); + int responseCode = Integer.parseInt(logLineMatcher.group(3)); + String combined = String.format("%s %s %d", method, resource, + responseCode); + if (!parsedLogLines.containsKey(combined)) { + parsedLogLines.put(combined, 1); + } else { + parsedLogLines.put(combined, parsedLogLines.get(combined) + 1); + } + } + if (skippedLines > 0) { + 
log.debug("Skipped {} lines while parsing log file {}.", skippedLines, + urlString); + } + return parsedLogLines; + } + + private static void importLogLines(Connection connection, String urlString, + Object[] metaData, Map<String, Integer> parsedLogLines) + throws SQLException { + PreparedStatement psFiles = connection.prepareStatement( + "INSERT INTO files (url, server, site, " + LOG_DATE + ") " + + "VALUES (?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS); + PreparedStatement psResourcesSelect = connection.prepareStatement( + "SELECT resource_id FROM resources WHERE resource_string = ?"); + PreparedStatement psResourcesInsert = connection.prepareStatement( + "INSERT INTO resources (resource_string) VALUES (?)", + Statement.RETURN_GENERATED_KEYS); + PreparedStatement psRequests = connection.prepareStatement( + "INSERT INTO requests (file_id, method, resource_id, response_code, " + + COUNT + ") VALUES (?, CAST(? AS method), ?, ?, ?)"); + String server = (String) metaData[0]; + String site = (String) metaData[1]; + long logDateMillis = (long) metaData[2]; + int fileId = insertFile(psFiles, urlString, server, site, logDateMillis); + if (fileId < 0) { + log.debug("Skipping previously imported log file {}.", urlString); + return; + } + for (Map.Entry<String, Integer> requests : parsedLogLines.entrySet()) { + String[] keyParts = requests.getKey().split(" "); + String method = keyParts[0]; + String resource = keyParts[1]; + int responseCode = Integer.parseInt(keyParts[2]); + int count = requests.getValue(); + int resourceId = insertResource(psResourcesSelect, psResourcesInsert, + resource); + if (resourceId < 0) { + log.error("Could not retrieve auto-generated key for new resources " + + "entry."); + connection.rollback(); + return; + } + insertRequest(psRequests, fileId, method, resourceId, responseCode, + count); + } + connection.commit(); + log.debug("Finished importing log file with URL {} into database.", + urlString); + } + + private static int insertFile(PreparedStatement psFiles, String urlString, + String server, String site, long logDateMillis) throws SQLException { + int fileId = -1; + psFiles.clearParameters(); + psFiles.setString(1, truncateString(urlString, 2048)); + psFiles.setString(2, truncateString(server, 32)); + psFiles.setString(3, truncateString(site, 128)); + psFiles.setDate(4, new Date(logDateMillis)); + psFiles.execute(); + try (ResultSet rs = psFiles.getGeneratedKeys()) { + if (rs.next()) { + fileId = rs.getInt(1); + } + } + return fileId; + } + + private static void insertRequest(PreparedStatement psRequests, int fileId, + String method, int resourceId, int responseCode, int count) + throws SQLException { + psRequests.clearParameters(); + psRequests.setInt(1, fileId); + psRequests.setString(2, method); + psRequests.setInt(3, resourceId); + psRequests.setInt(4, responseCode); + psRequests.setInt(5, count); + psRequests.execute(); + } + + private static int insertResource(PreparedStatement psResourcesSelect, + PreparedStatement psResourcesInsert, String resource) + throws SQLException { + int resourceId = -1; + String truncatedResource = truncateString(resource, 2048); + psResourcesSelect.clearParameters(); + psResourcesSelect.setString(1, truncatedResource); + try (ResultSet rs = psResourcesSelect.executeQuery()) { + if (rs.next()) { + resourceId = rs.getInt(1); + } + } + if (resourceId < 0) { + /* There's a small potential for a race condition between the previous + * SELECT and this INSERT INTO, but that will be resolved by the UNIQUE + * constraint when committing the 
transaction. */ + psResourcesInsert.clearParameters(); + psResourcesInsert.setString(1, truncatedResource); + psResourcesInsert.execute(); + try (ResultSet rs = psResourcesInsert.getGeneratedKeys()) { + if (rs.next()) { + resourceId = rs.getInt(1); + } + } + } + return resourceId; + } + + private static String truncateString(String originalString, + int truncateAfter) { + if (originalString.length() > truncateAfter) { + originalString = originalString.substring(0, truncateAfter); + } + return originalString; + } + + static SortedSet<String> queryWebstats(Connection connection) + throws SQLException { + log.info("Querying statistics from database."); + SortedSet<String> statistics = new TreeSet<>(); + Statement st = connection.createStatement(); + String queryString = "SELECT " + ALL_COLUMNS + " FROM webstats"; + DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd", Locale.US); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + try (ResultSet rs = st.executeQuery(queryString)) { + while (rs.next()) { + statistics.add(String.format("%s,%s,%s,%s,%s,%s,%d", + dateFormat.format(rs.getDate(LOG_DATE)), + emptyNull(rs.getString(REQUEST_TYPE)), + emptyNull(rs.getString(PLATFORM)), + emptyNull(rs.getString(CHANNEL)), + emptyNull(rs.getString(LOCALE)), + emptyNull(rs.getString(INCREMENTAL)), + rs.getLong(COUNT))); + } + } + return statistics; + } + + private static String emptyNull(String text) { + return null == text ? "" : text; + } + + static void writeStatistics(Path webstatsPath, + SortedSet<String> statistics) throws IOException { + webstatsPath.toFile().getParentFile().mkdirs(); + List<String> lines = new ArrayList<String>(); + lines.add(ALL_COLUMNS); + lines.addAll(statistics); + log.info("Writing {} lines to {}.", lines.size(), + webstatsPath.toFile().getAbsolutePath()); + Files.write(webstatsPath, lines, StandardCharsets.UTF_8); + } + + private static void disconnectFromDatabase(Connection connection) + throws SQLException { + log.info("Disconnecting from database."); + connection.close(); + } +} + diff --git a/modules/webstats/src/main/resources/init-webstats.sql b/modules/webstats/src/main/resources/init-webstats.sql new file mode 100644 index 0000000..98bb758 --- /dev/null +++ b/modules/webstats/src/main/resources/init-webstats.sql @@ -0,0 +1,164 @@ +-- Copyright 2016--2017 The Tor Project +-- See LICENSE for licensing information + +CREATE TYPE method AS ENUM ('GET', 'HEAD'); + +CREATE TABLE files ( + file_id SERIAL PRIMARY KEY, + url CHARACTER VARYING(2048) UNIQUE NOT NULL, + server CHARACTER VARYING(32) NOT NULL, + site CHARACTER VARYING(128) NOT NULL, + log_date DATE NOT NULL, + UNIQUE (server, site, log_date) +); + +CREATE TABLE resources ( + resource_id SERIAL PRIMARY KEY, + resource_string CHARACTER VARYING(2048) UNIQUE NOT NULL +); + +CREATE TABLE requests ( + file_id INTEGER REFERENCES files (file_id) NOT NULL, + method METHOD NOT NULL, + resource_id INTEGER REFERENCES resources (resource_id) NOT NULL, + response_code SMALLINT NOT NULL, + count INTEGER NOT NULL, + UNIQUE (file_id, method, resource_id, response_code) +); + +CREATE OR REPLACE VIEW webstats AS + SELECT log_date, + CASE WHEN resource_string LIKE '%.asc' THEN 'tbsd' + ELSE 'tbid' END AS request_type, + CASE WHEN resource_string LIKE '%.exe%' THEN 'w' + WHEN resource_string LIKE '%.dmg%' THEN 'm' + WHEN resource_string LIKE '%.tar.xz%' THEN 'l' + ELSE 'o' END AS platform, + CASE WHEN resource_string LIKE '%-hardened%' THEN 'h' + WHEN resource_string LIKE '%/%.%a%/%' THEN 'a' + ELSE 'r' END AS channel, + 
COALESCE(SUBSTRING(resource_string + FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[._-].*'), '??') AS locale, + NULL::BOOLEAN AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE (resource_string LIKE '%/torbrowser/%.exe' + OR resource_string LIKE '%/torbrowser/%.dmg' + OR resource_string LIKE '%/torbrowser/%.tar.xz' + OR resource_string LIKE '%/torbrowser/%.exe.asc' + OR resource_string LIKE '%/torbrowser/%.dmg.asc' + OR resource_string LIKE '%/torbrowser/%.tar.xz.asc') + AND response_code = 200 + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental + UNION + SELECT log_date, + 'tbup' AS request_type, + CASE WHEN resource_string LIKE '%/WINNT%' THEN 'w' + WHEN resource_string LIKE '%/Darwin%' THEN 'm' + ELSE 'l' END AS platform, + CASE WHEN resource_string LIKE '%/hardened/%' THEN 'h' + WHEN resource_string LIKE '%/alpha/%' THEN 'a' + WHEN resource_string LIKE '%/release/%' THEN 'r' + ELSE 'o' END AS channel, + COALESCE(SUBSTRING(resource_string + FROM '.*/([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})??$'), '??') AS locale, + NULL::BOOLEAN AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE resource_string LIKE '%/torbrowser/update_2/%' + AND resource_string NOT LIKE '%.xml' + AND response_code = 200 + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental + UNION + SELECT log_date, + 'tbur' AS request_type, + CASE WHEN resource_string LIKE '%-win32-%' THEN 'w' + WHEN resource_string LIKE '%-osx%' THEN 'm' + ELSE 'l' END AS platform, + CASE WHEN resource_string LIKE '%-hardened%' THEN 'h' + WHEN resource_string LIKE '%/%.%a%/%' THEN 'a' + ELSE 'r' END AS channel, + COALESCE(SUBSTRING(resource_string + FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[._-].*'), '??') AS locale, + CASE WHEN resource_string LIKE '%.incremental.%' THEN TRUE + ELSE FALSE END AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE resource_string LIKE '%/torbrowser/%.mar' + AND response_code = 302 + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental + UNION + SELECT log_date, + 'tmid' AS request_type, + CASE WHEN resource_string LIKE '%.exe' THEN 'w' + WHEN resource_string LIKE '%.dmg' THEN 'm' + WHEN resource_string LIKE '%.tar.xz' THEN 'l' + ELSE 'o' END AS platform, + NULL AS channel, + COALESCE(SUBSTRING(resource_string + FROM '.*_([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})[._-].*'), '??') AS locale, + NULL::BOOLEAN AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE (resource_string LIKE '%/tormessenger/%.exe' + OR resource_string LIKE '%/tormessenger/%.dmg' + OR resource_string LIKE '%/tormessenger/%.tar.xz') + AND response_code = 200 + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental + UNION + SELECT log_date, + 'tmup' AS request_type, + CASE WHEN resource_string LIKE '%/WINNT%' THEN 'w' + WHEN resource_string LIKE '%/Darwin%' THEN 'm' + WHEN resource_string LIKE '%/Linux%' THEN 'l' + ELSE 'o' END AS platform, + NULL AS channel, + COALESCE(SUBSTRING(resource_string + FROM '.*/([a-zA-Z]{2}|[a-zA-Z]{2}-[a-zA-Z]{2})??$'), '??') AS locale, + NULL::BOOLEAN AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE resource_string LIKE '%/tormessenger/update_2/%' + AND resource_string NOT LIKE '%.xml' + AND 
resource_string NOT LIKE '%/' + AND resource_string NOT LIKE '%/?' + AND response_code = 200 + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental + UNION + SELECT log_date, + 'twhph' AS request_type, + NULL AS platform, + NULL AS channel, + NULL AS locale, + NULL::BOOLEAN AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE (resource_string = '/' + OR resource_string LIKE '/index%') + AND response_code = 200 + AND (site = 'torproject.org' + OR site = 'www.torproject.org') + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental + UNION + SELECT log_date, + 'twdph' AS request_type, + NULL AS platform, + NULL AS channel, + NULL AS locale, + NULL::BOOLEAN AS incremental, + SUM(count) AS count + FROM files NATURAL JOIN requests NATURAL JOIN resources + WHERE (resource_string LIKE '/download/download%' + OR resource_string LIKE '/projects/torbrowser.html%') + AND response_code = 200 + AND (site = 'torproject.org' + OR site = 'www.torproject.org') + AND method = 'GET' + GROUP BY log_date, request_type, platform, channel, locale, incremental; + diff --git a/modules/webstats/src/main/resources/write-RData.R b/modules/webstats/src/main/resources/write-RData.R new file mode 100644 index 0000000..2cb8917 --- /dev/null +++ b/modules/webstats/src/main/resources/write-RData.R @@ -0,0 +1,9 @@ +dir.create("RData", showWarnings = FALSE) + +d <- read.csv("stats/webstats.csv", stringsAsFactors = FALSE) +d <- d[d$request_type %in% c('tbid', 'tbsd', 'tbup', 'tbur'), ] +data <- aggregate(list(count = d$count), + by = list(log_date = as.Date(d$log_date), request_type = d$request_type), + FUN = sum) +save(data, file = "RData/webstats-tb.RData") + diff --git a/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java new file mode 100644 index 0000000..1c4f0bc --- /dev/null +++ b/modules/webstats/src/test/java/org/torproject/metrics/webstats/MainTest.java @@ -0,0 +1,93 @@ +/* Copyright 2017 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.webstats; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +import java.util.regex.Matcher; + +public class MainTest { + + static final String SAMPLE_LOG_FILE_NAME = + "metrics.torproject.org-access.log-20170117.xz"; + + static final String SAMPLE_SUBDIRECTORY_NAME = "meronense.torproject.org/"; + + static final String SAMPLE_LOG_FILE_URL = + "https://webstats.torproject.org/out/meronense.torproject.org/" + + "metrics.torproject.org-access.log-20170117.xz"; + + static final String[] SAMPLE_LOG_LINES = new String[] { + "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] " + + ""GET / HTTP/1.0" 200 10532 "-" "-" -", + "0.0.0.0 - - [17/Jan/2017:00:00:00 +0000] " + + ""HEAD /bubbles.html HTTP/1.1" 200 - "-" "-" -" + }; + + @Test + public void testUrlStringPatternComplete() { + Matcher matcher = Main.URL_STRING_PATTERN.matcher( + "<img src="/icons/unknown.gif" alt="[ ]"> " + + "<a href="" + SAMPLE_LOG_FILE_NAME + "">" + SAMPLE_LOG_FILE_NAME + + "</a> 2017-01-19 19:43 5.6K "); + assertTrue(matcher.matches()); + assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1)); + } + + @Test + public void testUrlStringPatternOnlyATag() { + Matcher matcher = Main.URL_STRING_PATTERN.matcher("<a href="" + + SAMPLE_LOG_FILE_NAME + "">" + SAMPLE_LOG_FILE_NAME + "</a>"); + 
assertTrue(matcher.matches()); + assertEquals(SAMPLE_LOG_FILE_NAME, matcher.group(1)); + } + + @Test + public void testUrlStringPatternSubdirectory() { + Matcher matcher = Main.URL_STRING_PATTERN.matcher( + "<a href="" + SAMPLE_SUBDIRECTORY_NAME + "">" + + SAMPLE_SUBDIRECTORY_NAME + "/</a>"); + assertTrue(matcher.matches()); + assertEquals(SAMPLE_SUBDIRECTORY_NAME, matcher.group(1)); + } + + @Test + public void testUrlStringPatternAnythingBetweenDoubleQuotesHtml() { + Matcher matcher = Main.URL_STRING_PATTERN.matcher( + "<a href="anything-between-double-quotes.html">Link/</a>"); + assertTrue(matcher.matches()); + assertEquals("anything-between-double-quotes.html", matcher.group(1)); + } + + @Test + public void testLogFileUrlPatternComplete() { + Matcher matcher = Main.LOG_FILE_URL_PATTERN.matcher(SAMPLE_LOG_FILE_URL); + assertTrue(matcher.matches()); + assertEquals("meronense.torproject.org", matcher.group(1)); + assertEquals("metrics.torproject.org", matcher.group(2)); + assertEquals("20170117", matcher.group(3)); + } + + @Test + public void testLogLinePatternGetSlash() { + Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[0]); + assertTrue(matcher.matches()); + assertEquals("GET", matcher.group(1)); + assertEquals("/", matcher.group(2)); + assertEquals("200", matcher.group(3)); + } + + @Test + public void testLogLinePatternHeadBubbles() { + Matcher matcher = Main.LOG_LINE_PATTERN.matcher(SAMPLE_LOG_LINES[1]); + assertTrue(matcher.matches()); + assertEquals("HEAD", matcher.group(1)); + assertEquals("/bubbles.html", matcher.group(2)); + assertEquals("200", matcher.group(3)); + } +} + diff --git a/shared/bin/90-run-webstats-stats.sh b/shared/bin/90-run-webstats-stats.sh new file mode 100755 index 0000000..37091b4 --- /dev/null +++ b/shared/bin/90-run-webstats-stats.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cd modules/webstats/ + +ant run | grep "[java]" + +R --slave -f src/main/resources/write-RData.R > /dev/null 2>&1 + +cd ../../ + diff --git a/shared/bin/99-copy-stats-files.sh b/shared/bin/99-copy-stats-files.sh index d236630..a828686 100755 --- a/shared/bin/99-copy-stats-files.sh +++ b/shared/bin/99-copy-stats-files.sh @@ -6,7 +6,9 @@ cp -a modules/advbwdist/stats/advbwdist.csv shared/stats/ cp -a modules/hidserv/stats/hidserv.csv shared/stats/ cp -a modules/clients/stats/clients*.csv shared/stats/ cp -a modules/clients/stats/userstats-combined.csv shared/stats/ +cp -a modules/webstats/stats/webstats.csv shared/stats/
 mkdir -p shared/RData
 cp -a modules/clients/RData/*.RData shared/RData/
+cp -a modules/webstats/RData/*.RData shared/RData/
diff --git a/shared/build-base.xml b/shared/build-base.xml
index 7051f49..759e1d0 100644
--- a/shared/build-base.xml
+++ b/shared/build-base.xml
@@ -1,9 +1,11 @@
 <project basedir=".">

   <property name="sources" value="src"/>
+  <property name="testsources" value="src/test/java"/>
   <property name="libs" value="../../shared/lib"/>
   <property name="generated" value="generated"/>
   <property name="classes" value="${generated}/classes/"/>
+  <property name="testclasses" value="${generated}/test-classes/"/>
   <property name="source-and-target-java-version" value="1.7" />

   <property name="descriptorversion" value="1.5.0" />
@@ -21,6 +23,21 @@
     </fileset>
   </path>

+  <path id="base.testclasspath">
+    <pathelement path="${base.testclasses}"/>
+    <pathelement path="base.classpath"/>
+    <fileset dir="${libs}">
+      <include name="hamcrest-core-1.3.jar"/>
+      <include name="junit4-4.11.jar"/>
+    </fileset>
+  </path>
+
+  <path id="testclasspath">
+    <pathelement path="${testclasses}"/>
+    <path refid="base.testclasspath" />
+    <path refid="base.classpath" />
+  </path>
+
   <target name="clean">
     <delete includeEmptyDirs="true" quiet="true">
       <fileset dir="${generated}" defaultexcludes="false" includes="**" />
@@ -29,6 +46,7 @@

   <target name="init">
     <mkdir dir="${classes}"/>
+    <mkdir dir="${testclasses}"/>
   </target>

   <target name="compile" depends="init" >
@@ -45,6 +63,30 @@
     </javac>
   </target>

+  <target name="testcompile" depends="compile" >
+    <javac destdir="${testclasses}"
+           srcdir="${testsources}"
+           source="${source-and-target-java-version}"
+           target="${source-and-target-java-version}"
+           debug="true" debuglevel="lines,source"
+           deprecation="true"
+           optimize="false"
+           failonerror="true"
+           includeantruntime="false">
+      <classpath refid="testclasspath"/>
+    </javac>
+  </target>
+
+  <target name="test" depends="testcompile">
+    <junit fork="true" haltonfailure="true" printsummary="off">
+      <classpath refid="testclasspath"/>
+      <formatter type="plain" usefile="false"/>
+      <batchtest>
+        <fileset dir="${testclasses}"
+                 includes="**/*Test.class"/>
+      </batchtest>
+    </junit>
+  </target>

 </project>
diff --git a/shared/build.xml b/shared/build.xml index 13a09f7..cb51d5f 100644 --- a/shared/build.xml +++ b/shared/build.xml @@ -26,9 +26,9 @@ <fileset dir="../modules/clients/src" includes="**/*.java"/> <fileset dir="../modules/collectdescs/src" includes="**/*.java"/> <fileset dir="../modules/connbidirect/src" includes="**/*.java"/> - <fileset dir="../modules/disagreement/src" includes="**/*.java"/> <fileset dir="../modules/hidserv/src" includes="**/*.java"/> <fileset dir="../modules/legacy/src" includes="**/*.java"/> + <fileset dir="../modules/webstats/src" includes="**/*.java"/> <classpath> <path refid="checkstyle.classpath" /> </classpath> diff --git a/website/etc/categories.json b/website/etc/categories.json index 8b4ea77..6825634 100644 --- a/website/etc/categories.json +++ b/website/etc/categories.json @@ -78,6 +78,9 @@ "icon": "fa-download", "header": "Applications", "summary": "How many Tor applications, like Tor Browser, have been downloaded or updated.", - "metrics": [] + "description": "The following application statistics are based on the analysis of requests to <code>torproject.org</code> web servers.", + "metrics": [ + "webstats-tb" + ] } ] diff --git a/website/etc/metrics.json b/website/etc/metrics.json index f7666be..4a97ca1 100644 --- a/website/etc/metrics.json +++ b/website/etc/metrics.json @@ -402,5 +402,19 @@ "title": "Network churn rate by relay flag", "type": "Link", "description": "<p>This image shows the churn rate of the Tor network by <a href="glossary.html#relay-flag">relay flag</a> in a given month. The churn rate, a value in the interval <b>[0,1]</b>, captures the rate of <a href="glossary.html#relay">relays</a> joining and leaving the network from one <a href="glossary.html#consensus">consensus</a> to the next (that is, within one hour). The complete image gallery can be found on <a href="https://nymity.ch/sybilhunting/churn-values/%5C%22%3EPhilipp Winter's homepage</a>.</p><p><a href="https://nymity.ch/sybilhunting/churn-values/%5C%22%3E<img src="images/networkchurn.png" alt="Network churn rate by relay flag"></a></p>" + }, + { + "id": "webstats-tb", + "title": "Tor Browser downloads and updates", + "type": "Graph", + "description": "<p>This graph shows absolute numbers of requests to Tor's web servers by request type. It is based on data from <a href="https://webstats.torproject.org/%5C" target="_blank"><code>webstats.torproject.org</code></a> which collects logs from <code>torproject.org</code> web servers and provides them as a stripped-down version of Apache's "combined" log format without IP addresses, log times, HTTP parameters, referers, and user agent strings. <em>Initial downloads</em> and <em>signature downloads</em> are requests made by the user to download a Tor Browser executable or a corresponding signature file from the Tor website. <em>Update pings</em> and <em>update requests</em> are requests made by Tor Browser to check whether a newer version is available or to download a newer version.</p>", + "function": "plot_webstats_tb", + "parameters": [ + "start", + "end" + ], + "data": [ + "webstats" + ] } ] diff --git a/website/etc/web.xml b/website/etc/web.xml index 7444cf5..916984c 100644 --- a/website/etc/web.xml +++ b/website/etc/web.xml @@ -46,6 +46,7 @@ <url-pattern>/hidserv-dir-onions-seen.html</url-pattern> <url-pattern>/hidserv-rend-relayed-cells.html</url-pattern> <url-pattern>/hidserv-frac-reporting.html</url-pattern> + <url-pattern>/webstats-tb.html</url-pattern> </servlet-mapping>
   <servlet>
@@ -177,6 +178,9 @@
     <url-pattern>/hidserv-frac-reporting.png</url-pattern>
     <url-pattern>/hidserv-frac-reporting.pdf</url-pattern>
     <url-pattern>/hidserv-frac-reporting.svg</url-pattern>
+    <url-pattern>/webstats-tb.png</url-pattern>
+    <url-pattern>/webstats-tb.pdf</url-pattern>
+    <url-pattern>/webstats-tb.svg</url-pattern>
   </servlet-mapping>
   <servlet>

diff --git a/website/rserve/graphs.R b/website/rserve/graphs.R
index 9f8daa7..fc6cfb6 100644
--- a/website/rserve/graphs.R
+++ b/website/rserve/graphs.R
@@ -1095,3 +1095,34 @@ plot_hidserv_frac_reporting <- function(start, end, path) {
   ggsave(filename = path, width = 8, height = 5, dpi = 72)
 }
+plot_webstats_tb <- function(start, end, path) {
+  end <- min(end, as.character(Sys.Date() - 2))
+  load("/srv/metrics.torproject.org/metrics/shared/RData/webstats-tb.RData")
+  d <- data
+  d <- d[d$log_date >= start & d$log_date <= end, ]
+  date_breaks <- date_breaks(as.numeric(max(d$log_date) - min(d$log_date)))
+  d$request_type <- factor(d$request_type)
+  levels(d$request_type) <- list(
+    'Initial downloads' = 'tbid',
+    'Signature downloads' = 'tbsd',
+    'Update pings' = 'tbup',
+    'Update requests' = 'tbur')
+  formatter <- function(x, ...) {
+    format(x, ..., scientific = FALSE, big.mark = ' ') }
+  ggplot(d, aes(x = log_date, y = count)) +
+    geom_point() +
+    geom_line() +
+    expand_limits(y = 0) +
+    facet_grid(request_type ~ ., scales = "free_y") +
+    scale_x_date(name = paste("\nThe Tor Project - ",
+        "https://metrics.torproject.org/", sep = ""),
+      labels = date_format(date_breaks$format),
+      breaks = date_breaks$major,
+      minor_breaks = date_breaks$minor) +
+    scale_y_continuous(name = 'Requests per day\n', labels = formatter) +
+    theme(strip.text.y = element_text(angle = 0, hjust = 0, size = rel(1.5)),
+      strip.background = element_rect(fill = NA)) +
+    ggtitle("Tor Browser downloads and updates\n")
+  ggsave(filename = path, width = 8, height = 5, dpi = 72)
+}
+
diff --git a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
index 8f5c399..de5715e 100644
--- a/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
+++ b/website/src/org/torproject/metrics/web/research/ResearchStatsServlet.java
@@ -39,6 +39,7 @@ public class ResearchStatsServlet extends HttpServlet {
     this.availableStatisticsFiles.add("advbwdist");
     this.availableStatisticsFiles.add("hidserv");
     this.availableStatisticsFiles.add("disagreement");
+    this.availableStatisticsFiles.add("webstats");
   }
   @Override

diff --git a/website/web/WEB-INF/sources.jsp b/website/web/WEB-INF/sources.jsp
index 64ca85b..b36d43f 100644
--- a/website/web/WEB-INF/sources.jsp
+++ b/website/web/WEB-INF/sources.jsp
@@ -39,6 +39,12 @@
     </div>
     <div class="container">
+      <ul>
+        <li><a href="https://webstats.torproject.org/" target="_blank"><code>webstats.torproject.org</code></a> collects logs from <code>torproject.org</code> web servers and provides them as a stripped-down version of Apache's "combined" log format without IP addresses, log times, HTTP parameters, referers, and user agent strings.</li>
+      </ul>
+    </div>
+
+    <div class="container">
       <h2>Measurement tools <a href="#measurement" name="measurement" class="anchor">#</a></h2>
       <p>The following tools perform active measurements in the Tor network.</p>
       <ul>

diff --git a/website/web/WEB-INF/stats.jsp b/website/web/WEB-INF/stats.jsp
index fc676ba..fbecc0f 100644
--- a/website/web/WEB-INF/stats.jsp
+++ b/website/web/WEB-INF/stats.jsp
@@ -483,8 +483,38 @@
 given attribute.</li>
</ul>
-    </div>
-  </div>
+</div>
+
+<div class="container">
+<h2>Requests to <code>torproject.org</code> web servers <a href="#webstats" name="webstats" class="anchor">#</a></h2>
+
+<p>The following data file contains aggregate statistics on requests to <code>torproject.org</code> web servers.</p>
+
+<p><b>Download as <a href="stats/webstats.csv">CSV file</a>.</b></p>
+
+<p>The statistics file contains the following columns:</p>
+<ul>
+<li><b>log_date:</b> UTC date (YYYY-MM-DD) when requests to <code>torproject.org</code> web servers have been logged.</li>
+<li><b>request_type:</b> Request type with fixed identifiers as follows:
+<ul>
+<li><b>"tbid":</b> Tor Browser initial downloads: GET requests to all sites with resource strings <code>'%/torbrowser/%.exe'</code>, <code>'%/torbrowser/%.dmg'</code>, and <code>'%/torbrowser/%.tar.xz'</code> and response code 200.</li>
+<li><b>"tbsd":</b> Tor Browser signature downloads: GET requests to all sites with resource strings <code>'%/torbrowser/%.exe.asc'</code>, <code>'%/torbrowser/%.dmg.asc'</code>, and <code>'%/torbrowser/%.tar.xz.asc'</code> and response code 200.</li>
+<li><b>"tbup":</b> Tor Browser update pings: GET requests to all sites with resource strings <code>'%/torbrowser/update_2/%'</code> and response code 200.</li>
+<li><b>"tbur":</b> Tor Browser update requests: GET requests to all sites with resource strings <code>'%/torbrowser/%.mar'</code> and response code 302.</li>
+<li><b>"tmid":</b> Tor Messenger initial downloads: GET requests to all sites with resource strings <code>'%/tormessenger/%.exe'</code>, <code>'%/tormessenger/%.dmg'</code>, and <code>'%/tormessenger/%.tar.xz'</code> and response code 200.</li>
+<li><b>"tmup":</b> Tor Messenger update pings: GET requests to all sites with resource strings <code>'%/tormessenger/update_2/%'</code> and response code 200.</li>
+<li><b>"twhph":</b> Tor website home page hits: GET requests to sites <code>'torproject.org'</code> and <code>'www.torproject.org'</code> with resource strings <code>'/'</code> and <code>'/index%'</code> and response code 200.</li>
+<li><b>"twdph":</b> Tor website download page hits: GET requests to sites <code>'torproject.org'</code> and <code>'www.torproject.org'</code> with resource strings <code>'/download/download%'</code> and <code>'/projects/torbrowser.html%'</code> and response code 200.</li>
+</ul>
+</li>
+<li><b>platform:</b> Platform string, like <b>"w"</b> for Windows, <b>"m"</b> for macOS, <b>"l"</b> for Linux, <b>"o"</b> for other platforms, and the empty string for all platforms.</li>
+<li><b>channel:</b> Release channel, like <b>"r"</b> for stable releases, <b>"a"</b> for alpha releases, <b>"h"</b> for hardened releases, and the empty string for all channels.</li>
+<li><b>locale:</b> Locale, like <b>"en-US"</b> for English (United States), <b>"de"</b> for German, etc., <b>"??"</b> for unrecognized locales, and the empty string for all locales.</li>
+<li><b>incremental:</b> Incremental update, with <b>"t"</b> for incremental updates, <b>"f"</b> for non-incremental (full) updates, and the empty string for all update types.</li>
+<li><b>count:</b> Number of requests for the given request type, platform, etc.</li>
+</ul>
+
+</div>
<jsp:include page="bottom.jsp"/>
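[Editor's note, not part of the commit: as a closing illustration, a small Java sketch that consumes the exported stats/webstats.csv in the documented column order and sums Tor Browser request counts per day and request type, roughly what the new write-RData.R script does in R before the data reaches plot_webstats_tb. The file location and column order are taken from the commit above; the class name and everything else are illustrative assumptions.]

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

public class WebstatsCsvSketch {

  public static void main(String[] args) throws IOException {
    /* Tor Browser request types, as filtered by write-RData.R. */
    Set<String> tbRequestTypes = new HashSet<>(
        Arrays.asList("tbid", "tbsd", "tbup", "tbur"));
    /* Key: "log_date,request_type"; value: count summed over all
     * platforms, channels, locales, and incremental flags. */
    Map<String, Long> aggregated = new TreeMap<>();
    List<String> lines = Files.readAllLines(
        Paths.get("stats", "webstats.csv"), StandardCharsets.UTF_8);
    /* Skip the header line; columns are log_date, request_type, platform,
     * channel, locale, incremental, count. */
    for (String line : lines.subList(1, lines.size())) {
      String[] parts = line.split(",", -1);
      String requestType = parts[1];
      if (!tbRequestTypes.contains(requestType)) {
        continue;
      }
      String key = parts[0] + "," + requestType;
      long count = Long.parseLong(parts[6]);
      Long previous = aggregated.get(key);
      aggregated.put(key, previous == null ? count : previous + count);
    }
    /* Prints one "log_date,request_type,count" line per day and type. */
    for (Map.Entry<String, Long> entry : aggregated.entrySet()) {
      System.out.println(entry.getKey() + "," + entry.getValue());
    }
  }
}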