commit 66704ba064c13f6349722edd8d63f93b5d2244ec Author: Iain R. Learmonth irl@fsfe.org Date: Wed Jul 24 14:54:19 2019 +0100
Adds guidelines for new metrics page
Fixes: #29315 --- .../metrics/web/MetricsGuidelinesServlet.java | 25 +++ src/main/resources/web.xml | 11 + src/main/resources/web/jsps/metrics-guidelines.jsp | 223 +++++++++++++++++++++ src/main/resources/web/jsps/sources.jsp | 6 + 4 files changed, 265 insertions(+)
diff --git a/src/main/java/org/torproject/metrics/web/MetricsGuidelinesServlet.java b/src/main/java/org/torproject/metrics/web/MetricsGuidelinesServlet.java new file mode 100644 index 0000000..41dae13 --- /dev/null +++ b/src/main/java/org/torproject/metrics/web/MetricsGuidelinesServlet.java @@ -0,0 +1,25 @@ +/* Copyright 2018 The Tor Project + * See LICENSE for licensing information */ + +package org.torproject.metrics.web; + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +public class MetricsGuidelinesServlet extends AnyServlet { /* Servlet for the metrics guidelines page; handles GET by forwarding to a JSP. */ + + private static final long serialVersionUID = 6099009779662419291L; + + @Override /* Copies this.categories (not declared in this class; presumably inherited from AnyServlet) into the categories request attribute, then forwards to WEB-INF/metrics-guidelines.jsp. */ + public void doGet(HttpServletRequest request, + HttpServletResponse response) throws IOException, ServletException { + + request.setAttribute("categories", this.categories); + request.getRequestDispatcher("WEB-INF/metrics-guidelines.jsp") + .forward(request, response); + } +} + diff --git a/src/main/resources/web.xml b/src/main/resources/web.xml index 045dd2e..f92813a 100644 --- a/src/main/resources/web.xml +++ b/src/main/resources/web.xml @@ -218,6 +218,17 @@ </servlet-mapping>
<servlet> + <servlet-name>MetricsGuidelinesServlet</servlet-name> + <servlet-class> + org.torproject.metrics.web.MetricsGuidelinesServlet + </servlet-class> + </servlet> + <servlet-mapping> + <servlet-name>MetricsGuidelinesServlet</servlet-name> + <url-pattern>/metrics-guidelines.html</url-pattern> + </servlet-mapping> + + <servlet> <servlet-name>NewsServlet</servlet-name> <servlet-class> org.torproject.metrics.web.NewsServlet diff --git a/src/main/resources/web/jsps/metrics-guidelines.jsp b/src/main/resources/web/jsps/metrics-guidelines.jsp new file mode 100644 index 0000000..0f06fe5 --- /dev/null +++ b/src/main/resources/web/jsps/metrics-guidelines.jsp @@ -0,0 +1,223 @@ +<%@ taglib prefix="c" uri="http://java.sun.com/jsp/jstl/core" %> +<%@ taglib prefix="fn" uri="http://java.sun.com/jsp/jstl/functions" %> +<jsp:include page="top.jsp"> + <jsp:param name="pageTitle" value="Sources – Tor Metrics"/> + <jsp:param name="navActive" value="Sources"/> +</jsp:include> + +<div class="container"> +<ul class="breadcrumb"> +<li><a href="/">Home</a></li> +<li><a href="sources.html">Sources</a></li> +<li class="active">Guidelines for getting your data into Tor Metrics</li> +</ul> +</div> + +<div class="container"> + +<h1>Guidelines for getting your data into Tor Metrics +<a href="#metrics-guidelines" id="metrics-guidelines" class="anchor">#</a></h1> + +<h2>Scope and preliminaries</h2> + +<p>This document provides guidelines to authors and operators of tools that + collect data about the publicly deployed Tor network who would like to + contribute data, or allow data to be contributed easily by others using + the tool, to Tor Metrics.</p> + +<p>This document does not discuss how to ensure measurements are safe; for this, + refer to the Research Safety Board Guidelines [<a href="#ref-0">0</a>] and Guidelines for + Performing Safe Measurement on the Internet [<a href="#ref-1">1</a>].</p> + +<h2>What data belongs on Tor Metrics?</h2> + +<ul> +<li>If it happens in the publicly deployed
Tor network it likely belongs on Tor + Metrics.</li> +<li>If it happens for a short term only, like for a research project, it's + unlikely to be worth the effort to have Tor Metrics archive, publish, aggregate, + and visualize it. In this case you should collect the data yourself (keeping + in mind research ethics!), and we can later talk about linking to it or even + using it as external data.</li> +<li>If your data is a combination of existing data on Tor Metrics plus maybe + external data, we shouldn't add it, either. In such a case we should rather + talk about extending our services towards what your service does, if that + makes sense.</li> +</ul> + +<h2>What data do you want to see on Tor Metrics?</h2> + +<p>This section aims to help you organize your thoughts before making a request + to the Metrics team. It might be that there are good reasons that something is + not done in one of the preferred ways, but ideally data collection tools can + be written with these guidelines in mind.</p> + +<ul> +<li>What is your data about? Is it about servers or users or both? Is it + passively gathered or actively measured or both?</li> +<li>This will help us to decide how we might present the data on Tor Metrics + and perhaps which other datasets we have that might benefit from being + combined with the new dataset.</li> +<li>Is there a way for you to aggregate the data before you hand it over to us? + Of course this requires more thinking upfront, but it's a great way to avoid + handing overly sensitive data to us or anyone else. It's not always + possible or even useful to aggregate data and discard the original data, + though. Two examples:<ol> + <li>Relays count how many clients download the consensus from them and from + which country they connect. When 24 hours have passed, they include the + count by country in their next extra-info descriptor. This is aggregated + data. 
The obviously more sensitive, non-aggregated variant would be for + relays to provide a log of clients downloading consensuses.</li> + <li>The torproject.org webservers keep highly sanitized logs of web clients + making requests to them that we sanitize even more before we archive them. + This is non-aggregated data. The possibly less sensitive aggregated variant + would be for webservers to count requests by requested URL or similar.</li> + </ol></li> +<li>Is the data you're planning to give us too sensitive? If so, can you sanitize + it yourself before giving it to us (we can help you with that), or does the + sanitizing need to happen on our side (we should still involve you in this + case)? +<p>There are currently cases where Tor Metrics performs the sanitization of + data before archiving, but the preferred system would sanitize the data as + close to the source as possible to minimize the possibility that sensitive + data could be leaked.</p></li> +<li>How do you expect Tor Metrics to fetch your data? For most data + currently, CollecTor fetches from a web server secured with TLS. This is + the easiest and quickest method to implement and so there should be a good + reason to not use this method.</li> +<li>When is your data available and for how long? Ideally, we'd survive reboots + or downtimes on our side for up to 72 hours without losing any of your data. + Typically, you'd implement this using a cache. If that is hard or impossible + to do on your side, we'll have to think about adding redundancy on our side. + That's all possible and we've done it before; it'll just make the process take + longer.</li> +<li>Do you expect any difficulties on our side to write code that processes your + data? If we only need to fetch and store your data, probably not. But if we + have to inflate, parse, verify, combine, sanitize, split, and deflate your + data, maybe. And if we need to include fancy crypto libraries in order to + process your data, then for sure. 
Any intuitions you have about possible + difficulties would be good to know, even if things turn out to be easier in + the end. +<p>As far as possible, use simple formats for providing data. The Tor Directory + Protocol meta-format [<a href="#ref-2">2</a>, §1.2] is a simple format for which we already have + parsers. Without good reason, do not serialize to formats such as YAML, TOML, + etc., as this would require adding a new parsing library into Tor Metrics just + to parse the new data.</p></li> +<li>How much data do you think you'll give us over the next five years? A + ballpark figure is fine, like the number of bytes as a power of ten.</li> + </ul> + +<h2>What belongs in the data format for the data to be archived?</h2> + +<dl> +<dt>Timestamp</dt><dd>We're using the timestamp to place the data item into the right + archive file, among other things. Exception: microdescriptors do not contain + a timestamp, which makes them a pain to archive.</dd> +<dt>Source identifier</dt><dd>Ideally, we'd expect a cryptographic identifier of the + source, but if that is not available, any identifier will do. Exception: exit + lists do not contain a source identifier, because there happened to be just + one exit list scanner in the network; you can see how this doesn't scale so + well.</dd> +<dt>Generator identifier</dt><dd>The name of the software and its version (either release + or a commit reference) that produced the result. If a bug is discovered in the + software then this allows us to see which data may have been affected by it.</dd> +<dt>Network location</dt><dd>If performing active measurement, the network location of + the vantage point (e.g. IP address, ASN, and/or country) can help to provide + context when comparing between different vantage points.</dd> +<dt>Signature</dt><dd>The signature is the proof that the source produced the data item, + not us. And even if we don't verify all signatures, others might want to do + that. 
If you are using the Tor Directory Protocol meta-format to serialize + your metrics then signing metrics using RSA or Ed25519 signatures can be + done easily. Signatures should not be detached, to keep fetching, archiving + and validating simple. Exception: hello, exit list, you again!</dd> +</dl> + +<h2>First steps</h2> + +<p>You're still reading, so it seems that we caught your interest! How should we + start?</p> + +<ul> +<li>Is the data already publicly available somewhere and all you want is to discuss + a way to include it in Tor Metrics? That's easy then. Just share with us what + you have and we can talk.</li> +<li>If the data is not public yet, do you maybe have a data format that we can + discuss? Bonus points if it comes with samples, but only if you're absolutely + certain that the data is safe to be published.</li> +<li>If you have none of the above, can you share logs with us, so that we can + help you derive a possible data format? It doesn't need to be recent logs + (even though time might not magically make your data safe to be published). + You could edit the logs and take out any parts you think are too sensitive. + And you should encrypt the data before sending it to us.</li> +<li>If you have nothing at all yet, let's talk anyway. Describe to us what you think would be good to include in Tor Metrics, and we'll figure something + out.</li> +</ul> + +<h2>How will Tor Metrics include the data?</h2> + + <p>It's a process to get your data on Tor Metrics, and not a short one. Let's go + through the necessary steps for doing it. After each step we should together + decide whether we're ready to move forward, need to take a step back, or maybe + even stop the project, because we found out that it's not what we wanted.</p> + +<ul> + <li>If you can, give us a few months' heads-up. Ideally, it won't take us that + long to do this project, but we'd prefer to make room for it in our next + six-month roadmap. 
Otherwise we might not be able to do it right away.</li> +<li> We discuss your data format with you and other Tor developers on the public + tor-dev@ mailing list. Maybe you or we need to write a Tor proposal for this.</li> +<li>We write a documentation page for the data format plus any necessary + sanitizing steps. See the Tor Metrics website and the tor-spec Git repository + for a couple of examples.</li> +<li>We write code for metrics-lib and/or Stem to parse your data and verify the + data format. At this point we'll find out if there are any misunderstandings + regarding data types or data structure that we haven't seen before.</li> +<li>We write code for CollecTor to fetch and archive your data, but without + publishing just yet. As part of this we also agree on file names and URLs + where your data will later be available.</li> +<li>We make a one-time visualization using your data, mostly as a sanity check. + You'd be surprised how many issues are hiding well enough that we would + otherwise not find them.</li> +<li>At this point we can think about adding your data to our services like + Onionoo, Relay Search, and ExoneraTor and our visualizations on Tor Metrics. + Typically, we'd do that as a separate project, though.</li> +<li>Finally we make your data available for download on CollecTor and put the + documentation on the Tor Metrics website. We announce that your data is now + on Tor Metrics.</li> +</ul> + +<h2>Maintenance</h2> + +<p>Congratulations, your data is now on Tor Metrics. But that's not the end of the + story! Here's what we need you to do as long as we have your data:</p> + +<ul><li>Make sure that we always get the data by whatever means we came up with + together. Avoid longer downtimes and fix any related issues in a timely + fashion. 
We do care about this, because people will come to us and complain + that "our" data is not up-to-date, when it may in fact be your fault.</li> +<li>If you're planning to make any changes that affect the data format or the way + the data comes to us, talk to us beforehand with enough time to make such + changes. Several weeks in advance would be good, because we may have to + inform our users about upcoming changes and give them some time to update + their tools.</li> + <li>Let's be honest: we have had to remove data from Tor Metrics in the past, because + the services providing it had become unreliable or unmaintained. In such a + case we'd talk to you and try to improve the situation. But if that doesn't + work, we'd remove your data from Tor Metrics with enough heads-up time for + you and others to prepare. We'd very likely archive your data and keep it + around in such a case. Sorry, and thanks for understanding!</li> +</ul> + +<h2>References</h2> + +[<a id="ref-0">0</a>] Tor Project. Research Safety Board Guidelines. + <a href="https://research.torproject.org/safetyboard/#guidelines">https://research.torproject.org/safetyboard/#guidelines</a><br> +[<a id="ref-1">1</a>] I. Learmonth. Guidelines for Performing Safe Measurement on the Internet. + (Work-in-progress). <a href="https://datatracker.ietf.org/doc/draft-learmonth-pearg-safe-internet-measurement/">https://datatracker.ietf.org/doc/draft-learmonth-pearg-safe-internet-measurement/</a><br> +[<a id="ref-2">2</a>] Tor Project. Tor Directory Protocol, version 3. + <a href="https://spec.torproject.org/dir-spec">https://spec.torproject.org/dir-spec</a> + +</div><!-- .container --> + +<jsp:include page="bottom.jsp"/> + diff --git a/src/main/resources/web/jsps/sources.jsp b/src/main/resources/web/jsps/sources.jsp index a43d681..1f2930c 100644 --- a/src/main/resources/web/jsps/sources.jsp +++ b/src/main/resources/web/jsps/sources.jsp @@ -41,6 +41,12 @@ </ul> </div>
+ <div class="container"> + <h2>New sources <a href="#new" name="new" class="anchor">#</a></h2> + <p>We have written <a href="/metrics-guidelines.html">guidelines for Tor developers</a> that would like metrics collected by their service to be integrated + with Tor Metrics.</p> + </div> + <div class="container"> <h2>Specifications <a href="#specifications" name="specifications" class="anchor">#</a></h2> <p>The following specification documents are available for Tor network data.</p>
tor-commits@lists.torproject.org