commit 2c25f916e2457a4b08b167e0ea24a05214345170
Author: A. Johnson <aaron.m.johnson(a)nrl.navy.mil>
Date: Wed Apr 22 14:33:17 2015 -0400
fixing HS protocol step numbering
---
2015/hidden-service-stats/hidden-service-stats.bib | 143 ++-
2015/hidden-service-stats/hidden-service-stats.tex | 972 +++++++++++---------
2015/hidden-service-stats/protocol.odg | Bin 17986 -> 14037 bytes
3 files changed, 690 insertions(+), 425 deletions(-)
diff --git a/2015/hidden-service-stats/hidden-service-stats.bib b/2015/hidden-service-stats/hidden-service-stats.bib
index 2fbb81a..c55e917 100644
--- a/2015/hidden-service-stats/hidden-service-stats.bib
+++ b/2015/hidden-service-stats/hidden-service-stats.bib
@@ -39,4 +39,145 @@
booktitle = {Proceedings of the Forty-first Annual ACM Symposium on Theory of Computing},
series = {STOC '09},
year = {2009}
-}
\ No newline at end of file
+}
+
+@misc{torprop-rend-spec-ng,
+ author = {Nick Mathewson},
+ title = {Next-Generation Hidden Services in {Tor}},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/proposals/224-rend-spec-ng.txt}},
+ note = {Accessed: 2015-04-18}
+}
+
+@inproceedings{tor-design,
+ title = {{Tor}: The Second-Generation Onion Router},
+ author = {Roger Dingledine and Nick Mathewson and Paul Syverson},
+ booktitle = {Proceedings of the 13th USENIX Security Symposium},
+ year = {2004}
+}
+
+@misc{rend-spec,
+ key = {Tor Rendezvous Specification},
+ title = {Tor Rendezvous Specification},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/rend-spec.txt}},
+ note = {Accessed: 2015-04-18}
+}
+
+@misc{tormetrics,
+ key = {{Tor Metrics Portal}},
+ title = {{Tor Metrics Portal}},
+ howpublished = {\url{http://metrics.torproject.org/}},
+ note = {Accessed: 2015-04-18}
+}
+
+@misc{dir-spec,
+ key = {Tor directory protocol},
+ title = {Tor directory protocol},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/dir-spec.txt}},
+ note = {Accessed: 2015-04-18}
+}
+
+@techreport{extrapolation-tor-techreport,
+ author = {George Kadianakis and Karsten Loesing},
+ title = {Extrapolating network totals from hidden-service statistics},
+ number = {2015-01-001},
+ institution = {The Tor Project},
+ year = {2015}
+}
+
+@misc{torprop-hsstats,
+ authors = {George Kadianakis, David Goulet, Karsten Loesing, Aaron Johnson},
+ title = {Better hidden service stats from {Tor} relays},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/proposals/238-hs-relay-stats.txt}},
+ note = {Accessed: 2015-04-18}
+}
+
+@inproceedings{botnetfc14,
+ title = {Challenges in protecting Tor hidden services from botnet abuse},
+ author = {Nicholas Hopper},
+ booktitle = {Proceedings of Financial Cryptography and Data Security (FC'14)},
+ year = {2014},
+ month = {March},
+ www_tags = {selected},
+ www_pdf_url = {http://fc14.ifca.ai/papers/fc14_submission_152.pdf},
+ www_section = {Anonymous communication},
+}
+
+@inproceedings{oakland2013-trawling,
+ title = {Trawling for {Tor} Hidden Services: Detection, Measurement, Deanonymization},
+ author = {Alex Biryukov and Ivan Pustogarov and Ralf-Philipp Weinmann},
+ booktitle = {Proceedings of the 2013 IEEE Symposium on Security and Privacy},
+ year = {2013}
+}
+
+@misc{tor2web,
+ key = {Tor2web: browse the anonymous internet},
+ title = {Tor2web: browse the anonymous internet},
+ howpublished = {\url{https://www.tor2web.org/}},
+ note = {Accessed: 2015-04-18}
+}
+
+@inproceedings{torflow-hotpets10,
+ title = {TorFlow: Tor network analysis},
+ author = {Mike Perry},
+ booktitle = {3rd Hot Topics in Privacy Enhancing Technologies (HotPETs 2010)},
+ year = {2010}
+}
+
+@misc{torprop-hsdir-stable,
+ author = {George Kadianakis},
+ title = {{Give out HSDir flag only to relays with Stable flag}},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/proposals/243-hsdir-flag-need-stable.txt}},
+ note = {Accessed: 2015-04-19}
+}
+
+@misc{torprop-tor2web,
+ author = {Virgil Griffith and Fabio Pietrosanti and Giovanni Pellerano},
+ title = {Making Tor2Web mode faster},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/proposals/233-quicken-tor2web-mode.txt}},
+ note = {Accessed: 2015-04-19}
+}
+
+@inproceedings{privda-acsac2014,
+ author = {Eigner, Fabienne and Kate, Aniket and Maffei, Matteo and Pampaloni, Francesca and Pryvalov, Ivan},
+ title = {Differentially Private Data Aggregation with Optimal Utility},
+ booktitle = {Proceedings of the 30th Annual Computer Security Applications Conference},
+ series = {ACSAC '14},
+ year = {2014}
+}
+
+@inproceedings{shadow-ndss12,
+ title = {{Shadow: Running Tor in a Box for Accurate and Efficient Experimentation}},
+ author = {Rob Jansen and Nicholas Hopper},
+ booktitle = {Proceedings of the Network and Distributed System Security Symposium -
+ {NDSS}'12},
+ year = {2012}
+}
+
+@mastersthesis{sukhbir-thesis,
+ author = {Sukhbir Singh},
+ title = {Large-Scale Emulation of Anonymous Communication Networks},
+ school = {University of Waterloo},
+ year = {2014}
+}
+
+@inproceedings{Dwork06differentialprivacy,
+ author = {Cynthia Dwork},
+ title = {Differential privacy},
+ booktitle = {in ICALP},
+ year = {2006}
+}
+
+@misc{tor-spec,
+ author = {Roger Dingledine and Nick Mathewson},
+ title = {Tor Protocol Specification},
+ howpublished = {\url{https://gitweb.torproject.org/torspec.git/plain/tor-spec.txt}},
+ note = {Accessed: 2015-04-21}
+}
+
+@inproceedings{ganta-kdd2008,
+ author = {Ganta, Srivatsava Ranjit and Kasiviswanathan, Shiva Prasad and Smith, Adam},
+ title = {Composition Attacks and Auxiliary Information in Data Privacy},
+ booktitle = {Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
+ series = {KDD '08},
+ year = {2008}
+}
\ No newline at end of file
diff --git a/2015/hidden-service-stats/hidden-service-stats.tex b/2015/hidden-service-stats/hidden-service-stats.tex
index d2650ea..9d57b1f 100644
--- a/2015/hidden-service-stats/hidden-service-stats.tex
+++ b/2015/hidden-service-stats/hidden-service-stats.tex
@@ -4,19 +4,25 @@
\usepackage{hyperref}
\usepackage{longtable}
\usepackage{paralist}
+\usepackage{comment}
+
+\usepackage{styles/pdfdraftcopy}
+\draftcolor{gray20}
+\draftstring{DRAFT}
+\draftfontsize{150pt}
\begin{document}
\title{Hidden-service statistics reported by relays}
-\author{David Goulet, George Kadianakis, Karsten Loesing}
-
-\contact{\href{mailto:dgoulet@torproject.org}{dgoulet@torproject.org},%
-\href{mailto:asn@torproject.org}{asn@torproject.org},%
-\href{mailto:karsten@torproject.org}{karsten@torproject.org}}
+\author{David Goulet\\The Tor Project\\ \href{mailto:dgoulet@torproject.org}{dgoulet@torproject.org} \and
+Aaron Johnson\\U.S. Naval Research Laboratory\\ \href{mailto:aaron.m.johnson@nrl.navy.mil}{aaron.m.johnson@nrl.navy.mil} \and
+George Kadianakis\\The Tor Project\\ \href{mailto:asn@torproject.org}{asn@torproject.org} \and
+Karsten Loesing\\The Tor Project\\ \href{mailto:karsten@torproject.org}{karsten@torproject.org}}
+%\footnotemark[\ref{tpi}]
-\reportid{DRAFT}
-\date{January XX, 2015}
+%\reportid{DRAFT}
+\date{}
\maketitle
@@ -26,325 +32,360 @@
% - Abbreviations are best avoided.
% - Code, cell names, etc. are put inside \verb+...+.
-\tableofcontents
+%\tableofcontents
\begin{abstract}
-We have little insight into hidden-service usage in the public Tor
-network.
-In this report we discuss possible statistics to understand hidden
-services better which can be used to make performance improvements, find
-bugs, and possibly even detect attacks.
-We also evaluate whether, and how, statistics could be abused by an
-adversary.
-The main contribution of this report is a comprehensive list of
-hidden-service related statistics with recommendations for or against
-gathering them in the public Tor network.
+In order to understand how hidden services are being used in Tor, relays
+can collect and report statistics about the hidden-service activity they
+observe. However, such statistics might harm the privacy of individual Tor
+users. We describe how relays can be used to collect statistics about
+hidden services, how they might be useful, and how they can be published
+in a way that protects individual privacy.
\end{abstract}
\section{Introduction}
-Tor hidden services are a means to provide web services over Tor while
-hiding their physical location.
-Unfortunately (or some would say fortunately), we have little insight into
-hidden-service usage in the public Tor network.
-A basic understanding of hidden-service usage would help us direct efforts
-on hidden-service development.
-These efforts include making performance improvements, finding bugs, and
-possibly even detecting attacks.
-
-In this report we compile a list of hidden-service related statistics and
-evaluate whether and how they could be used to make hidden services
-better.
-% There are no plans for gathering hidden-service statistics on hidden
-% servers or clients, mostly because there is no data-collecting
-% infrastructure in place and because privacy implications are even less
-% clear in the case of single clients or servers reporting statistics than
-% in the case of relays serving dozens or hundreds of hidden services and
-% their clients.
-At the same time we evaluate whether and how these statistics could be
-abused by an adversary by providing them with data from relays they don't
-control.
-The primary purpose of hidden services to hide their location and the
-location of their users must not be put at risk.
-Other security properties like their ability to hide their existance, at
-least to some extent, should not be sacrificed for potential improvements.
-As a result we present a list of hidden-service related statistics with
-recommendations for or against deploying them in the public Tor network.
-
-This report is structured as follows:
-the next section gives an overview of the hidden-service protocol with
-special focus on measurement points for hidden-service statistics.
-Section~\ref{sec:criteria} defines evaluation criteria for statistics to
-decide whether they should be considered helpful and/or harmful.
-Section~\ref{sec:list} contains a list of hidden-service related
-statistics, partly with early results obtained from a private Tor network,
-and an evaluation of their helpfulness and harmfulness.
-Section~\ref{sec:recommendation} concludes this report by recommending
-which statistics should be implemented and deployed in the public Tor
-network.
+Tor hidden services~\cite{tor-design,rend-spec} are a means to provide a
+network service while hiding the location of the server. To provide this
+anonymity to the server, client connections are set up using a different
+process than is used by connections to known servers. This process comes
+with a performance cost. Also, servers must be configured explicitly to
+be accessed as a hidden service. As a result, the way that Tor hidden
+services are being used may be quite different from the way that Tor is
+used to anonymize traffic to normal, non-hidden Internet services.
+
+Therefore, in addition to the general statistics that Tor collects
+about its network~\cite{tormetrics}, it is interesting and useful to
+collect statistics specific to hidden services. For example,
+hidden-service statistics could show how popular they are, how well they
+perform, and if they are under attack. Publishing such data could help
+Tor to manage and improve its network, Tor advocates to argue more
+effectively in support of Tor, and researchers to understand how Tor is
+being used.
+
+Tor relays are frequently in a position to learn some information about
+hidden services and their usage. Relays have long collected and reported
+information for many \emph{general} statistics of interest (i.e.
+those that are not specific to hidden services), such as the amount of
+total traffic that they have relayed. They do this by including these data
+in ``extra-info documents''~\cite{dir-spec}, which
+are uploaded to the Directory Authorities. These local observations can be
+combined to give a global view of Tor's hidden service activity.
+
+However, Tor's privacy protections depend on relays not sharing all their
+local observations. Therefore, we must carefully consider how to publish
+data in a way that doesn't hurt Tor's privacy goals. Tor has adopted
+some privacy protections for the general statistics it gathers. These
+protections include
+reporting data aggregated over a period of time and rounding some
+measurements to a desired level of accuracy. There is a set of privacy
+objectives and risks that are specific to hidden services, however, and so
+these techniques may not apply directly.
+For example, hidden services have a primary privacy goal of hiding the
+server's location and a secondary goal of allowing the existence of the
+service itself to be hidden.
+
+% more detail / a paragraph about obfuscation techniques?
+Thus, to address the particular challenge of studying Tor hidden services,
+this report describes how relays can safely collect useful statistics
+about them. It includes the following contributions:
+\begin{itemize}
+\item A description of the types of observations that relays can make
+about hidden services
+\item A description of the privacy objectives specific to hidden services
+\item A description of obfuscation techniques that can be used to safely
+release hidden-service statistics
+\end{itemize}
\section{Hidden-service protocol and measurement points}
\label{sec:protocol}
-The hidden-service protocol consists of multiple substeps to make a
-service available in the network and for clients connecting to a service.
-The statistics discussed in this document all rely on relays gathering
-aggregate statistics of their role in the hidden-service protocol.
-These roles are: a) directory, b) introduction point, and c) rendezvous
-point.
-Figure~\ref{fig:protocol} gives an overview of protocol steps.
+The hidden-service protocol consists of multiple steps for a service to
+become available through the network and for a client to connect to a
+service~\cite{rend-spec}. Each step gives some relays the opportunity to
+gather statistics about their role in the hidden-service
+protocol. Hidden service clients and servers also are in a position to
+gather statistics, but in this report we focus on collection by relays.
+The major roles of relays in the hidden-service protocol are: a) Hidden
+Service Directory (HSDir), b) Introduction Point (IP), and c) Rendezvous
+Point (RP). Figure~\ref{fig:protocol} gives an overview of protocol steps.
\begin{figure}
\centering
\includegraphics[width=0.8\textwidth]{protocol.pdf}
-\caption{Hidden-service protocol steps as observed by relays.}
+\caption{Hidden-service protocol steps as observed by relays}
\label{fig:protocol}
\end{figure}
-The protocol substeps for making a service available in the network are as
-follows:
+The protocol steps for making a service available through the network are
+as follows:
\begin{enumerate}
-\item The service \emph{establishes an introduction point} on one or more
+\item The service \emph{establishes an Introduction Point} on one or more
relays.
A relay first receives a circuit extension request from another relay to
become the next relay in a circuit.
At this time the relay does not recognize that it will become an
-introduction point.
-Then the relay receives a request to establish an introduction point on
+Introduction Point.
+Then the relay receives a request to establish an Introduction Point on
behalf of the service that built the circuit.
-However, the introduction point does not know which service it's serving,
-because the service creates a fresh identity key for each introduction
-point.
+However, the Introduction Point does not know which service it's serving,
+because the service creates a fresh identity key for each Introduction
+Point.
The circuit, now called introduction circuit, will be kept open until the
service closes it.
From that time on the relay accepts client introductions for the service
coming in via other circuits.
\item The service \emph{publishes a descriptor} to a total number of six
-directories, which are common relays with high-enough uptime of 24 hours
-or more.
-Each of those relays first receives a circuit extension request, followed
+Hidden Service Directories, which are the set of relays with high-enough
+uptime as indicated by the ``HSDir'' flags in the network consensus.
+Each of those relays receives a circuit extension request followed
by a request to publish the descriptor.
-The relay can read most parts of the descriptor which includes service
-identity and selected introduction points.
+The relay can read most parts of the descriptor, including the service
+identity (this may change as part of a proposed protocol
+update~\cite{torprop-rend-spec-ng}) and selected Introduction Points.
The circuit that was built by the service is closed immediately after
transmitting the descriptor.
\end{enumerate}
-The protocol substeps for a client connecting to a service are as
+The protocol steps for a client connecting to a service are as
follows:
\begin{enumerate}
\setcounter{enumi}{2}
-\item The client \emph{fetches a descriptors} from a directory.
-The relay that acts as directory first receives a circuit extension
-request, followed by a request to fetch the descriptor.
-Once the descriptor is returned, or a response saying that it does not
-exist, the circuit is closed immediately.
-\item The client \emph{establishes a rendezvous point} on a relay.
-The relay observes a circuit extension request, followed by the request to
-become the client's rendezvous point for connecting to a service.
+\item The client \emph{fetches a descriptor} from a Hidden Service
+Directory. The relay that acts as an HSDir receives a circuit extension
+request followed by a request to fetch the descriptor.
+Once the descriptor is returned, or a response is returned saying that the
+descriptor does not exist, the circuit is closed immediately.
+\item The client \emph{establishes a Rendezvous Point} on a relay.
+The relay observes a circuit extension request, followed by a request to
+become the client's Rendezvous Point for connecting to a service.
The relay does not learn which service the client attempts to connect to,
-but it only learns a random identifier.
+only a random identifier.
The circuit, now called rendezvous circuit, is kept open until either
-client or service close it.
+the client or service closes it.
\item The client \emph{sends an introduction} to one of the service's
-introduction points.
-The introduction point first observes a circuit extension request,
+Introduction Points.
+The Introduction Point observes a circuit extension request
followed by the introduction message, which it forwards to the service via
its introduction circuit.
-The client's circuit is torn down immediately after receiving the
+The client's circuit is torn down by the relay after receiving the
introduction and responding with an acknowledgement.
\item The service \emph{sends a rendezvous message} to the client's
-rendezvous point.
-The rendezvous point sees a circuit extension request, followed by the
+Rendezvous Point.
+The Rendezvous Point sees a circuit extension request followed by the
rendezvous message, which it forwards to the client via its rendezvous
circuit.
Both service-side and client-side circuits are kept open until either side
decides to close it.
\item Both \emph{client and service send and receive data} along their
part of the rendezvous circuit.
-The rendezvous point sees cells coming in from either side and forwards
+The Rendezvous Point sees cells coming in from either side and forwards
them to the other side.
All cells are padded and end-to-end encrypted between client and service,
-so that the rendezvous point only sees encrypted cells of the same size.
+so that the Rendezvous Point only sees encrypted cells of the same size.
\end{enumerate}
-All these protocol steps constitute potential measurement points for
-hidden-service related statistics.
+All of these protocol steps constitute potential measurement points for
+hidden-service related statistics. For example, some Tor relays have
+been reporting the following statistics about hidden services, which
+were proposed by Kadianakis et al.~\cite{torprop-hsstats} and the
+results analyzed by Kadianakis and
+Loesing~\cite{extrapolation-tor-techreport}:
+\begin{compactenum}
+\item \textsf{NumRPCells}: The number of hidden-service data
+cells seen at a Rendezvous Point, reported every 24 hours
+\item \textsf{NumHSDirIDs}: The number of unique hidden
+identifiers seen at an HSDir, reported every 24 hours
+\end{compactenum}
\section{Evaluation criteria for statistics}
\label{sec:criteria}
-
-Each of the hidden-service related statistics needs to fulfill two
-criteria: first, it needs to serve a concrete purpose for making hidden
-services better; and second, it must not provide an adversary with data
-that would help them locate clients or services.
+A proposal to collect some hidden-service statistics must be evaluated
+according to several criteria. The main criteria are the benefits to Tor,
+the effect on privacy, the reliability of the statistics, the efficiency
+of the collection process, and alternatives to live collection.
\subsection{Possible benefits from gathering statistics}
-
-The purpose of the statistics discussed in this report is to learn more
-about hidden services and as a result make them better.
-We can imagine a couple possible benefits from gathering these statistics
-that we outline in the following.
-For all possible benefits we assess whether we need statistics from the
-public Tor network, or if we could as well obtain statistics in a private
-testing network.
+There are several compelling reasons to collect information about hidden
+service statistics. We describe some of them to illustrate the possible
+benefits.
\paragraph{Learn about usage}
-
-We are interested in learning more about hidden service usage to direct
-our development efforts.
-All past design decisions around hidden services were either based on
-assumptions how hidden services might be used, or on own observations.
-If we had statistics about provided services and their usage, we might
-adapt the design to its actual usage.
-Related to this, having statistics on hidden service usage as compared to
-normal Tor usage might help in getting sponsors and developers interested
-in making hidden services better.
+Understanding how hidden services are being used can help both to direct
+development efforts and to justify them. Usage statistics could provide
+insight into how many hidden services exist, how available they are, how
+many clients they have, how much Tor traffic is hidden-service traffic,
+and what types
+of applications use hidden services. These data could help Tor developers
+decide how much effort to spend improving hidden services and what types
+of changes would most benefit users. They can also be of use to
+researchers that analyze Tor and design improvements.
+Hidden-service statistics would also provide evidence
+that can be used in efforts to improve their media coverage,
+to persuade potential funders, and to inform government policy.
\paragraph{Improve performance}
-
-We want to measure performance of hidden services as a whole and of their
-protocol substeps to identify any bottlenecks.
-We hope that we can perform most of these measurements in private networks
-where we don't put any users at risk.
-But if we want to build a model that resembles reality, we'll need at
-least some real data as input, or we're back at making assumptions which
-may not reflect reality.
+Data about hidden services can help Tor developers improve performance.
+Such data might include speed and failure rates for the steps of the
+hidden-service protocol, resource utilization at the relays, and
+hidden-service availability.
+This information could be used to improve load balancing across relays,
+to optimize the values of various parameters in the hidden-service
+protocol, and to add relay resources where they would be most effective.
+They could be used to inform more ambitious redesigns of the
+hidden-service protocols. They could also be used to build models of
+client, network, and hidden-service behavior to use in network simulations
+done in support of performance improvements.
\paragraph{Identify bugs}
-
-Statistics can help us detect bugs that cannot be found in private Tor
-networks.
-Obviously, we'd want to fix as many bugs as possible in a private network
-setting.
-But there will always be cases that we'd miss in a test network, possibly
-caused by different software versions or non-standard usage that we didn't
-think of.
-Having some real data indicating problems in the hidden-service protocol
-would serve as good starting point to go bug hunting.
+Statistics can help detect bugs in hidden services. This could be
+especially useful to identify problems those that occur infrequently or
+only under realistic conditions, such the network traffic load or the mix
+of Tor software versions used. Information that could be particularly
+useful for this purpose includes failures rates and the frequency of
+unexpected events.
\paragraph{Discover attacks}
+Hidden-service statistics may uncover ongoing attacks in the network.
+Denial-of-service attacks in particular might be detected from spikes in
+usage and failure statistics. Statistics about the utilization of relay
+resources such as CPU, memory, and bandwidth might also reveal variations
+that are unexpected under normal use. In addition, data about the software
+versions may reveal client Sybil attacks or botnet
+abuse~\cite{botnetfc14}.
+
+\subsection{Hidden-service privacy goals}
+Gathering statistics on Tor has implications for its privacy because the
+data is determined from relays' observations of the private behavior of
+Tor users. Tor statistics are published and thus may be used by an
+adversary to determine the private activity of Tor users.
+
+Tor has several privacy goals. Primary, of course, is to provide
+anonymity to the source of a connection and to hide the network location
+of a hidden service. However, there are several
+other related and secondary goals. Complementing user privacy, for
+example, is the goal of unlinkability among separate Tor connections,
+which prevents an adversary from creating a pseudonymous profile
+of a user's network activity. We focus on the various privacy goals
+specific to hidden services, which to our knowledge have not previously
+been explicitly articulated. Our listing of privacy goals is not
+exhaustive. Tor's default position for information about the use
+of its network is that it should be protected, and exceptions should be
+made only in specific, carefully-considered cases. The privacy goals we
+highlight are those for which choosing to contradict them is very
+unlikely.
+
+\paragraph{Information about a specific hidden service}
+The following information about any specific hidden service should remain
+private:
+\begin{compactenum}
+\item The network location of the hidden service
+\item The exact number of hidden services (i.e. the existence of the
+hidden service)
+\item The number of clients that have used the hidden service
+\item The number of connections to the hidden service
+\item The amount of traffic to or from the hidden service
+\item The availability of the hidden service
+\end{compactenum}
+Observe that including several of these amounts to a goal of allowing a
+hidden service to remain undiscovered or undistinguished. This allows a
+hidden service to maintain a low profile and not come under investigation
+or scrutiny by an observant adversary.
+Also note that these are not all currently well-protected by Tor. For
+example, the existence and popularity of a hidden service can currently be
+learned by an HSDir~\cite{oakland2013-trawling}, although there are plans
+to close this information leak~\cite{torprop-rend-spec-ng}.
+
+\paragraph{Information about a specific hidden-service client}
+Related to the above, it is a goal to hide information that is specific
+to a given hidden-service client, including
+\begin{compactenum}
+\item The number of hidden-service connections of the client
+\item The amount of the client's hidden-service traffic
+\item When a client is active on hidden services
+\end{compactenum}
+These privacy goals also exist more generally for clients' Tor activity
+that doesn't use hidden services.
-We might be able to use hidden-service related statistics to uncover
-ongoing attacks in the network.
-If a reported statistic is off by more than a certain expected threshold
-or against the past trend, that might indicate that an attack is mounted
-on the network.
-This is obviously something we can only find out in the public Tor network
-and not in private testing networks.
-
-\subsection{Possible risks of gathering statistics}
-
-% HSDirs threat model notes
-%
-% Hidden Service directories periodically receive HS descriptors from
-% hidden services. They cache them, and then serve them to any clients
-% that ask for them.
-%
-% Hidden service directories are placed in a hash ring, and each hidden
-% service picks a slice of hidden service directories from that hash ring.
-% Given the address of a hidden service, it's easy to learn which
-% directories are responsible for it. This makes hidden-service directory
-% statistics dangerous since they can potentially be matched to specific
-% hidden services.
-%
-% Furthermore, each hidden service has 6 directories, and each directory
-% serves a different set of services. This means that attackers have 6
-% different data points per hidden service every hour that can be used to
-% reduce measurement noise.
-
-The benefits of statistics have been discussed above, so it's clear that
-statistics can be used for good.
-But they can also be used for bad.
-The risk of gathering statistics is that an adversary could misuse them
-for their attacks on clients and/or services.
-All statistics are designed to be publicly available, so an adversary,
-who might already control one or more relays, could use statistics to
-learn something about relays she does not (yet) control.
-In the following we outline aspects of hidden-service usage that we don't
-want to reveal by statistics.
-
-\paragraph{Infer availability or popularity of a specific service}
-
-We want to learn interesting facts about all services together, but we
-want to avoid that statistics can be used to single out any specific
-service and derive its availability or popularity.
-This includes services identified by their service address as well as
-popular services that may be identified by the number of connecting
-clients or handled traffic volume.
-As a matter of fact, it's not difficult for an adversary to link services
-to relays working as directories or introduction points: the six
-directories storing descriptors for that service can be determined easily,
-and the introduction points of a service are listed in its descriptors.
-The adversary can compare statistics reported by all directories or
-introduction points of a service to reduce measurement noise.
-Only the rendezvous point changes for each client connection, so that
-statistics reported by rendezvous point cannot easily be linked to a
-specific service.
-
-\paragraph{Infer activity of a specific client}
-
-Related to the above, we want to learn about activity of all clients, but
-we want to avoid that statistics can be used to single out a specific
-client and learn about its activity.
-This includes power users that access lots of services or transfer large
-data volumes as well as clients which are services themselves, like
-tor2web. It also includes the fact that a given user under direct
-observation is using hidden services at all.
-
-\paragraph{Assess precise number of available services}
-
-We want to learn roughly how many services are available in the network,
-but we want to avoid that these estimates make it easier for an adversary
-to enumerate available services.
-While hiding the existence of a service is not the primary purpose of
-hidden services, it's a security feature we don't want to give up easily.
-
-\paragraph{Unknown future attacks}
-
-Special care needs to be taken when designing and collecting
-statistics because in anonymity the attacker landscape changes
-continuously and attacks that are currently ineffective might become
-powerful in the future. Alternatively, in the future attackers might
-be able to acquire auxiliary data that can combine with statistics in
-such ways that allow attacks that would not have been possible before.
-
-\subsection{Other aspects of gathering statistics}
-
-There are certain aspects of any given statistic that should be
-considered in order to decide for or against gathering them.
-We list a few of those aspects below.
-
-\paragraph{Robustness against liars}
-
-The statistics discussed in this document would all be reported by relays
-and not confirmed by third parties.
-We must consider cases where a relay operators modifies their source code
-to manipulate reported statistics to their advantage.
-A statistic should be robust against single liars, as long as there is a
-sufficient number of honest relays, possibly run by trusted operators.
-We also should not depend on statistics reported by single relays, if
-possible.
-Though it would be interesting to have statistics indicating adoption of
+\paragraph{Information about a specific hidden-service connection}
+It is also a goal to hide information about any given hidden-service
+connection, including
+\begin{compactenum}
+\item The client and the service of the connection
+\item The exact number of hidden-service connections
+\item When a connection occurred
+\item How much traffic was transferred over some connection
+\item If the connection shares a client or hidden service with another
+given connection
+\end{compactenum}
+As with specific client information, it is a privacy goal more generally
+to protect information about any specific client connection.
+
+\paragraph{Information about ongoing activity}
+Statistics should not reveal information relevant to the \emph{ongoing}
+activity of clients, hidden services, or connections. This should be
+maintained even when the information is safe to release \emph{after} the
+activity has ceased. For example, it should not be revealed that certain
+relays are currently being used as Introduction Points by some hidden
+service, although it may be reasonable to reveal that some relays have
+been used as Introduction Points in the past. The risk of revealing
+ongoing activity is that an active adversary can use that information
+to begin targeted surveillance of that activity.
+
+
+\subsection{Reliability}
+An important criterion for a statistics-gathering procedure is its
+reliability. To be reliable, data must be accurate, available, robust to
+network changes, and resilient to malicious attacks. This criterion
+becomes especially important when a statistic affects the operation
+of Tor in real time. For example, the measurements gathered by
+TorFlow~\cite{torflow-hotpets10} are used to determine relay weights
+in the hourly consensuses.
+
+Robustness to future network changes is an important concern in the Tor
+network, which frequently improves in response to user needs and
+adversarial capabilities. For example, there are several current
+proposals to change hidden services in
+minor~\cite{torprop-hsdir-stable,torprop-tor2web} and
+major~\cite{torprop-rend-spec-ng} ways.
+Statistics collection shouldn't be broken or precluded by such
protocol changes.
-\paragraph{Robustness against protocol changes}
-
-We are planning to improve the hidden-service protocol in the medium term
-by making major protocol changes.%
-\footnote{\url{https://gitweb.torproject.org/torspec.git/blob/HEAD:/proposals/224-rend-spec-ng.txt}}
-We should not implement a statistic if we know it will soon become
-obsolete because of those protocol changes.
-
+Malicious attacks are another particular concern for Tor, because it
+operates in a highly-adversarial setting that is not typical of many other
+networks and services. When relays collect and report hidden-service
+statistics, the process is particularly vulnerable to manipulation
+by malicious relays, which can provide completely arbitrary data. An
+ideal procedure for gathering statistics would be robust to attempted
+manipulation by a small fraction of the network.
+
+\subsection{Efficiency}
+Statistics collection should not place an undue burden on Tor's existing
+infrastructure. It should be designed to minimize bandwidth consumption,
+CPU utilization, and memory requirements. Thus it may be necessary to
+limit the frequency at which statistics are updated or the complexity of
+a cryptographic aggregation
+procedure~\cite{elahi-ccs2014,privda-acsac2014}.
+
+\subsection{Alternatives}
+Some of the possible benefits of measuring the live Tor network may
+also obtained by measuring private or
+simulated~\cite{shadow-ndss12,sukhbir-thesis} networks. For example,
+private networks can be used to find bugs by testing hidden services under
+various conditions and looking for unexpected behavior. Using these
+methods eliminates the privacy risk that accompanies measurement of the
+live network, and so such methods should be preferred when possible.
+
+\begin{comment}
\section{List of hidden-service related statistics}
\label{sec:list}
-\textbf{The descriptions in this section have not been updated yet. Let's
-wait with cleaning them up until we have a final list of statistics and a
-final section structure. Otherwise we'd be rewriting this section over
-and over and over.}
+%\textbf{The descriptions in this section have not been updated yet. Let's
+%wait with cleaning them up until we have a final list of statistics and a
+%final section structure. Otherwise we'd be rewriting this section over
+%and over and over.}
In this section we attempt to compile a comprehensive list of
hidden-service related statistics.
@@ -461,7 +502,7 @@ So, the measured time will be close to zero.
But if we ever decide to re-use existing circuits for rendezvous without
extending them by another hop, this metric will give us an idea on the
adoption of that change.
-Admitted, this benefit is not huge.
+%Admitted, this benefit is not huge.
\\
\textbf{Risks:}
%
@@ -592,16 +633,18 @@ are.
\textbf{Risks:}
%
Publishing this stat would allow someone who is indexing hidden services
-to be able to say ``I have seen 76~\% of all HSes''.
+to be able to say ``I have seen 76~\% of all hidden services''.
We would really like to avoid having such an enumeration-facilitating
property.
-We could be persuaded that with some heavy stats obfuscation (heavier than
-the bridge stats obfuscation), this statistic might be plausible.
-By statistics obfuscation, we mean obfuscating the numbers so that the
-attacker can only say ``I'm somewhere between 60~\% to 75~\% of all
-HSes.''.
-This is a bit related to differential privacy as we understand it, but
-much more basic.
+%We could be persuaded that with some heavy stats obfuscation (heavier than
+%the bridge stats obfuscation), this statistic might be plausible.
+%By statistics obfuscation, we mean obfuscating the numbers so that the
+%attacker can only say ``I'm somewhere between 60~\% to 75~\% of all
+%HSes.''.
+%This is a bit related to differential privacy as we understand it, but
+%much more basic.
+Obfuscation will be necessary to prevent this (see
+Section~\ref{sec:obfuscation}).
Another risk is that changes in the number of HSes could reveal patterns
in the services over time. For example, it could reveal some services that
@@ -612,7 +655,8 @@ network outages over time to narrow down the location of hidden services.
\\
\textbf{Notes:}
%
-When \verb+rend-spec-ng.txt+ gets implemented, it will be harder for
+When \verb+rend-spec-ng.txt+\cite{torprop-rend-spec-ng} gets implemented, it will
+be harder for
hidden service directories to learn the number of served services
since the descriptor will be encrypted.
However, directories will still be able to approximate the number of
@@ -646,9 +690,10 @@ Still it doesn't seem like something we want to risk.
Also, if the result is greater than 24, it means that an HS with modded
RendPostPeriod was publishing to that HSDir (and that the HSDir doesn't
have many clients).
-Do we want to reveal that?
-OTOH, it seems to me that if the directory is serving many services, this
-statistic doesn't really provide any insight. In addition, this could
+%Do we want to reveal that?
+%OTOH, it seems to me that if the directory is serving many services, this
+%statistic doesn't really provide any insight.
+In addition, this could
be used to reveal the introduction points used by a hidden service
(assuming its address is known, but its descriptors are encrypted) by
DOSing suspected IPs and observing in the responsible HSDirs report
@@ -680,7 +725,7 @@ hidden-service descriptors, possibly also percentiles.
It would be interesting to know whether services deviate from the default
number of introduction points.
Though it's unclear what we're going to do with this information.
-This statistic will also be killed by rend-spec-ng.
+This statistic will also be killed by rend-spec-ng~\cite{torprop-rend-spec-ng}.
\subsubsection{Number of descriptors with encrypted introduction points
(3.1.5.)} \label{subsubsec:num_desc_encrypted_ips}
@@ -693,7 +738,8 @@ descriptors with plain-text vs. encrypted introduction point sections.
\textbf{Benefits:}
%
We would learn what fraction of services uses authentication features.
-This statistic won't be available after implementing rend-spec-ng.
+This statistic won't be available after implementing
+rend-spec-ng~\cite{torprop-rend-spec-ng}.
\subsection{Statistics on service usage}
@@ -725,7 +771,8 @@ for other HSes sharing the individual HSDirs by identifying by how much
the counts of a given HSDir tend to change during the periods for which
the target HS is assigned to them.
This is a major problem that doesn't seem resolvable without some kind
-of anonymization or private aggregation of the per-relay stats.
+of anonymization or private aggregation of the per-relay stats (see
+Section~\ref{sec:obfuscation}).
\subsubsection{Number of descriptor fetch requests by service identity} \label{subsubsec:num_descriptor_fetches_per_hs}
@@ -1017,7 +1064,7 @@ us a lot about performance of the rendezvous protocol.
The rendezvous point is the only place in the protocol that witnesses
events near the beginning and near the end of the connection establishment
process.
-If we ever want to improve the substeps inbetween, this metric is the only
+If we ever want to improve the steps inbetween, this metric is the only
way to measure effectiveness of improvements in the deployed network.
\\
\textbf{Risks:}
@@ -1041,7 +1088,7 @@ of the protocol.
\textbf{Risks:}
%
There are no obvious risks from learning the time between these two
-substeps in the rendezvous protocol.
+steps in the rendezvous protocol.
\subsection{Failure statistics}
@@ -1174,13 +1221,18 @@ The benefit gained from this statistic is not huge though.
\textbf{Risk:}
%
No obvious risks.
+\end{comment}
\section{Obfuscation methodology} \label{sec:obfuscation}
-The published statistics shouldn't reveal private information to an
-adversary when combined with plausible background knowledge. We will use
-techniques to provide uncertainty about any specific hidden service,
-client, or connection, while maintaining good accuracy in the aggregate
-statistics. These techniques include
+There is a variety of available methods to produce general
+statistics about a dataset without revealing specific sensitive details.
+We briefly discuss how this might be done in the context of hidden
+services, and then we provide two algorithms for revealing count and
+distributional statistics that can be applied to a variety of specific
+hidden-service statistics.
+
+Some useful techniques for privately publishing statistics on hidden
+services are
\begin{compactitem}
\item Releasing aggregate statistics over time, such as total counts or
averages in a given period
@@ -1192,42 +1244,128 @@ doesn't reveal information about ongoing activity
\item Using cryptographic techniques to hide the source of information,
such as anonymizing reports from individual relays
\end{compactitem}
-
-We will be adding noise in a way that provides differential
-privacy~\cite{dwork-tcc2006} for
-``single actions''. What constitutes a single action will depend on the
-specific statistic. For example, when publishing the number of unique
-descriptors seen at each HSDir, a single action could be publishing a
-descriptor to
-six relays. To obtain differential privacy, we will add noise using the
-Laplace distribution, which has a distribution function
-$\textsf{Lap}(b)$ of $e^{-|x|/b}/(2b)$. We will choose $b$ such that
-altering a single action will change the probability of the total output
-by a factor of at most $e^{\epsilon}$. Thus more privacy is provided the
-smaller that $\epsilon$ is.
-
-
-\subsection{Adversary knowledge}
+However, in general, publishing any information that is broadly
+informative might break privacy, because that information may be just what
+the adversary needs to know in order to learn a sensitive
+fact~\cite{Dwork06differentialprivacy}. Therefore, a discussion of privacy
+on Tor should consider plausible background knowledge for an adversary.
We can expect that the adversary may know things such as
\begin{compactitem}
\item The addresses of a large number of publicly-available services
(e.g. by crawling the Web)
\item A minimum amount of traffic received by a given hidden service
(e.g. due to sending that traffic himself)
-\item The introduction points of a service (by obtaining the descriptor)
-\item The availability of the service (by attempting to connect
+\item The Introduction Points of a service (e.g. by obtaining the
+descriptor)
+\item The availability of a service (e.g. by attempting to connect
periodically)
\item Roughly the number of client connections and amount of client
-traffic (possibly leaked by the service itself, e.g. a web forum)
+traffic for a service (e.g. a web forum that reveals user activity on the
+site)
\end{compactitem}
+Given such information, it can be seen that certain statistics cannot be
+published by individual relays with reasonable accuracy without violating
+some privacy. For example, if each relay accurately reported the number of
+client introduction requests it received, then an adversary that
+knows the \texttt{.onion} address of a hidden service (and thus can obtain
+its Introduction Points) could infer how many client connections the
+hidden-service received, especially if there were few other hidden
+services sharing its Introduction Points. The general problem is that, for
+certain types of hidden-service activities, different hidden services or
+clients will make use of different relays in a way that may be known by
+the adversary.
+
+\begin{comment}
+Table~\ref{table:stats_needing_aggregation} lists
+those statistics for which this is an issue.
+\begin{table}
+\center
+\caption{Per-relay statistics at high risk to leak private information}
+\label{table:stats_needing_aggregation}
+\begin{tabular}{|l|l|}
+\hline
+\textbf{Description} & \textbf{Section}\\
+\hline
+Number of descriptor fetch requests &
+\ref{subsubsec:num_desc_fetches}\\
+\hline
+Number of introductions received from clients &
+\ref{subsubsec:num_intros_from_clients}\\
+\hline
+Reasons for terminating established introduction points &
+\ref{subsubsec:reasons_end_ips}\\
+\hline
+Number of discarded client introductions by reason &
+\ref{subsubsec:num_discarded_client_intros}\\
+\hline
+Time from establishing introduction point to tearing down circuit &
+\ref{subsubsec:time_intro_to_teardown}\\
+\hline
+Number of descriptor updates per service &
+\ref{subsubsec:num_decriptor_updates}\\
+\hline
+Time between last and first published descriptor with same identifier &
+\ref{subsubsec:time_first_last_descriptor_update}\\
+\hline
+Number of descriptor fetch requests by service identity &
+\ref{subsubsec:num_descriptor_fetches_per_hs}\\
+\hline
+Number of introductions received by established introduction point &
+\ref{subsubsec:num_intros_per_circ}\\
+\hline
+Time from establishing introduction point to receiving first client
+introduction & \ref{subsubsec:time_ip_est_to_introduce1}\\
+\hline
+\end{tabular}
+\end{table}
+\end{comment}
+
+We give two algorithms for privately publishing statistics that do not
+have this problem, that is, that can be released by each relay.
+Two examples of such statistics are those described by Kadianakis et
+al.~\cite{torprop-hsstats} (also mentioned in Section~\ref{sec:protocol}):
+\begin{compactenum}
+\item \textsf{NumRPCells}: The number of hidden-service data
+cells seen at a Rendezvous Point
+\item \textsf{NumHSDirIDs}: The number of unique hidden
+identifiers seen at an HSDir.
+\end{compactenum}
+The first algorithm provides a way to
+publish statistics that are simple running counts, including both
+\textsf{NumRPCells} and \textsf{NumHSDirIDs}. The second algorithm
+provides a way to publish statistics about a distribution of values.
+
+Our algorithms will add noise in a way that provides differential
+privacy~\cite{dwork-tcc2006} for ``single actions''. What constitutes a
+single action will depend on the specific statistic. For example,
+Kadianakis et al.~\cite{torprop-hsstats} use the following for their
+statistics:
+\begin{compactenum}
+\item For \textsf{NumRPCells}, a single action is
+transferring a fixed amount of data (specifically, 2048 cells). Thus the
+added noise obscures the activity of a single service, client, or
+connection up to that amount of traffic.
+\item For \textsf{NumHSDirIDs}, a single action is
+publishing a fixed number of hidden services (specifically, 8). Thus the
+added noise obscures the existence of any eight or fewer hidden services.
+\end{compactenum}
+To obtain differential privacy, our algorithms add noise using the
+Laplace distribution, which has a probability distribution function
+$\textsf{Lap}(b)$ of $f(x) = e^{-|x|/b}/(2b)$. We will choose $b$ such
+that altering a single action will change the probability of the total
+output by a factor of at most $e^{\epsilon}$. Thus more privacy is
+provided the smaller that $\epsilon$ is.
+
\subsection{Counts} \label{subsec:counts}
Reporting a basic count is useful in its own right, and it also provides a
simple setting in which to develop privacy-preserving methodology that
we can use for more complicated statistics. Counts can be used to
summarize many types of hidden-service activity, such as the number of
hidden services, the number of hidden-service clients, and the amount of
-hidden-service traffic. Table~\ref{table:count_stats} lists the statistics
+hidden-service traffic.
+\begin{comment}
+Table~\ref{table:count_stats} lists the statistics
for which it could be useful to release counts.
\begin{table}
\center
@@ -1281,14 +1419,14 @@ Number of server rendezvous with unknown rendezvous cookie &
\hline
\end{tabular}
\end{table}
+\end{comment}
To release a single count, we will use ideas from differential privacy
-to provide strong protection for a single hidden-service action, such as
-making a small number of descriptor lookups or sending all but an
-abnormally-large amount of traffic. However, we wish to
+to provide strong protection for a single hidden-service action. However,
+we wish to
continually publish statistics over time, and as a result differential
-privacy, using a per-user or per-service privacy notion that applies over
-time, does not provide an adequate solution. The reasons are
+privacy, using a per-user or per-service privacy notion that includes
+actions over time, does not provide an adequate solution. The reasons are
(\emph{i}) mechanisms using global sensitivity~\cite{dwork-tcc2006} that
would apply over time require amounts of noise that grow with the
reporting period (potentially years in our case), and (\emph{ii})
@@ -1325,27 +1463,60 @@ issues by making the bin output itself noisy.
We suggest the following to privately publish a count $c$:
\begin{compactenum}
-\item Choose a bin granularity $\delta$, which should be larger than
-the amount by which a single action can change the count.
-\item Round $c$ to the nearest multiple of $\delta$, that is, let
-$\hat{c} = \delta[c/\delta]$, where $[]$ indicates the nearest integer
-function.
-\item Choose a value $\nu$ from the $\textsf{Lap}(\delta/\epsilon)$
+\item Repeatedly maintain a running count $c$ over a time period of length
+$t$.
+\item Choose a bin granularity $\delta_1$, which should be large enough
+to contain the change in a single statistic due to a \emph{repeated}
+action, that is, an action repeated over many
+reporting periods that we wish to hide any single instance of.
+\item Round $c$ to the nearest multiple of $\delta_1$, that is, let
+$\hat{c} = \delta_1[c/\delta_1]$, where $[]$ indicates the nearest integer
+function. Rounding up (as in Kadianakis et al.~\cite{torprop-hsstats}) or
+down can be performed instead for a similar effect on privacy, although
+they introduce a bias that should be corrected
+for~\cite{extrapolation-tor-techreport}.
+\item Choose a value $\nu$ from the $\textsf{Lap}(\delta_2/\epsilon)$
distribution. $\epsilon$ is the privacy
-parameter discussed at the beginning of Sec.~\ref{sec:obfuscation}.
-$\delta$ appears in the Laplace
-parameter because a single action could cause the bin center to change
-by at most $\delta$.
-\item Let the noisy count be $\tilde{c} = \hat{c} + \nu$. Publish
-$\tilde{c}$.
+parameter discussed in Sec.~\ref{sec:obfuscation}.
+$\delta_2$ is the amount by which a single (non-persistent) action that
+we wish to hide could change the bin center.
+\item Let the noisy count be $\tilde{c} = \hat{c} + \nu$.
+\item Wait until all activity contributing to count $c$ should have
+completed, and then publish $\tilde{c}$.
\end{compactenum}
+\paragraph{Example 1:} Kadianakis et al.~\cite{torprop-hsstats} use this
+algorithm to publish \textsf{NumRPCells} over periods of length
+$t = 24$ hours. The waiting period after collection is implicit in the
+length of time until the next extra-info descriptor is published by the
+relay, which happens every 18 hours unless certain relay properties
+change~\cite{dir-spec}. They use $\delta_1 = 1024$, $\delta_2 = 2048$, and
+$\epsilon = 0.3$. This setting of $\delta_1$ hides any repeated transfer
+of up to 1024 cells. Tor cells have up to 509 payload
+bytes~\cite{tor-spec}, and so this is 509 KiB. This setting of $\delta_2$
+hides any group of up to 2048 transferred cells (1018 KiB) (e.g. a
+connection in which a client downloads 1 MB). One way of interpreting the
+privacy from $\epsilon=0.3$ is from a Bayesian
+perspective~\cite{ganta-kdd2008}, where if a priori, for any given set of
+other transfers, the adversary believed a given transfer of up to 2048
+cells did (or didn't) occur, then the statistics can only increase his
+confidence by at most $1-e^{0.3}$ (i.e. $\sim35\%$).
+
+\paragraph{Example 2:} Kadianakis et al.~\cite{torprop-hsstats} use this
+algorithm to publish \textsf{NumHSDirIDs} over periods of length
+$t = 24$ hours, waiting to publish with the relay's next extra-info
+descriptor. They use $\delta_1 = 8$, $\delta_2 = 8$, and
+$\epsilon = 0.3$. These settings for $\delta_1$ and $\delta_2$ hide any
+single or repeated publication of any given group of at most 8 onion
+services (e.g. a set of 8 or fewer related onion addresses that are
+aliases for the same onionsite).
\subsection{Distributions} \label{subsec:distributions}
For many statistics, it would be very helpful to understand the
-distribution of values. For example, such information about descriptor
-fetches could reveal if most hidden services are never used or if
-there are a few hidden services that constitute most HS activity.
+distribution of values. For example, such information about the
+number of cells on a single circuit could reveal if most
+hidden-service traffic is produced by relatively few connections.
+\begin{comment}
Table~\ref{table:dist_stats} lists the statistics for which it could
be useful to release information about a distribution.
\begin{table}
@@ -1399,6 +1570,7 @@ Number of descriptor fetch requests for non-existent descriptor &
\hline
\end{tabular}
\end{table}
+\end{comment}
Following are several potential ways to publish information about a
distribution:
@@ -1413,8 +1585,8 @@ To protect individual privacy when releasing these kinds of data,
we would again like to protect activity over time and also provide
particularly-strong protection for a single activity. This is quite
straightforward to do for publishing
-histograms, simply by applying the techniques that we developed for counts
-to each
+histograms simply by applying the techniques that we have developed for
+counts to each
count in the histogram. Thus we suggest using histograms in this way to
report distribution data, as follows:
\begin{compactenum}
@@ -1423,108 +1595,62 @@ values of the statistic (we use the term ``buckets'' to distinguish
these from bins that will limit the granularity of each bucket). Each
extra bucket will result in a certain additional amount of noise being
added, but including more values in a bucket (i.e. increasing its width)
-reduces its accuracy. Therefore, the number of width of the buckets should
-be balanced while also
+reduces its accuracy. Therefore, the number and width of the buckets
+should be balanced while also
choosing buckets to capture the most useful distinctions
for the statistic under consideration (e.g. deciding between relative and
absolute accuracy).
+\item Over repeated time periods of length $t$, maintain running counts
+$c_i$, where $c_i$ is the count of observed values in the $i$th bucket.
\item For the $i$th bucket, the count $c_i$ of values in that bucket
-should be rounded to a chosen granularity $\delta$:
-$\hat{c_i} = \delta[c_i/\delta]$.
-$\delta$ should be larger than the amount by which a
-single activity could change the bucket count, where again the notion of a
-single activity depends on the context. The bins here serve the same
-purpose of protecting privacy over time that they did when publishing
-counts.
+should be rounded to a chosen granularity $\delta_1$:
+$\hat{c_i} = \delta_1[c_i/\delta_1]$.
+$\delta_1$ should be at least the amount by which a single instance of
+a repeated action that we want to hide could change the bucket count,
+where again the notion
+of a single action depends on the context. The $\delta_1$-sized bins here
+serve the same purpose of protecting privacy over time that they did when
+publishing a single count.
\item Fresh Laplace noise $\nu_i$ with distribution
-$\textsf{Lap}(2\delta/\epsilon)$ should be added to the center of the
-bin of the $i$th bucket. Let the resulting value be
+$\textsf{Lap}(2\delta_2/\epsilon)$ should be added to the center of the
+bin of the $i$th bucket, where $\delta_2$ is at least the amount by which
+a single (non-repeated) action that we want to hide could change the bin
+center. Let the resulting value be
$\tilde{c_i} = \hat{c_i} + \nu_i$. The value $2$ appears in the Laplace
parameter because modifying a single entry in the
histogram can change two values: the value of the bucket it was changed
from and the value of the bucket it was changed to.
-\item Publish each noisy bucket count, $\tilde{c_i}$, $1\le i\le k$.
+\item Wait until the activity contributing to the histogram should have
+completed, and then publish each noisy bucket count,
+$\tilde{c_i}$, $1\le i\le k$.
\end{compactenum}
-\subsection{Statistics aggregation}
-Up to this point it has been suggested that individual Tor relays collect
-and publish these statistics. However, there are many drawbacks to
-collecting all statistics in this manner. The main advantage is that it is
-straightforward to implement. Here we describe some of the problems and
-outline potential solutions.
-
-One problem with publishing statistics in a way that
-is attributable to specific Tor relays is that in some cases it is
-inherently insecure. For example, if each relay reported the number of
-client introduction requests it received
-(Sec.~\ref{subsubsec:num_intros_from_clients}), then an adversary that
-knows the \texttt{.onion} address of an HS (and thus can obtain its
-introduction points) could infer how many client connections the HS
-received, especially if there were few other HSes sharing its IPs. The
-general issue is that for certain types of HS activities different HSes
-or clients will make use of different relays in a way that may be
-known by the adversary. Table~\ref{table:stats_needing_aggregation} lists
-those statistics for which this is an issue.
-\begin{table}
-\center
-\caption{Per-relay statistics at high risk to leak private information}
-\label{table:stats_needing_aggregation}
-\begin{tabular}{|l|l|}
-\hline
-\textbf{Description} & \textbf{Section}\\
-\hline
-Number of descriptor fetch requests &
-\ref{subsubsec:num_desc_fetches}\\
-\hline
-Number of introductions received from clients &
-\ref{subsubsec:num_intros_from_clients}\\
-\hline
-Reasons for terminating established introduction points &
-\ref{subsubsec:reasons_end_ips}\\
-\hline
-Number of discarded client introductions by reason &
-\ref{subsubsec:num_discarded_client_intros}\\
-\hline
-Time from establishing introduction point to tearing down circuit &
-\ref{subsubsec:time_intro_to_teardown}\\
-\hline
-Number of descriptor updates per service &
-\ref{subsubsec:num_decriptor_updates}\\
-\hline
-Time between last and first published descriptor with same identifier &
-\ref{subsubsec:time_first_last_descriptor_update}\\
-\hline
-Number of descriptor fetch requests by service identity &
-\ref{subsubsec:num_descriptor_fetches_per_hs}\\
-\hline
-Number of introductions received by established introduction point &
-\ref{subsubsec:num_intros_per_circ}\\
-\hline
-Time from establishing introduction point to receiving first client
-introduction & \ref{subsubsec:time_ip_est_to_introduce1}\\
-\hline
-\end{tabular}
-\end{table}
-
-Another problem is that with per-relay statistics much more total noise
-needs to be added than is necessary if only network-wide totals are
-ultimately desired. Note that the rounding and noise applied to each
-relay's statistics (Secs. \ref{subsec:distributions} \&
-\ref{subsec:counts})
-would provide equivalent protection if applied to the same statistics for
-the entire network. This would reduce the amount of added noise and
-rounding inaccuracy by a factor of $m$, where $m$ is the number of relays.
-
-There are many possible ways to improve both the security and accuracy
-of Tor statistics via aggregation using well-studied cryptographic
-techniques, including
+\section{Future Work}
+There are many possible ways to expand the hidden-service statistics that
+we can safely gather and to improve their accuracy. One promising option
+is to securely perform aggregation
+such that no single party can learn more than the final aggregate value.
+This would allow collecting statistics that cannot be reported by each
+relay individually, as previously discussed.
+
+In addition, secure aggregation could reduce the amount of noise that
+needs to be added. With per-relay reporting, rounding and noise is
+applied to each relay's statistics when using the algorithms described
+in Sections~\ref{subsec:counts} \& \ref{subsec:distributions}. However,
+the same noise would provide equivalent protection if applied only once
+to the statistics after network-wide aggregation. This would reduce
+the amount of added noise and rounding inaccuracy by a factor of $m$,
+where $m$ is the number of relays.
+
+There are some promising approaches to secure statistics aggregation
+including
\begin{compactitem}
-\item Have the relays run a secure multiparty computation (SMC) protocol
-that produces the desired statistics with any privacy-preserving
-modifications included (e.g. added noise).
-\item Take the approach of PrivEx~\cite{elahi-ccs2014} and use a separate
-set of ``tally servers'' that secret-share statistics and use homomorphic
-encryption to aggregate counts.
+\item Have the relays provide inputs to a secure multiparty computation
+(SMC) protocol that produces the desired statistics with any
+privacy-preserving modifications included (e.g. added
+noise)~\cite{privda-acsac2014}.
+\item Use a customized aggregation protocol, such at that of Elahi et
+al.~\cite{elahi-ccs2014}
\item Anonymize statistics reports, either via Tor itself or via a shuffle
run over a separate set of servers. For statistics that could be
attributable to a small set of relays by their values alone (e.g. a large
@@ -1532,8 +1658,8 @@ number of rendezvous data cells is likely to come from a small set of
large relays), break up the values into minimum amounts.
\end{compactitem}
-Adoption of these approaches faces a couple of main challenges. Those
-issues and our suggestions for handling them are
+Adoption of these approaches faces some challenges specific to their use
+in Tor. Those issues and some suggestions for handling them are
\begin{compactenum}
\item Implementation difficulty: Making use of sophisticated
cryptographic tools, such as non-standard cryptosystems or novel
@@ -1551,20 +1677,18 @@ are not excessively influenced by outliers. For example, we could use a
median instead of a mean as the basis for a sum.
\end{compactenum}
-
-\section{Recommendation}
-\label{sec:recommendation}
-
-\section*{Next steps in writing this report}
-
-\begin{itemize}
-\item Add results from private testing network to list of statistics.
-\item Figure out which of the failure statistics actually make sense, by
-looking at the code.
-\item Decide how to evaluate helpfulness and harmfulness of statistics, in
-an objective way, ideally using the stated evaluation criteria.
-\end{itemize}
+\section{Conclusion}
+Statistics about the usage and performance of Tor hidden services could
+serve many valuable purposes. However, doing so in a privacy-preserving
+manner is challenging. We have described the kinds of hidden-service
+statistics that Tor relays are in a position to collect and the main
+criteria
+on which collection procedures should be evaluated. In particular, we have
+provided a list of privacy goals for Tor hidden services that goes beyond
+simply hiding the server's location. Furthermore, we explored how
+statistics might be published in a privacy-preserving way, and we gave two
+specific algorithms for publishing broadly useful count and distribution
+statistics.
\bibliography{hidden-service-stats}
-\end{document}
-
+\end{document}
\ No newline at end of file
diff --git a/2015/hidden-service-stats/protocol.odg b/2015/hidden-service-stats/protocol.odg
index 729c8eb..7a26776 100644
Binary files a/2015/hidden-service-stats/protocol.odg and b/2015/hidden-service-stats/protocol.odg differ