commit e8d6663c6ea45deb4e8e3ceabc0be053480e78a3 Author: iwakeh iwakeh@torproject.org Date: Thu Apr 27 20:51:00 2017 +0100
Sources for tr-2017-04-001. --- ...ts-bridge-combined-aq-2016-01-01-2017-01-01.png | Bin 0 -> 22445 bytes ...ts-bridge-combined-va-2016-01-01-2017-01-01.png | Bin 0 -> 46376 bytes ...ats-bridge-country-aq-2016-01-01-2017-01-01.png | Bin 0 -> 10695 bytes ...ats-bridge-country-va-2016-01-01-2017-01-01.png | Bin 0 -> 24626 bytes ...-relay-country-aq-2016-01-01-2017-01-01-off.png | Bin 0 -> 22730 bytes ...-relay-country-va-2016-01-01-2017-01-01-off.png | Bin 0 -> 23626 bytes 2017/metrics-privacy/privacy-in-memory.tex | 1560 ++++++++++++++++++++ 2017/metrics-privacy/references.bib | 153 ++ 2017/metrics-privacy/tortechrep.cls | 1 + 9 files changed, 1714 insertions(+)
diff --git a/2017/metrics-privacy/images/userstats-bridge-combined-aq-2016-01-01-2017-01-01.png b/2017/metrics-privacy/images/userstats-bridge-combined-aq-2016-01-01-2017-01-01.png new file mode 100644 index 0000000..a01faf7 Binary files /dev/null and b/2017/metrics-privacy/images/userstats-bridge-combined-aq-2016-01-01-2017-01-01.png differ diff --git a/2017/metrics-privacy/images/userstats-bridge-combined-va-2016-01-01-2017-01-01.png b/2017/metrics-privacy/images/userstats-bridge-combined-va-2016-01-01-2017-01-01.png new file mode 100644 index 0000000..d61cf9c Binary files /dev/null and b/2017/metrics-privacy/images/userstats-bridge-combined-va-2016-01-01-2017-01-01.png differ diff --git a/2017/metrics-privacy/images/userstats-bridge-country-aq-2016-01-01-2017-01-01.png b/2017/metrics-privacy/images/userstats-bridge-country-aq-2016-01-01-2017-01-01.png new file mode 100644 index 0000000..572482b Binary files /dev/null and b/2017/metrics-privacy/images/userstats-bridge-country-aq-2016-01-01-2017-01-01.png differ diff --git a/2017/metrics-privacy/images/userstats-bridge-country-va-2016-01-01-2017-01-01.png b/2017/metrics-privacy/images/userstats-bridge-country-va-2016-01-01-2017-01-01.png new file mode 100644 index 0000000..b3eea7e Binary files /dev/null and b/2017/metrics-privacy/images/userstats-bridge-country-va-2016-01-01-2017-01-01.png differ diff --git a/2017/metrics-privacy/images/userstats-relay-country-aq-2016-01-01-2017-01-01-off.png b/2017/metrics-privacy/images/userstats-relay-country-aq-2016-01-01-2017-01-01-off.png new file mode 100644 index 0000000..b808134 Binary files /dev/null and b/2017/metrics-privacy/images/userstats-relay-country-aq-2016-01-01-2017-01-01-off.png differ diff --git a/2017/metrics-privacy/images/userstats-relay-country-va-2016-01-01-2017-01-01-off.png b/2017/metrics-privacy/images/userstats-relay-country-va-2016-01-01-2017-01-01-off.png new file mode 100644 index 0000000..a6e966b Binary files /dev/null and 
b/2017/metrics-privacy/images/userstats-relay-country-va-2016-01-01-2017-01-01-off.png differ diff --git a/2017/metrics-privacy/privacy-in-memory.tex b/2017/metrics-privacy/privacy-in-memory.tex new file mode 100644 index 0000000..72cdf2a --- /dev/null +++ b/2017/metrics-privacy/privacy-in-memory.tex @@ -0,0 +1,1560 @@ +\documentclass{tortechrep} +\usepackage{url} +\usepackage{amsthm} +\usepackage{thmtools} +\usepackage{mathtools} +\usepackage{hyperref} +\usepackage{comment} +\usepackage{fancyvrb} +\usepackage{fancyhdr} +\usepackage[Q=yes]{examplep} +\usepackage{marginnote} +\usepackage[light]{draftcopy} +\usepackage{graphicx} +\usepackage{caption} +\usepackage{subcaption} + +%%% useful definitions and settings +\setcounter{tocdepth}{1} + +%%% no break in verbatim/code. +\newcommand{\Qx}[1]{\mbox{\Q{#1}}} + +%%% + +\begin{document} +%%%% Settings inside document-env. +% general setting for Verbatim environment +\fvset{frame=leftline,numbers=left,numbersep=2pt,gobble=4,stepnumber=1} +\VerbatimFootnotes +\graphicspath{{./images/}} + +%%%% + +\title{Privacy analysis of Tor's in-memory statistics} +\author{Karin Herm\\The Tor Project\\ iwakeh$\bowtie$torproject.org}% avoiding spam + +\reportid{2017-04-001} +\date{April 2017} +\maketitle +\tableofcontents + +\begin{abstract} + This report analyzes which possibly sensitive, potentially + personally identifying data is stored in memory of Tor relays and + bridges or reported to the directory authorities and makes + suggestions to reduce the collection and temporary storage of such + data. 
% +\end{abstract} +\pagebreak +\section{Introduction}\label{intro} % +Tor network metrics and the underlying data have been available for +many years by now and have proven to be a valuable source for analyzing and +improving the network as well as for censorship detection.% +\footnote{Network analysis estimation of cell traffic, estimation of + onion services induced traffic and user count estimation as well as + censorship detection \cite{tr200908001,tr201210001,tr201504001} and + all data and visualizations on + \href{https://metrics.torproject.org}{MetricsWeb}.}% + +Tor Metrics' data collecting and processing chain handles various types +of data ranging from raw data as measured by running Tor servers% +\footnote{% + Here and in the following {\em Tor server} refers to relays and + bridges and other parts of the Tor network fulfilling a server + role. % + The term {\em client} is used for Tor instances simply connecting to + the network. % + Tor servers report different statistics depending on their + configuration. % + A bridge, a normal relay, an entry guard relay, etc., they all have + access to different data and report different statistics. % +} % +to preprocessed and aggregated data ready for further statistical work +and as a basis for visualizations. % + +This report aims at improving privacy protection {\em before} any data +is reported. % +Of primary interest is the identification of possibly harmful data +that is not a necessary part of the running Tor server, e.g.~data held +in-memory or written to files for providing network/router metrics +reports or data written to logs for informative purposes. % + +Section \ref{overview} provides an overview of the Tor Metrics system, +its privacy goals, and a more detailed explanation of the metrics +collection process as well as the associated data. % +In-memory data with possibly negative impact to client privacy is +identified in section \ref{privacy-im}. 
% +Section \ref{mitigate} surveys several measures to reduce privacy +impact. % +Building hereon section \ref{detail} details the changes necessary for +an implementation of the favored +solution. % +The suggestions made in this report for reducing privacy impact go +beyond the scope of a single project and some will need further work +to reach the implementation stage. % +The summary in section \ref{summary} takes account of this and also +sketches possible next steps. % + +\section{Background} \label{overview}% + +This report assumes the reader to be familiar with the Tor software +and Tor network and to some extent with the functionality and data +offered by Tor Metrics. % + +The following sections first present an overview of the data +processing chain in Tor Metrics and summarize the privacy goals behind +Tor Metrics data collection. % +Sub-section~\ref{mc} provides a description of the processes of +measuring and counting implemented in Tor servers for metrics purposes +and shows where in the code the processing of the data takes place. % + +\subsection{Tor Metrics} \label{overview-metrics}% +Tor servers running as relays or bridges publish their presence and +capabilities to the directory authorities in form of simple +files, the descriptors. % + +The current system of collecting data about the Tor network is built +on descriptors, which are mainly produced and distributed for the +operation of the network except for extra-info descriptors, which also +provide metrics about the network. % +This way of measuring the network generates minimal overhead for the +network's operation and the data produced is freely available% +\footnote{See \cite[dir-spec.txt]{torspec} about how to retrieve + descriptors.} % +to anyone who cares to collect it. % +The descriptors are machine and human readable and the knowledge +required to make use of them is published in Tor's +specification~\cite{torspec}. 
% + +All descriptors available at a fixed point in time give a good picture +of the current status of the network. % +In order to collect these {\em pictures} and combine them to a {\em + history} Tor Metrics introduced CollecTor,% +\footnote{% + The main instance is \url{https://collector.torproject.org}. % + Since $2016$ there are also several mirror instances sharing their + data to gather even more of the ephemeral descriptors and other Tor + network related data.} % +which gathers and archives the \emph{raw facts} in form of +descriptors% +\footnote{Actually, some data, e.g., bridge descriptors, are + pre-processed in order to remove possibly privacy critical + information, but for the current report they can still be considered + raw data.} % +about the Tor network. % + +A descriptor document only carries information about a certain point +in time, more exactly a time interval, as for example, a measurement +interval for extra-info descriptors or the consensus, which applies to +the entire network for its valid time interval. % +Tor Metrics also provides machine and human centered services that +create aggregated and enriched data from the descriptor collection. % +The central services by Tor Metrics building on CollecTor are Onionoo% +\footnote{\url{https://onionoo.torproject.org}} % +and MetricsWeb.% +\footnote{\url{https://metrics.torproject.org}} % +Onionoo aggregates the descriptors available at CollecTor and provides +current and historic data about {\em currently running} Tor servers. 
% +Based on Onionoo there is an ecosystem of clients building +visualizations and other results helping users to find the piece of +information they need.% +\footnote{See \url{https://metrics.torproject.org/development.html} + for development tools and + \url{https://metrics.torproject.org/operation.html} for user + centered services.} % + +MetricsWeb uses CollecTor's data for providing {\em the history} of +the entire Tor network in form of aggregated and enriched data sets, +which serve as the basis for the numerous visualizations on MetricsWeb +and can be freely downloaded for further use. % + +\subsection{Privacy goals} \label{overview-privacy}% + +The goals of a privacy and anonymity network like Tor are not easily +combined with extensive data gathering, but at the same time data is +needed for monitoring and improving the network and detecting possible +censorship events or attacks against the network. % +Handling of safety and privacy concerns regarding data collection by Tor Metrics +is guided by the Safety Board's guidelines.\footnote{% + See + \url{https://research.torproject.org/safetyboard.html\#guidelines}. +} % +Safety and privacy assessment is usually done informally by discussion +during the proposal process\footnote{% + The proposal process is defined in + \cite[proposals/001-process.txt]{torspec} and security and anonymity + implications should be part of any proposal (cf. \cite[line 114 of + proposals/001-process.txt]{torspec}). 
} % +for changes to the Tor source, and/or supported by closer analysis in +form of Tor Tech Reports; for example, the introduction of onion +service statistics was backed by a Tor Tech Report \cite{tr201504001}, +which substantiated the privacy standards implemented and the +statistical accuracy of the data to be collected.% +\footnote{See also the related blog post + \url{https://blog.torproject.org/blog/some-statistics-about-onions}.} % + +It is out of scope of this report and will be future work to provide +such an assessment for both privacy and statistical accuracy +throughout the data-verse of Tor Metrics. % +Until such background is available security and privacy assessment +will be based on the guidelines, best practices, and heuristic +arguments. % +The current report focuses on in-memory data and considers the +scenario that an attacker gains access to in-memory storage. % +Thus, any run-time data for normal processing as well as the in- and +outgoing traffic are also available to the intruder. % +Hence, at most events/data that occurred and were finalized {\em + before} the breach can potentially be protected.% +\footnote{ % + The goal that an adversary cannot learn the state of the measurement + before time of compromise, is usually referred to as \emph{forward + privacy.} } % +Another goal is to reduce reporting of potentially privacy problematic +data. % + +\subsection{Measuring and counting} \label{mc} % + +Tor instances keep data in-memory and on disk for normal operation, +for facilitation of local administration of the Tor server, and for +reporting metrics data. % +The latter is mainly accomplished by uploading extra-info descriptors +to authorities. % +For a quick orientation about the structure of these descriptors two +examples of extra-info descriptors can be found in the appendix on +page~\pageref{descriptors}. 
% + +\subsubsection{Server internal processing} \label{server-proc} + +Servers write their measurements and counting results to separate +files, the ``stats files'', which are located in sub-folder \Qx{stats} +of a configurable path. % +These files are parsed and their content is assembled to form an +extra-info descriptor, which will be uploaded to an authority. % +The upload of extra-info descriptors happens together with the upload +of the server descriptor. % + +The callback \Qx{check_descriptor} runs every minute, checks, if +descriptors have to be uploaded, if necessary, it creates the server +descriptor and the extra-info descriptor, which is populated from +previously prepared stats files: dirreq-stats, hidserv-stats, +entry-stats, buffer-stats, exit-stats, conn-stats. % + +Writing of stats files is triggered by two callbacks, +\Qx{write_stats_file_callback}% +\footnote{As defined in \cite[\Qx{src/or/main.c:1702-1747}]{torgit}.} +and \Qx{record_bridge_stats_callback}.% +\footnote{As defined in + \cite[\Qx{src/or/main.c:1752-1777}]{torgit}.} % +These callbacks are registered to be run regularly after their first +start after one second.% +\footnote{Scheduled by the code in + \cite[\Qx{src/or/main.c:1265-1275}]{torgit} and the callback's + return values.} % +Afterwards, the corresponding tasks are run in their own intervals, +i.e., after running for the first time the next interval is currently +limited to maximal one hour and the actual interval will be the +smallest demanded by the respective sub-tasks. 
% +Given that all configuration options for statistics are enabled the +following functions are called from \Qx{write_stats_file_callback}: % +\begin{description} +\item [\Qx{rep_hist_buffer_stats_write}:] statistics about cell + processing for monitoring relay performance (cf.~\ref{circ}) +\item [\Qx{geoip_dirreq_stats_write}:] directory statistics + (cf.~\ref{dirreq}) +\item [\Qx{geoip_entry_stats_write}:] entry contact statistics + (cf.~\ref{clienthist}) +\item [\Qx{rep_hist_hs_stats_write}:] onion services statistics + (cf.~\ref{hidserv}) +\item [\Qx{rep_hist_exit_stats_write}:] exit traffic statistics + (cf.~\ref{exit}) +\item [\Qx{rep_hist_conn_stats_write}:] traffic statistics between + relays (cf.~\ref{conn}). +\item [\Qx{rep_hist_desc_stats_write}:] statistics about served + descriptors (only for bridge authorities). +\end{description} +The \Qx{record_bridge_stats_callback} only triggers one function: +\Qx{geoip_bridge_stats_write}, which writes bridge connection +statistics (see \ref{clienthist}). % + +All of these functions verify, if their individual measurement +interval has elapsed. % +If so, they assemble their respective data, reset the data collecting +structures, and write the data to files in the configured statistics +directory. % +This process is similar for all stats-files, but not identical. % +Some of the concerned functions handle the reset of the measurement +structures in-memory immediately after assembling the data to be +written and others only reset after a successful write. 
% +For example, \Qx{geoip_entry_stats_write} only resets the data +structure when writing succeeds, which can cause data retention for +more than the intended $24$ hour interval and +\Qx{geoip_bridge_stats_write} doesn't remove client IPs from memory +until the {\em next} interval's statistics are going to be written, +which leads to a usual retention time of up to 48 hours.% +\footnote{Some of these differences were introduced on purpose, e.g., + the 48 hour interval seems to be due to a technical choice for + bridge metrics, as it is already mentioned in the introduction of + the extra-info proposal + cf.~\cite[proposals/166-statistics-extra-info-docs.txt]{torspec}. % +} % +In \Qx{geoip_entry_stats_write} the removal of older client data is +only performed, if the interval for the next reporting is reached,% +\footnote{See \cite[\Qx{src/or/geoip.c}:1627-1654]{torgit}.} % +and \Qx{geoip_remove_old_clients} removes clients older than the +current report interval of 24 hours, which is the argument +\Qx{start_of_dirreq_stats_interval} and then removes the data after +computing and writing statistics.% +\footnote{ \cite[\Qx{src/or/geoip.c}:1648]{torgit}} % +Thus, if writing fails +there could be up to 48 hours of client data available in-memory.% +\footnote{Cf.~\cite[\Qx{src/or/geoip.c}:1644,1645]{torgit}} % +For bridge clients' IP connections the retention time is usually more +than 24 hours, because the old clients are removed% +\footnote{In function \Qx{geoip_bridge_stats_write} + \cite[\Qx{src/or/geoip.c}:1492-1530]{torgit}.} % +{\em before} statistics computation and here only those from {\em + before} the current reporting interval. % + +\subsection{Data structures} + +This section describes in-memory storage structures for all data +collected for metrics purposes and explains how these structures are +maintained during a measurement interval. 
% + +The following assumes some familiarity with the data fields of extra-info +descriptors.% +\footnote{% + Two example descriptors are printed in appendix \ref{descriptors}. % +} % +The descriptions are grouped by the extra-info descriptor target field +and exclude fields that are not in the focus of this analysis, +e.g. identity, digests, statistic interval end times. % + +\subsubsection{Directory requests counts}\label{dirreq} +In order to derive usage by country Tor servers keep track of the +originating country of directory requests. % +The resulting data is written to extra-info field \Qx{dirreq-v3-reqs} +as a list of mappings from two-letter country codes% +\footnote{ GeoIp codes usually refer to countries, but in some cases + to other kinds of jurisdiction. % + For the topic treated in this report it does no harm to simply refer + to countries in all cases. } % +to the number of requests for v3 network statuses from that country, +rounded up to the nearest multiple of 8. % + +During run-time the counts are stored in a list of +\mbox{\Qx{geoip_country_t}} structures% +\footnote{ As defined in \cite[\Qx{src/or/geoip.c:55-59}]{torgit}. +} % +without binning or obfuscation. % +The count \Qx{n_v3_ns_requests} is increased when a client is +recorded.% +\footnote{\label{noteclient}% + This happens by calling function \Qx{geoip_note_client_seen} in + \cite[\Qx{src/or/geoip.c}:560-613]{torgit}.} % +The map of \Qx{geoip_country_t} structures is reset% +\footnote{See function \Qx{geoip_dirreq_stats_write} in + \cite[\Qx{src/or/geoip.c}:1284-1312]{torgit}. } % +after writing the derived values to the stats file. % + +\subsubsection{Connecting client counts}\label{clienthist} + +Connecting clients use the Tor network and their count is tracked in +regard to originating country and in case of bridges also the +transport used and the IP version. 
% +The resulting data is written to the fields \Qx{bridge-ips}, +\Qx{bridge-ip-transports} and \Qx{bridge-ip-versions} as well as +\Qx{dirreq-v3-ips} and \Qx{entry-ips}, of which the latter two are +currently not used in Tor Metrics. % + +In order to avoid repeated counting of the same client IP connecting +the client IPs are stored in-memory in maps of \Qx{clientmap_entry_t}% +\footnote{Defined in \cite[\Qx{src/or/geoip.c}:475-491]{torgit}.} % +without binning or obfuscation. % + +The data reported in \Qx{bridge-ips} is used for all MetricsWeb graphs +about bridge user counts and together with \Qx{bridge-ip-transports}, +which is a list of mappings from pluggable transport names to the +number of unique IP addresses that have connected using that pluggable +transport, for MetricsWeb's +\href{https://metrics.torproject.org/userstats-bridge-transport.html}% +{\emph{Bridge users by transport}} and +\href{https://metrics.torproject.org/userstats-bridge-combined.html}% +{\emph{Bridge users by country and transports}} graphs. % +The values from \Qx{bridge-ip-versions}, which is a list of mappings +from protocol family to the number of unique IP addresses that have +connected to the bridge, are +used for MetricsWeb's +\href{https://metrics.torproject.org/userstats-bridge-version.html}% +{\emph{Bridge users by IP version}} graph. % + +All the values above are reported rounded to the next multiple of eight. % +The counts are taken from the clientmap, binned, and written to the +file \Qx{stats/bridge-stats}. % +All countries with at least one count are reported. % + + +\subsubsection{Directory response counts} +Another field used to derive client contacts is \Qx{dirreq-v3-resp}, +from which the success count of responses made by the Tor server is +currently used to determine the client count of bridges. 
% +Field \Qx{dirreq-v3-resp} reports a list of mappings from response +statuses to the number of requests for v3 network statuses that were +answered with that response status, rounded up to the nearest multiple +of eight. % +All response statuses with at least one response are reported. % + +Counts by response status are stored in a simple array without +obfuscation\footnote{% + Array definition \cite[\Qx{src/or/geoip.c:640}]{torgit} and array + processing \cite[\Qx{src/or/geoip.c:644-656}]{torgit}.} % +and the binned values are computed just before writing statistics to +file \Qx{stats/dirreq-stats}, the array is reset after writing the +statistics file successfully.% +\footnote{\label{reset-dirreq}% + In function \Qx{geoip_reset_dirreq_stats} + cf.~\cite[\Qx{src/or/geoip.c:1179-1208}]{torgit}.} + +\subsubsection{Server bandwidth metrics}\label{bw} +The fields \Qx{write-history} and \Qx{read-history} declare how much +bandwidth the Tor server has used recently. Usage is divided into +intervals of currently four hours. % +The end of the most recent interval of the measurements is given. % +Values are the number of bytes used in the last intervals, ordered +from oldest to newest. % +Stored in struct \Qx{bw_array_t} using circular arrays for maxima and +totals.% +\footnote{Cf.~\cite[\Qx{src/or/rephist.c:1209-1236}]{torgit}} % + +Similarly the extra-info descriptor fields \Qx{dirreq-write-history} +and \Qx{dirreq-read-history}% +\footnote{Assembled in \cite[\Qx{src/or/rephist.c:1497-1550}]{torgit}} % +declare how much bandwidth the Tor server has spent on answering +directory requests. % +These values are cut at the value of the configured max bandwidth for +reporting. They are also stored in struct \Qx{bw_array_t} (as +\Qx{write-history} and \Qx{read-history}). 
+ +All four \Qx{*-history} values are stored without obfuscation or +binning and are only cutoff and rounded down to 1KB before they are +reported.% +\footnote{The in-memory values are not changed, cf.~function + \Qx{rep_hist_fill_bandwidth_history} in + \cite[\Qx{src/or/rephist.c:1448-1491}]{torgit}.} % + +\subsubsection{Directory download metrics} +\Qx{dirreq-v3-direct-dl} and \Qx{dirreq-v3-tunneled-dl} provide +statistics about possible failures in the download process of v3 +network statuses. % +The list currently contains values for \Qx{complete}, \Qx{timeout}, +and \Qx{running}. Values are stored in a map of +\Qx{dirreq_map_entry_t} types.% +\footnote{\cite[\Qx{src/or/geoip.c:700-714}]{torgit} } % + +The values are rounded to the next multiple of 4 before printing +statistics and only printed when the rounded value of \Qx{complete} is +bigger than 16. After writing stats the values are +cleared.\footnote{% + Cf.~footnote \ref{reset-dirreq}, page \pageref{reset-dirreq}. } + +\subsubsection{Circuit metrics}\label{circ} + +\Qx{cell-*} Data is derived from circuits% +\footnote{See \Qx{circuit_t} in + \cite[\Qx{src/or/or.h}:2943-3084]{torgit}.} % +held in-memory for normal operation. The values are derived at report +time and statistics for disposed circuits are stored at the time of +their disposal. % +After assembling the data, which will be written to the +\Qx{buffer-stats} file, the data structure used is reset. % + +\subsubsection{Onion services metrics}\label{hidserv} + +Onion services metrics are reported mainly in two fields: +\Qx{hidserv-rend-relayed-cells} and \Qx{hidserv-dir-onions-seen}. % +\Qx{hidserv-rend-relayed-cells} reports the approximate number of +relay cells seen in either direction on a circuit after receiving and +successfully processing a rendezvous cell. 
% +The original measurement value is obfuscated only for reporting% +\footnote{\label{dirspec212}% + Cf.~\cite[section 2.1.2 of dir-spec.txt]{torspec}} % +and stored in-memory as part of the \Qx{hs_stat_t} structure without +binning or obfuscation.% +\footnote{Cf.~\cite[\Qx{src/or/rephist.c}:3002-3009]{torgit}} % +\Qx{hidserv-dir-onions-seen} reports the approximate number of unique +onion-service identities seen in descriptors published to and accepted +by this onion-service directory. % +The original measurement value is obfuscated only for reporting,% +\footnote{See footnote \ref{dirspec212}, page \pageref{dirspec212}.} % +whereas this value is derived from the \Qx{hs_stat_t} structure, +which contains a clear list of digests of the onion services' public +keys. % +The in-memory struct is reset after creating the report string for the +statistics file \Qx{hidserv-stats}.% +\footnote{% + Reported metrics of onion services are binned and obfuscated using + the Laplace distribution. The exact parameters are defined in + \cite[\Qx{src/or/rephist.c}:3112-3133]{torgit}.} % + +\subsubsection{Exit traffic metrics}\label{exit} +The fields \Qx{exit-streams-opened}, \Qx{exit-kibibytes-written}, +\Qx{exit-kibibytes-read} contain information about exit traffic. % +Data for all three fields are kept in arrays.% +\footnote{Cf.~\cite[\Qx{src/or/rephist.c}:2067-2072]{torgit}} % +The exact values for all ports are stored in-memory. 
% +The reported number of opened exit streams to a port is rounded up to +the nearest multiple of four, the other two values are rounded to the +next $1024$ bytes.% +\footnote{% + The calculation of the reported values is performed in + \Qx{rep_hist_format_exit_stats} + \cite[\Qx{src/or/rephist.c}:2120-2269]{torgit} } % +All in-memory counters are erased after computing the metrics.% +\footnote{ Cf.~\cite[\Qx{src/or/rephist.c:2291}]{torgit}.} % + +\subsubsection{Connection metrics} \label{conn} % +The \Qx{conn-bi-direct} line is filled from simple counters.% +\footnote{Cf.~function \Qx{rep_hist_format_conn_stats} + \cite[\Qx{src/or/rephist.c:2903-2922}]{torgit}. } % +The data reported is used for network and relay related statistics, +which are provided by MetricsWeb as one of the +\href{https://metrics.torproject.org/connbidirect.html}% +{performance related graphs}. % +The counters are reset immediately after statistics computation +independent of write success.% +\footnote{Cf.~function \Qx{rep_hist_conn_stats_write} line $2942$ + \cite[\Qx{src/or/rephist.c:2928-2952}]{torgit}. } % + +\subsubsection{Unused extra-info descriptor fields} \label{unused} % + +The data of the following extra-info descriptor fields are currently +not used anywhere in Tor Metrics: % +\begin{itemize} +\item all \Qx{cell-*} fields, +\item all \Qx{exit-*} fields, +\item \Qx{dirreq-v3-direct-dl} and \Qx{dirreq-v3-tunneled-dl}. +\end{itemize} + +It might be a premature decision to simply stop reporting these +unused values in extra-info descriptors, because the reason for not +using them could be lack of awareness that they are reported or a lack +of resources to put them to use. % +For example, the values from exit-node related fields, i.e., +\Qx{exit-*}, could be used to address questions related to exit data, +which are asked in research (e.g.~in \cite{privcount} cf.~\ref{priv}), +and to introduce new statistics and graphs in MetricsWeb as well as +making aggregate data sets available. 
% +On the other hand, concerns were raised that providing the +\Qx{exit-*} statistics would enable attacks that could uniquely +identify the applications used by clients or for fingerprinting unusual +ports etc.% +\footnote{These were brought to Tor Metrics' attention by Rob Jansen + who addressed the topic in + \href{https://lists.torproject.org/pipermail/metrics-team/2016-January/000057.html}% + {Tor Metrics' mailing list}: % + ``Tor is classifying its traffic into ports, which could uniquely + identify the application being used by the client. They also track + bandwidth usage per port (and per exit); again, this is bad for + those using a random or unique looking ports (that a given exit does + not see very often) because it could be used to create a + fingerprint. Intersection attacks become easier with this + information.'' % + + \Qx{cell-*} statistics are perceived as less critical, but still: % + ``This provides queue timings and number of cells being processed at + a relay. The number of cells can be used to compute bandwidth of + circuits. It may be possible to launch some attacks that create + several circuits with the intent of moving which decile buckets some + legitimate circuits get placed into, but this is less worrisome of + an attack than the others.'' % +} % +In general, ending the collection of currently unused data should be +considered carefully and not hastened. % +The future assessment of all Metrics' data will be the right project +to address the question of whether to keep or drop the collection of +currently unused metrics. % + +\subsubsection{Other data} +This section concentrates on data gathered or written for other +purposes than filling an extra-info descriptor. % + +\subsubsection*{Stats Heartbeat} + +The function \Qx{log_heartbeat}% +\footnote{\cite[src/or/status.c:91-165]{torgit}} performs some checks +to determine the state of the running relay/bridge, but also logs some +statistics about client connections. 
% +\Qx{log_heartbeat} is one of the periodic event callbacks.% +\footnote{\cite[\Qx{src/or/main.c:1193-1220}]{torgit}} % +Function \Qx{format_client_stats_heartbeat}% +\footnote{\cite[\Qx{src/or/geoip.c:1457-1488}]{torgit}} computes the +exact number of different client connections for the last six hours +using \Qx{client_history} unless turned off or set to a different +interval in property \Qx{HeartbeatPeriod}. % +In addition, the number of bytes written and read by the Tor server +process is logged.% +\footnote{Cf.~\cite[\Qx{src/or/main.c:159-162}]{torgit}.} % + +\subsubsection*{Logging}\label{debuglog} +The debug level logs client data in addition to operational data. +\begin{description} +\item[\Qx{geoip_note_client_seen}] logs the client seen with the + transport used in debug mode.% + \footnote{See line 582 in \Qx{geoip_note_client_seen} (cf.~footnote + \ref{noteclient}, page \pageref{noteclient}).} +\item[\Qx{geoip_get_transport_history}] logs the true total number of + clients and the true numbers for each transport type in debug + level.\footnote{Cf.~\cite[\Qx{src/or/geoip.c:863,885,900}]{torgit}.} +\item[\Qx{rep_hist_note_exit_bytes}] logs for each port the true number + of bytes read and written in debug mode.% + \footnote{Cf.~\cite[\Qx{src/or/rephist.c:2313}]{torgit}.} +\item[\Qx{rep_hist_note_exit_stream_opened}] logs the port to which an + exit stream was opened in debug mode.% + \footnote{Cf.~\cite[\Qx{src/or/rephist.c:2325}]{torgit}.} +\end{description} + + +\section{Possible privacy issues} \label{privacy-im}% + +Tor servers configured to keep statistics and report extra-info +descriptors% +\footnote{Reporting of extra-info descriptors can be turned off or + limited via configuration. % + It is assumed that reporting and logging options are enabled, i.e., + Tor server options like \Qx{BridgeRecordUsageByCountry}, the various + \Qx{*Statistics} etc.~are set to $1$.} % +have a reporting interval of 24 hours. 
% +The following types of data are held in-memory up to this interval or +even longer depending on the type of data and time of collection. % +\begin{itemize} +\item Client IPs from various types of contacts to a server, i.e., + contacts to bridges, to entry relays, to directory + mirrors. % +\item Public key digests of onion services and cell counts + (cf.~\ref{hidserv}); +\item bandwidth used generally and bandwidth consumed for serving + directories (cf.~\ref{bw}); +\item exit traffic stream count as well as exit bytes written and read + (cf.~\ref{exit}). +\end{itemize} + +The most critical data in the above list are client IPs and related +information.% +\footnote{\label{jansen}% + These were also mentioned as most critical by Rob Jansen in + his mail to + \href{https://lists.torproject.org/pipermail/metrics-team/2016-January/000057.html... + {Tor Metrics' mailing list}: % + + ``[unique ips per country code] + *-ips (there are many of these, e.g. "entry-ips") + Usually this involves storing individual user IP addresses in + memory (in order to track uniqueness) over some period of time + (usually 24 hours), sometimes for longer than the user would have + otherwise been known to Tor (if a user's session is 1 hour, Tor + could remember the IP for at most 23 additional hours). This is + reported, e.g., per entry; there are many cases in the data where it + is very likely that only one user is connecting to a guard from a + given country (because it is rounded up to 8). Users in small + countries have the greatest risk (intersection attacks become really + easy).'' % +} % + +The following extra-info fields depend on code and in-memory +structures used for storing the client IPs: % +\begin{itemize} +\item Provided by bridges: + \begin{itemize} + \item unique client count by country of origin for every contact + in field \Qx{bridge-ips}, + \item IP version in \Qx{bridge-ip-versions}, and + \item transport used in \Qx{bridge-ip-transports}. 
+ \end{itemize} +\item Relays and bridges report: % + \begin{itemize} + \item unique client count by country of origin for directory + requests \Qx{dirreq-v3-ips} for successful responses. + \end{itemize} +\item Entry guards report \Qx{entry-ips}, i.e., the unique client count by + country of origin for every contact. % +\end{itemize} + +Some of these fields, namely \Qx{dirreq-v3-ips} and \Qx{entry-ips}, +are currently not used further up in the Tor Metrics data processing +chain, but others support vital client statistics about the Tor +network.% +\footnote{% + The bridge client count estimates are built on \Qx{bridge-ips}, + \Qx{bridge-ip-versions}, \Qx{bridge-ip-transports}.} % +Section \ref{mitigate} explores the options for keeping these +statistics and reducing or even avoiding the in-memory storage of +lists of IP addresses of Tor clients. % + +The client data sets of MetricsWeb and the visualizations based on +them occasionally cause questions about privacy implications of small +client counts per country or per country and transport. % +Section \ref{small} gives some examples and provides information about +client counts per country and other parameters. % +These concerns are raised for tables and graphs at the aggregated data +level, but the underlying data is tightly connected to the IP +addresses collected in-memory. % +Thus, it makes sense to also address this privacy issue in the current +report, which is done in section \ref{obfuscation}. % + +\subsection{Small client counts} \label{small}% +Small countries usually have very tiny Tor client counts; examples for +Antarctica and Vatican City are shown in figures \ref{antarctica} (page +\pageref{antarctica}) and \ref{vatican} (page \pageref{vatican}). 
% + +\begin{figure}[!h] + \centering + \begin{subfigure}[b]{0.45\textwidth} + \centering + \includegraphics[width=\textwidth]{userstats-relay-country-aq-2016-01-01-2017-01-01-off.png} + \caption{Users connecting to relays.} + \label{antarctica-relay} + \end{subfigure} + % + \begin{subfigure}[b]{0.45\textwidth} + \centering + \includegraphics[width=\textwidth]{userstats-bridge-country-aq-2016-01-01-2017-01-01.png} + \caption{User(s) connecting to bridge(s).} + \label{antarctica-bridge} + \end{subfigure} + % + \begin{subfigure}[b]{0.45\textwidth} + \centering + \includegraphics[width=\textwidth]{userstats-bridge-combined-aq-2016-01-01-2017-01-01.png} + \caption{User by transport.} + \label{antarctica-combi} + \end{subfigure} + \caption{Antarctica Tor usage 2016. MetricsWeb is the source of + all graphs (see table \ref{urls}). } + \label{antarctica} +\end{figure} + +\begin{figure}[!h] + \centering + \begin{subfigure}[b]{0.45\textwidth} + \centering + \includegraphics[width=\textwidth]{userstats-relay-country-va-2016-01-01-2017-01-01-off.png} + \caption{Users connecting to relays.} + \label{vatican-relay} + \end{subfigure} +% + \begin{subfigure}[b]{0.45\textwidth} + \centering + \includegraphics[width=\textwidth]{userstats-bridge-country-va-2016-01-01-2017-01-01.png} + \caption{User(s) connecting to bridge(s).} + \label{vatican-bridge} + \end{subfigure} + % + \begin{subfigure}[b]{0.45\textwidth} + \centering + \includegraphics[width=\textwidth]{userstats-bridge-combined-va-2016-01-01-2017-01-01.png} + \caption{User(s) by transport.} + \label{vatican-combi} + \end{subfigure} + \caption{Vatican City Tor usage 2016. 
MetricsWeb is the source of + all graphs (see table \ref{urls})} + \label{vatican} +\end{figure} +Counts of clients from Antarctica directly connecting to the Tor +network during the year $2016$ are graphed in figure +\ref{antarctica-relay}, the even smaller count of bridge users from +Antarctica in 2016 in figure \ref{antarctica-bridge}, and figure +\ref{antarctica-combi} breaks the bridge connection down into the type +of transport used. % +Similarly Tor client count during $2016$ for Vatican City. % +Most notably, bridge users from Antarctica (\ref{antarctica-combi}) +and Vatican City (see figure \ref{vatican-combi}) seem to be all +distinguished by the type of transport they use. % + +These two are not even the most extreme examples in terms of client +counts, Vatican City has a median of 13 users in $2016$ and +Antarctica a median of 8. % +For $2016$ there are 25 countries with a median user number less than +ten. % +Table \ref{median-low} (page \pageref{median-low}) shows the count of +countries with less than $m$ median users per day in $2016$, and as +contrast table \ref{median-high} lists the count of countries with +median user numbers starting at $1000$. % + +These small counts of distinguishable subsets of Tor clients look +problematic concerning privacy.% +\footnote{% +Also cf.~footnote \ref{jansen}, page \pageref{jansen}. % +} % + +\section{Mitigate privacy impact}\label{mitigate} % + +The following sections take a look at various +techniques/mechanisms/systems to reduce privacy impact reaching from +privacy aware counting in \ref{elaborate} over using Tor external data +gathering systems in \ref{priv} to exploring the options of simply +avoiding the collection of problematic data in~\ref{implchange}. % + +Many of the techniques and measurements listed in the following +sections are far from being implemented and would need extensive work +to be useful in practice. 
% +Hence, the following should be read as a description of what might be +possible and not as what will be implemented in the near future. % +A more concrete list of what could be implemented in the nearer future +is given in sections \ref{conclusion} and \ref{summary}. % + +\subsection{Counting, surveys, sketches}\label{elaborate}% + +Counting of unique items is na\"{\i}vely done by keeping a unique list +of these items. % +For finding an approximate count of unique items this could be avoided +by trading in accuracy of the resulting metrics. % +The following sections discuss mechanisms for counting without keeping +all items in-memory. % + +\subsubsection{Probabilistic counting}\label{count} +Estimating the count of unique items, e.g., connecting clients, +without storing all items registered during the measurement interval +could be solved by probabilistic counting as proposed in +\cite{fm85}. % +Without any additional randomization this would give a part of the +clients additional privacy by plausible deniability depending on the +used hash function\footnote{But it cannot prevent the identification + of certain IPs with high probability (for example, cf.~\cite[section + 4.1.1]{hambolu14} or \cite[section 2.2]{ts11}).} and certainly +provide another barrier for an attacker to determine client IPs from +the data held in-memory. % +Compared to the current scenario this could provide a gain in privacy +for the IP counting task. % +In addition, error estimates and efficiency of the probabilistic +counting method are known (see \cite{fm85}) and would provide a basis +for computing the aggregate statistics from the individual reports. % + +The steps necessary for deploying such a solution require extensive +effort: for the actual implementation the hash function used and size +of sketches as well as the accuracy of the count estimate need to be +chosen. 
% +The intended accuracy leads to the decision between using the simple +algorithm or the algorithm with stochastic averaging. % +The metrics derived might need to be adjusted depending on the now +available error estimates. % + +\subsubsection{Privacy preserving surveys}\label{survey} +Clients connecting to a Tor server could be viewed as entities taking +a survey. % +A recent approach with even differential privacy guarantees\footnote{ + See \cite[section 3]{epk14} for definition and proofs of their + differential privacy claims for RAPPOR. } is the method proposed in +\cite[RAPPOR - Randomized Aggregatable Privacy-Preserving Ordinal +Response]{epk14}. % +RAPPOR is based on client side generation of noisy sketches and a +machine learning approach for evaluating these sketches to calculate +estimates for the statistics of interest. % +Clients need to compute an initial noisy sketch from their data, which +is called permanent randomized response, and use this permanent +response to produce an again obfuscated sketch, the instantaneous +response, as actual report. % +The instantaneous response sketch would have to be part of all those +connections made by the client that are used for statistics, e.g., it +would need to be added to a directory request. % + +In total, the changes necessary for implementing a protocol like +RAPPOR are extensive: changes to the client code, the Tor server code, +the communication protocol, and the final processing for deriving the +wanted estimates. % +A survey setting trusting client generated data sketches would also +open room for spam or manipulation of the metrics taken. % + +\subsection{Metrics systems proposed by Tor related research} +\label{priv} % +With the progress of privacy research during the last years metrics +systems for collecting network data in a privacy conscious manner were +proposed. 
% +Two systems explicitly targeting metrics collection from the Tor +network, which not only provide the design and privacy assessment of +their system, but also make the code-base from their respective proof +of concept and reference implementations freely available, are +PrivEx~\cite{privex} and PrivCount~\cite{privcount}. % +It is out of the scope of this report to suggest or discuss any +replacements or additional metrics systems for the current file based +Tor Metrics system. % +Still, looking closely at PrivEx and PrivCount provides valuable +insight about what they deem potentially privacy endangering data and +what data of interest might not yet be available through Tor +Metrics. % + +PrivEx \cite{privex} proposes a metrics system running separately from +Tor instances and introduces its own network of various types of +server instances. % +The data processed is retrieved from adapted Tor server instances via +the controller protocol, which is extended for PrivEx purposes. % + +PrivCount builds on one collection scheme introduced by PrivEx and +extends its collection ability as well as some operational +properties. % +The data collecting instances of the PrivCount network also use the +controller protocol, i.e., an extended version of the currently +implemented protocol, to retrieve the data of interest from the Tor +server they are collecting from. % + +The main purpose of PrivEx' reference implementation is the +combination of in- and out-going traffic of the Tor network. % +In particular, identifying the number of connections made from Tor +clients to possibly censored web-addresses, which gives an estimate +about Tor usage for censorship circumvention. % + +PrivCount focusses on entry and exit statistics. % +This comprises client counts at the entry nodes, which are collected +via the extended controller protocol and not based on the Tor server +internal client IP list, and various metrics for traffic exiting the +network. 
% +PrivCount's exit statistics are concerned with streams exiting via +certain ports and the influence of exit policies on exit traffic.% +\footnote{% + The authors of \cite{privcount} don't address why the data provided + in the various extra-info descriptor fields \Qx{exit-*} is + insufficient or how the data overlaps. % +} % + +In general, an externally operated metrics system is quite expensive +to maintain compared to the current Tor Metrics system. % +Furthermore, newly implemented controller events for retrieving data +could be also a data source for an attacker, if not properly secured +by the server operator. % +It would require additional operation of metrics server instances, +additional maintenance of the code-base, and additional processes to +integrate the new data sources into the existing ones. % +In addition, the privacy properties of such system and the security of +their implementation would be more difficult to assess from external +parties than the current descriptor based Tor Metrics system. % + +\subsection{Mitigating implementation changes}% +\label{implchange} % + +The following measures are directly derived from source code analyses +of both the metrics related Tor server code and the core Tor Metrics +code for data aggregation and client count estimation. % +They are generally concerned with avoiding data gathering and reducing +the availability of sensitive data via other channels like logging or +controller events. % + +\subsubsection{Reduce duration of in-memory data retention}% +\label{retention} % + +Tor servers configured to report statistics keep client IP addresses +and associated information in-memory for at least one measurement +interval of 24 hours. 
% +Unfortunately, the current code retains these IPs and related +information for even up to two such measurement intervals (in case of +bridges), because the old data originating from the previous interval +is only released before writing statistics about the current +measurement interval. % +Erasing data immediately after computing statistics would more than +halve the retention time. % + +\subsubsection{Avoid problematic logging and controller events}% +\label{problemlog} % + +Some of the possibly harmful data held in-memory for providing metrics +is currently also used for logging and responding to controller +clients. % + +The controller protocol is defined in \cite[control-spec.txt]{torspec} +and allows triggering of the heartbeat log message (\cite[section 3.7, +control-spec.txt]{torspec}). % +Another request is defined in \cite[sections 3.9 and 4.1.14, +control-spec.txt]{torspec} to receive information from bridges about +recent client connections.% +\footnote{In particular this is the clients-seen event, which is used +by nyx \url{arm.torproject.org}.} +The replies contain complete counts by country and transport (also see +\Qx{geoip_bridge_stats_write}). % + +Using the option \Qx{HeartbeatPeriod} a Tor server can be configured +to write a recurring log message, which serves the purpose of +informing the operator that the server is still running and +working. % +The minimal reporting interval is $30$ minutes and the statement +logged contains the exact number of different client connections for +the last six hours. % +In addition, the heartbeat log message can be triggered (without any +time constraints) by a controller client signal. % + +Additional logging of collected data, e.g., client counts per +transport, exit port opened and exit bytes read/written, takes place +in debug mode (cf.~\ref{debuglog}). 
% + +In order to improve Tor client privacy these functionalities ought to +be changed to only report data unrelated to client IPs and only about +time intervals equal or bigger than the chosen reporting intervals for +extra-info descriptors. % + +\subsubsection{Replace problematic data sources} \label{avoid} % +Client IPs are currently only kept in-memory for deriving estimates of +bridge client counts where at the same time the estimates for direct +Tor client counts are derived from counts of successful directory +requests taking multiple requests into account as these occur usually +as a constant factor for each client. % +There is no reason why the estimations should differ, and the IP lists +in-memory would become obsolete if the same estimation method were +applied to bridge client counts.% +\footnote{% + The question about removing the map and corresponding measurements + from the code that hold client IP addresses was raised a while ago, + for details see Tor Bugtracker \cite[ticket \#15469]{tortrac}. } % +This would cause the estimates to be even more comparable and also +reduce configuration and simplify the metrics related code of +Tor. % + +Such a removal would affect the following extra-info descriptor +fields: % +\begin{itemize} +\item \Qx{dirreq-v3-ips}, +\item \Qx{entry-ips}, +\item \Qx{bridge-ips}, +\item \Qx{bridge-ip-versions}, and +\item \Qx{bridge-ip-transports}. +\end{itemize} % +As all extra-info descriptor fields regarding entries and bridges are +concerned, the two fields \Qx{bridge-stats-end} and +\Qx{entry-stats-end} would lose their meaning and could also be +omitted. % + +The fields \Qx{dirreq-v3-ips} and \Qx{entry-ips} are currently not +used for any statistics or data sets provided by Tor Metrics and could +be dropped. % + +All other fields from above are the basis for bridge client count +estimates.% +\footnote{For details see section \ref{cstat}. 
% + The directly connecting client count is entirely based on + \Qx{dirreq-v3-reqs}, which is not derived from a clientmap + structure. } % +The field \Qx{dirreq-v3-reqs} is also available in extra-info +descriptors uploaded by bridges% +\footnote{% + In $2016$ roughly $80\%$ of bridge extra-info descriptors that + provided \Qx{bridge-ips} also contained \Qx{dirreq-v3-reqs}. % +} % +and could be used for clients by country count for replacing +\Qx{bridge-ips}. % +The fields \Qx{bridge-ip-versions} and \Qx{bridge-ip-transports} are +used to estimate fractions of the client counts that have their origin +in a certain country or use a certain IP version. % +These could be filled by counting countries and versions of the +occurring requests registered in \Qx{dirreq-v3-reqs}, of course the +corresponding aggregated statistics and estimates need to be +adapted. % +All fields mentioned above could be dropped and two new fields for +both relay and bridge extra-info descriptors need to be added; in +particular, \Qx{dirreq-v3-versions} and \Qx{dirreq-v3-transports}.% +\footnote{For consistent naming it might be useful to change the field + name \Qx{dirreq-v3-reqs} to \Qx{dirreq-v3-countries}.} % +This would lead to fewer fields in extra-info descriptors, increased +privacy, and provide more comparable estimates for relays and +bridges. % +A more detailed description and analysis of the included processing +changes for generating estimates is given in section \ref{detail}. % + +\subsubsection{Obfuscate stored and reported data}% +\label{obfuscation} % + +Client counts per country can be very low on a server basis, e.g., +roughly $80\%$ of counts reported in extra-info descriptors for the +three biggest Tor user groups (de, ru, us) only report the lowest +count possible. 
% +Raising the available threshold constants for reporting total client +counts and client counts by country% +\footnote{ As defined in + \cite[\Qx{src/or/geoip.c:658-667}]{torgit}.} % +cannot be used as mitigation measure as it also would render most of +the client count estimates useless. % +Instead of using thresholds a white list could be introduced that +lists all countries for which the count should be recorded. % +Only countries on the white list would be added to the counting array +and all others would be obfuscated by summing them under \Qx{other}. % +The list itself could be provided in an easily parsable text format +added to Tor server source code. % + +There are two ways to choose countries for the white list: either by +population size or by Tor usage based on Tor client count +statistics. % +A choice by population count at a threshold of $2,000,000$ would lead +to a list of $147$ white listed countries.% +\footnote{% + According to the World Factbook + \cite[\href{https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119ra... + {countries by population size} and + \href{https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdat... + {raw data}]{cia}. % +} % +Using the Tor usage approach a cut-off at a daily mean of $1000$ Tor +clients would generate a list of $97$ countries based on data from +$2016$ (cf.~table \ref{median-high}). % + +Either choice of generating the white-list would need to be +re-adjusted yearly or more often, which would also cause additional +maintenance work. % +The second approach would be more difficult to adjust, because once a +white-listing mechanism is introduced the data for adjusting won't be +available anymore from Tor Metrics statistics and would need to be +generated by other means. 
% + +It should also be evaluated, if client directory responses (field +\Qx{dirreq-v3-resp}) and the client count related fields proposed in +section \ref{implchange} (client counts by country +\Qx{dirreq-v3-countries}, version \Qx{dirreq-v3-versions} and +transport \Qx{dirreq-v3-transports}) even when not based on in-memory +client IP lists should be obfuscated. % +In order to obtain obfuscation for both the in-memory counts and the +reported results noise addition at counter initialization seems to be +an efficient measure on first glance. % +For onion service statistics Tor Metrics implemented the generation of +Laplace noise,% +\footnote{\cite[proposals/238-hs-relay-stats.txt]{torspec} and + \cite{tr201501001,tr201504001}} % +which could be applied in the current scenario and fosters code reuse +of critical parts like the Laplace noise generation. % + +But, a simulation applying noise to collected data and processing the +resulting data further for use in MetricsWeb showed that the +additional noise would render the existing statistics very +inaccurate.% +\footnote{A closer look at the involved statistics: % + The current estimations for user counts by country rely on a sum of + reported data. % + In the sketched obfuscation scenario this sum would also contain a + sum $W_n$ of Laplace random values, where $n$ is the number of + reported values for the particular country. % + The standard deviation of $W_n$ depends on the obfuscation + parameters and on $\sqrt n$. % + A daily median (mean) of reports from relays is around $1300$ (mean: + $2200$) and $550$ (mean $150$) for bridges. % + Such values are not tolerable in the current estimation process and + obfuscation should only be introduced with additional measures to + keep the existing accuracy. } % +Thus, it is advisable to conduct further research and wait for the +already planned assessment for both privacy and statistical accuracy +throughout the data-verse of Tor Metrics. 
% + +\subsection{Conclusion} \label{conclusion} + +Integrating counting systems or parts thereof as discussed in section +\ref{elaborate} would require extensive design and implementation work +for changes of the current Tor source code and also for the +aggregating and estimation code further up in the Tor Metrics +processing chain. % + +Applying the measurement systems outlined in section \ref{priv} in Tor +Metrics would mean a step toward using a second totally different and +more costly manner of measuring Tor. % +If introduced in addition to the current file based system +(cf.~\ref{overview}) the cost of operation would be very high and the +actual problem of in-memory retention is not addressed as the Priv* +systems use Tor's internal accounting of client connections and other +measured data. % + +The changes necessary for the third approach in \ref{avoid} affect +both the Tor server and Tor Metrics code bases in very clear ways, +which consist mostly in code reduction, streamlining, and using +different fields of already parsed extra-info descriptors. % +Thus, it seems to be the most feasible answer for improving privacy +in the current Tor server code. % +The other measures listed in section \ref{implchange} would easily fit +into the changes necessary for applying \ref{avoid} or be obsolete +with the introduction of these changes. % + +The details and various steps of avoiding and reducing data collection +are given in section~\ref{detail}.% + +\section{Impact of implementation changes} \label{detail} % + +Section \ref{avoid} sketches a solution for avoiding the in-memory +storage of client IPs for client count metrics by replacing the source +of vital estimates. % +First the actual methods for client count estimation are discussed in +\ref{cstat}. % +Based on this the changes necessary are detailed in +\ref{metrics-change} before identifying the changes to the Tor server +code and the possible side effects in \ref{changes}. 
% + +\subsection{Client related estimates} \label{cstat} % + +The current method of estimating client numbers was introduced in +$2013$ for both bridge and relay clients% +\footnote{See ticket \cite[\#8462]{tortrac} and related code + \url{https://gitweb.torproject.org/metrics-web.git/log/?qt=grep\&q=8462}. + The code was integrated into Tor Metrics code during $2015$.} % +to replace an estimation method based on the number of unique IP +addresses making connections to Tor servers. % +The daily estimate uses values taken from extra-info descriptors, in +particular the count of daily directory responses (respectively +requests) and the number of bytes written delivering the directory +data.% +\footnote{Values taken from extra-info descriptor fields + \Qx{dirreq-v3-reqs}, \Qx{dirreq-v3-resp} and + \Qx{dirreq-write-history}.} % +According to \cite{tr201210001} it suffices to estimate the total +number of directory requests to bridges and relays, from which the +client count is calculated directly. % + +The data from extra-info descriptors used for bridge related estimates +is also available for relays. % +Thus, it seems natural to apply the same formula for estimating relay +client numbers. % +Looking at the code the implementations differ for bridge and relay +clients. % +For relay clients the code deviates from the estimation method +explained in \cite{tr201210001} and uses request counts per country.% +\footnote{Listed in descriptor field \Qx{dirreq-v3-reqs}. % + The relevant code can be found in \cite[from line 91 of + \Qx{modules/clients/src/org/torproject/metrics/clients/Main.java}]{mwgit}. % +} % +The raw data is taken from extra-info descriptor field +\Qx{dirreq-v3-reqs} and used to fill clients by country counts as well +as the entire count of clients for this relay. 
% + +Bridge client counts are implemented as suggested in +\cite{tr201210001} and are estimated from directory request responses +as well as contact IP counts,% +\footnote{\cite[starting at line 230 of + \Qx{modules/clients/src/org/torproject/metrics/clients/Main.java}]{mwgit} +} % +which are derived from \Qx{dirreq-v3-resp}. % +The total value of client contacts is taken from descriptor field +\Qx{dirreq-v3-resp} (the successful responses) and counts of +connections from different countries are derived from the field +\Qx{bridge-ips}. % +The fractions for \Qx{version} use descriptor field +\Qx{bridge-ip-versions} and \Qx{transport} is derived from +\Qx{bridge-ip-transports}. % +The bridge client number estimates per country build on the estimate +for the number of total clients and derive the client numbers by +applying the fraction per country estimated from the number of +connections made by country, i.e., \Qx{bridge-ips}. % +The current method of estimating caused unlikely results for the +number of bridge clients by country. % +The discussion and analysis of these problematic results% +\footnote{A discussion via Tor Bugtracker \cite[ticket + \#18167]{tortrac} began a year ago considering the usage of various + fields for bridge client per country estimation, i.e., + \Qx{bridge-ips} vs.~\Qx{dirreq-v3-reqs}. % + + Most bridges report \Qx{dirreq-v3-reqs} already, for $2016$ almost + $90\%$ of bridges reporting \Qx{bridge-ips} also provided the field + \Qx{dirreq-v3-reqs} in their extra-info descriptor. % +} % +suggests that switching to a calculation of bridge client count +estimates that uses the same extra-info descriptor fields as direct +client count estimates would even improve the estimate. 
% + +\subsection{Data changes} \label{data-change} % + +The change proposed in \ref{avoid} would result in dropping the fields +\Qx{bridge-stats-end}, \Qx{bridge-ips}, \Qx{bridge-ip-versions}, +\Qx{bridge-ip-transports}, \Qx{entry-ips}, \Qx{entry-stats-end}, and +\Qx{dirreq-v3-ips} from extra-info descriptors. % +Two additional fields \Qx{dirreq-v3-transports} and +\Qx{dirreq-v3-versions} need to be added in order to keep the current +Tor Metrics statistics about client counts.% +\footnote{% + The naming is chosen along the current naming scheme that includes + the string \Qx{v3}. % + It might be useful to drop this string from all of the \Qx{dirreq-*} + descriptor fields. % +} % + + +\subsection{Metrics changes} \label{metrics-change} % + +As explained in \ref{cstat} the client count estimates for relays are +already independent of descriptor fields that are to be dropped. % +Using the same estimation approach for bridges would lead to more +comparable and even more accurate results (cf.~\ref{cstat}). % +The necessary code changes for MetricsWeb would result in unified +processing of the two extra-info descriptor types. % +The changes necessary for metrics-lib/DescripTor would in general +result in providing the two new methods for the additional fields, but +are free of changes to the parsing logic. % + +Another affected code base of Tor Metrics would be Onionoo, which uses +the fields \Qx{bridge-ips}, \Qx{bridge-ip-versions}, and +\Qx{bridge-ip-transports} for providing additional information in +bridge {\em client documents}.% +\footnote{% + See section \emph{Bridge clients objects} of the Onionoo protocol + definition \url{https://onionoo.torproject.org/\#clients} and Onionoo + source code \cite[class + \Qx{org.torproject.onionoo.updater.ClientStatusUpdater}]{ogit}. 
% +} % +The relevant Onionoo protocol fields depending on \Qx{bridge-*} +descriptor fields are still in {\em beta} stage and could either be +removed or simply be filled from the new fields available, which is a +minor code change.% +\footnote{% + Taking into account that the new fields would have multiple counts + per day and client and would need to be adjusted with the factor + used for the total client count estimation. } % + +\subsection{Tor server changes and side effects} \label{changes} % + +When describing the code changes one needs to make choices; and the +choice here was to describe the maximal code reduction possible, but +of course there is room to alter the proposed changes and still reach +the intended goal. % + +The following gives a terse overview of the code changes necessary for +Tor server according to section \ref{data-change}. % +Also provided are possible Tor server configuration simplifications, +and side effects or changes regarding logging and controller +functionality. % + +\subsubsection{Server changes} +With the changes to extra-info descriptor proposed in +\ref{data-change} the Tor server options +\begin{itemize} +\item \Qx{BridgeRecordUsageByCountry} and +\item \Qx{EntryStatistics} +\end{itemize} +could be omitted and replaced by option \Qx{DirReqStatistics}, which +could be used for all types of servers alike. % + +The alterations for the metrics providing code in Tor servers would +mostly be code removal. % +The description follows the process of collection as described in +\ref{server-proc} in order to cover all affected places in the server +code. 
% + +The \Qx{record_bridge_stats_callback}% +\footnote{ \cite[\Qx{src/or/main.c}:1752-1777]{torgit} } % +could be omitted entirely together with the following functions: +\begin{itemize} +\item \Qx{geoip_bridge_stats_init}, % +\item \Qx{geoip_bridge_stats_write}, % +\item \Qx{geoip_get_transport_history}, % +\item \Qx{geoip_get_client_history}, and % +\item \Qx{geoip_format_bridge_stats} (also see \ref{control-change} for + controller-related changes). % +\end{itemize} +The second main metrics callback \Qx{write_stats_file_callback} would +be kept, but shortened to not provide entry statistics anymore. % +The affected functions would be: % +\begin{itemize} +\item \Qx{geoip_entry_stats_write}, +\item \Qx{geoip_format_entry_stats}, +\item \Qx{geoip_reset_entry_stats}. +\end{itemize} +Other affected functions for handling \Qx{clientmap}s would be: +\begin{itemize} +\item \Qx{geoip_remove_old_clients}, +\item \Qx{remove_old_client_helper_}, +\item \Qx{geoip_get_client_history}. +\end{itemize} +In order to record versions and transports for bridges, lists of new +structs \Qx{geoip_version_t} and \Qx{geoip_transport_t} similar to +\Qx{geoip_country_t} would need to be defined.% +\footnote{ \cite[\Qx{src/or/geoip.c}:55-59]{torgit}} % +The function \Qx{geoip_note_client_seen} would need to be adapted to +fill the new structures for recording client data. % +In addition, the code for handling client IP statistics would need to +be removed and the code for filling the new lists of +\Qx{geoip_version_t} and \Qx{geoip_transport_t} would need to be +added. % + +Changes would also be necessary for \Qx{geoip_dirreq_stats_write}, +which is called from \Qx{write_stats_file_callback}. % +This function would need to be adapted to omit writing the dropped +descriptor fields and add the new descriptor fields derived from the +above mentioned structures. % +Any calls to \Qx{geoip_note_client_seen} with action +\Qx{GEOIP_CLIENT_CONNECT} could also be removed.
% + +\subsubsection{Affected controller events} \label{control-change} + +Once \Qx{clientmap} structures and related code are removed from +Tor server code the controller code also needs to be changed. % +The functions \Qx{format_bridge_stats_controller} and +\Qx{control_event_clients_seen} would either need to be removed or +adapted to the new structures for recording the counts. % + +Another affected controller function is +\Qx{format_client_stats_heartbeat}, which would need to be adapted to +not report the client counts by country anymore. % + +\section{Summary} \label{summary} % + +The previous sections of this report describe the Tor Metrics +processing chain and the data provided by Tor Metrics with the aim to +identify several ways to improve privacy regarding data held in-memory +for clients of the Tor network. % +Possible mitigation measures are surveyed and the most feasible +approach was detailed in section~\ref{detail}. % + +Many of the discussed improvements generate a workload for several +future projects and some also need further research. % +Nevertheless, a recommendation for a list of first changes can be +derived: % + +\begin{itemize} +\item Replace the current server internal counting mechanism in order + to avoid holding client IPs in-memory. % + This leads to the immediate privacy improvement of not keeping + client IPs in-memory for statistical purposes. +\item Use \Qx{dirreq-v3-reqs} for client count estimation (for both + bridges and relays, as suggested in \ref{avoid}). % + This would keep the statistics on client count as accurate as before + without relying on client IP lists. % +\item Base the new fields \Qx{dirreq-v3-versions} and + \Qx{dirreq-v3-transports} on the current counting mechanism used for + \Qx{dirreq-v3-reqs}. % + This would also keep the statistics based on client count as + accurate as before without relying on client IP lists. % +\item Remove controller protocol parts that rely on the old client + count mechanism. 
% + This would avoid reporting privacy impacting data to the control + port. +\item Remove unnecessary logging of vital data or tie the logging to + test-mode for avoiding privacy impacting data in logs. +\end{itemize} + +These changes have a clearly defined scope and would result in privacy +improvement. % +Identifying immediate changes for implementation and defining future +changes for metrics collection is based on the following steps: % +\begin{itemize} +\item Distill a change proposal for the Tor server changes chosen to + be implemented. +\item Provide several Tor server patches for the changes identified + above. +\item Provide patches for the necessary adaptions in the Tor Metrics + processing chain. +\item Assess privacy questions as raised in this report and + statistical accuracy throughout the data-verse of Tor Metrics. % + Also assess the introduction of more obfuscation measures for + various client counts without impacting estimation accuracy. % + In addition, the removal of unused data fields from extra-info + descriptors (as identified in section \ref{unused}) should be + addressed and evaluated. % +\end{itemize} + +The assessment listed in the last item above is in part a consequence +of this report, which is planned to start in the second half of +$2017$. 
% + +\pagebreak + +\appendix +\section{Appendix} % +\subsection{Tables} +\begin{table}[!h] +\begin{subtable}{0.45 \textwidth } + \begin{tabular}{|c||c|c|c|c|c| +}\hline + $m$ & 10 & 50 & 130 & 210 & 340 \\\hline + $C$ & 25 & 52 & 87 & 103 & 116 \\\hline + \end{tabular} + \caption{Count $C=|{\rm{median}(c) < m}|$ of countries with median + of daily users in $2016$ less than the given limit $m$.} + \label{median-low} +\end{subtable}\quad +\begin{subtable}{0.45 \textwidth } + \begin{tabular}{|c||c|c|c|c|c|c|c|}\hline + $m$ in $10^3$ & 1 & 5 & 10 & 50 & 100 & 200 & 300 \\\hline + $C$ & 97 & 51 & 26 & 5 & 4 & 2 & 1 \\\hline + \end{tabular} + \caption{Count $C=|{\rm{median}(c) > m}|$ of countries with median + of daily users in $2016$ higher than the given limit $m$ thousands.} + \label{median-high} +\end{subtable} +\caption{Count of countries with median of daily users in $2016$. + There are roughly $250$ countries, and $433.5$ is the median of the median + daily client count of all countries in $2016$.} +\end{table} +\begin{table}[!h] + \begin{tabular}[h]{|r|l|}\hline + Figure & Source \\\hline + \ref{antarctica-relay} & {\tiny \url{https://metrics.torproject.org/userstats-relay-country.html?start=2016-01-01... + \ref{antarctica-bridge} & {\tiny \url{https://metrics.torproject.org/userstats-bridge-country.html?start=2016-01-0... + \ref{antarctica-combi} & {\tiny \url{https://metrics.torproject.org/userstats-bridge-combined.html?start=2016-01-... + \ref{vatican-relay} & {\tiny \url{https://metrics.torproject.org/userstats-relay-country.html?start=2016-01-01... + \ref{vatican-bridge} & {\tiny \url{https://metrics.torproject.org/userstats-bridge-country.html?start=2016-01-0... + \ref{vatican-combi} & {\tiny \url{https://metrics.torproject.org/userstats-bridge-combined.html?start=2016-01-...
+ \end{tabular} + \caption{Graph source URLs.} + \label{urls} +\end{table} +\pagebreak +\subsection{Extra-info descriptor examples}\label{descriptors} % +\subsubsection{Bridge extra-info descriptor}\label{bei} % +\begin{Verbatim}[gobble=0,fontsize=\fontsize{3mm}{1mm}] +@type bridge-extra-info 1.3 +extra-info Unnamed EF93668E48BD4F8DB9DF6D4CFCBF1A7BB5EC7CC2 +master-key-ed25519 a3febLYkK9UmKf4PDhrw/cTefN1l5X0LsAt7BqdcrLM +published 2017-03-01 17:14:17 +write-history 2017-03-01 14:10:28 (14400 s) 2253824,1248256,1308672,489472,592896,300560384 +read-history 2017-03-01 14:10:28 (14400 s) 6366208,5633024,7112704,4847616,6200320,306330624 +dirreq-write-history 2017-03-01 14:10:28 (14400 s) 1581056,683008,673792,36864,33792,662528 +dirreq-read-history 2017-03-01 14:10:28 (14400 s) 56320,9216,4096,4096,2048,2048 +geoip-db-digest C14DF5AE94101562DEACDD296278B0EFA3EA26E5 +geoip6-db-digest A88A828020A558D37F97CF683D4521270F0511A2 +dirreq-stats-end 2017-03-01 15:01:19 (86400 s) +dirreq-v3-ips in=8,mx=8,ru=8 +dirreq-v3-reqs in=8,mx=8,ru=8 +dirreq-v3-resp ok=8,not-enough-sigs=0,unavailable=0,not-found=0,not-modified=0,busy=0 +dirreq-v3-direct-dl complete=0,timeout=0,running=0 +dirreq-v3-tunneled-dl complete=8,timeout=0,running=0 +transport scramblesuit +transport obfs3 +transport obfs4 +transport fte +bridge-stats-end 2017-03-01 15:03:06 (86400 s) +bridge-ips in=8,ir=8,mx=8,ru=8 +bridge-ip-versions v4=8,v6=0 +bridge-ip-transports obfs3=8,obfs4=8,scramblesuit=8 +router-digest-sha256 50hLT2H4vDO42C/fRWIgV5j3CTldi+ZMPyY3V0IYQSE +router-digest 76BC2C857FDBED685085B16E3852799EF81A7B86 +\end{Verbatim} + +\subsubsection{Relay extra-info descriptor}\label{rei} % + +\begin{Verbatim}[gobble=0,fontsize=\fontsize{3mm}{1mm}] +@type extra-info 1.0 +extra-info Pounet27TorRelay EFE68EB2D54E657B5BBF4EB18627646F8DCF66C9 +published 2016-12-04 13:01:45 +write-history 2016-12-04 10:18:03 (14400 s) 57720832,70514688,199539712,... 
+read-history 2016-12-04 10:18:03 (14400 s) 64663552,74992640,199556096,498191360,... +dirreq-write-history 2016-12-04 10:18:03 (14400 s) 2048,652288,1426432,937984,... +dirreq-read-history 2016-12-04 10:18:03 (14400 s) 4096,13312,23552,263168,24576,7168 +geoip-db-digest C1EB5237F2FBAF63381D8551157F13D12EFCCA25 +geoip6-db-digest 1F99B6B0EC78E9DB34D61AE7E0FC261D558E8E5D +dirreq-stats-end 2016-12-03 13:24:35 (86400 s) +dirreq-v3-ips de=8,ua=8 +dirreq-v3-reqs de=8,ua=8 +dirreq-v3-resp ok=8,not-enough-sigs=0,unavailable=0,not-found=0,not-modified=0,busy=0 +dirreq-v3-direct-dl complete=0,timeout=0,running=0 +dirreq-v3-tunneled-dl complete=4,timeout=8,running=0 +hidserv-stats-end 2016-12-03 18:35:50 (86400 s) +hidserv-rend-relayed-cells 2876020 delta_f=2048 epsilon=0.30 bin_size=1024 +hidserv-dir-onions-seen 254 delta_f=8 epsilon=0.30 bin_size=8 +entry-stats-end 2016-12-03 18:35:50 (86400 s) +entry-ips us=1064,it=504,fr=472,de=456,es=408,br=224,ru=216,jp=208,pl=192,gb=128,ar=120,th=104,ua=104,nl=88,ca=80,in=80,bg=72,se=72,at=56,mx=56,gr=48,tw=48,au=40,be=40,ch=40,cz=40,id=40,ro=40,sa=40,co=32,pt=32,ve=32,ae=24,cl=24,eg=24,hu=24,il=24,ma=24,my=24,ng=24,pe=24,za=24,dk=16,dz=16,ec=16,hk=16,hr=16,ie=16,lt=16,lv=16,ph=16,pk=16,rs=16,sg=16,sk=16,sn=16,tn=16,tr=16,vn=16,??=8,al=8,am=8,ao=8,az=8,ba=8,bd=8,bf=8,bh=8,bj=8,bn=8,bo=8,by=8,cd=8,ci=8,cm=8,cn=8,cr=8,cy=8,do=8,ee=8,fi=8,ge=8,gh=8,gp=8,gt=8,gu=8,hn=8,iq=8,ir=8,is=8,jm=8,jo=8,ke=8,kh=8,kr=8,kz=8,la=8,lb=8,lk=8,lr=8,lu=8,ly=8,md=8,mg=8,mk=8,mr=8,mt=8,mu=8,ni=8,no=8,np=8,nz=8,om=8,pa=8,pf=8,pr=8,ps=8,py=8,qa=8,re=8,sc=8,si=8,sv=8,sy=8,tg=8,tt=8,uy=8,xk=8,ye=8 +cell-stats-end 2017-01-30 18:35:50 (86400 s) +cell-processed-cells 5430,23,10,8,7,4,4,3,2,1 +cell-queued-cells 0.38,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00 +cell-time-in-queue 56,0,0,0,0,0,0,0,0,3 +cell-circuits-per-decile 15573 +conn-bi-direct 2016-12-03 18:35:50 (86400 s) 1417304,46267,48669,100569 +router-sig-ed25519 
pTwQjRcWzRYJyyhIdfcLia2vhVpn0GgRth7+IpNbyvnATzs5UjQv6v72WSNg8mwg9RzdOpDd+zMQrf5clUnEDA +router-signature +-----BEGIN SIGNATURE----- +M7Ru2Lfaul9AUcmfZ6VFeOkc5kfOmlkQmbescB0aBAYFr0YaC+qbVZKhPEEvNB8d +s6TBjpW5zWmqnDyLNI8klOFtt1Nm0k76Vfb/0Cx5jfiTx0ViyXC0zC0VBG1jmUkX +FxMvXwC049xv2JVXvUupe83xt/13OIgDV0Z8kWYR64g= +-----END SIGNATURE----- +\end{Verbatim} +\pagebreak +\bibliography{references}% +\cite{fm85,epk14,hambolu14,ts11} were recommended by Nick Mathewson + (cf.~\href{https://trac.torproject.org/projects/tor/ticket/15469} + {Tor bug tracker ticket \#15469}, last accessed 2017-04-05). + +\end{document} diff --git a/2017/metrics-privacy/references.bib b/2017/metrics-privacy/references.bib new file mode 100644 index 0000000..71a4eba --- /dev/null +++ b/2017/metrics-privacy/references.bib @@ -0,0 +1,153 @@ +@misc{torspec, + author = {Roger Dingledine and Nick Mathewson}, + title = {Tor Protocol Specification}, + howpublished = {\url{https://gitweb.torproject.org/torspec.git/}}, + note = {{\small Commit 8eee5024f66d4816d63b341550c01ba4ab059bfc}} +} + +@misc{mwgit, + author = {{The Tor Project}}, + title = {Metrics Web Source Code}, + howpublished = {\url{https://gitweb.torproject.org/metrics-web.git/tree}}, + note = {{\small Commit 8bf149b0a89227c56e97a228b2558cacfcecc158}} +} + +@misc{ogit, + author = {{The Tor Project}}, + title = {Onionoo source code}, + howpublished = {\url{https://gitweb.torproject.org/onionoo.git/tree}}, + note = {{\small Commit 5b219203b8781b27518133ad7d76e636e82d7fe5}} +} + +@misc{torgit, + author = {{The Tor Project}}, + title = {Tor Source Code}, + howpublished = {\url{https://gitweb.torproject.org/tor.git/tree}}, + note = {{\small Commit a3ce303432f35a6f06f63f0679b9bb577f88dc3c}} +} + +@misc{cia, + author= {{Washington, DC: Central Intelligence Agency}}, + title={The World Factbook 2013-14}, + year={2013}, + howpublished={\url{https://www.cia.gov/library/publications/the-world-factbook/index.html}}, + note = {Accessed
2017-04-24} +} + +@article{fm85, + author = {Flajolet, Philippe and Martin, G. Nigel}, + title = {Probabilistic Counting Algorithms for Data Base Applications}, + journal = {J. Comput. Syst. Sci.}, + issue_date = {September 1985}, + volume = {31}, + number = {2}, + year = {1985}, + issn = {0022-0000}, + pages = {182--209}, + numpages = {28}, + url = {http://dx.doi.org/10.1016/0022-0000(85)90041-8}, + doi = {10.1016/0022-0000(85)90041-8}, + acmid = {5215}, + publisher = {Academic Press}, + address = {Orlando, FL, USA}, +} + +@inproceedings{epk14, + title={RAPPOR: Randomized Aggregatable Privacy-Preserving Ordinal Response}, + author={Erlingsson, {\'U}lfar and Pihur, Vasyl and Korolova, Aleksandra}, + booktitle={Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security}, + pages={1054--1067}, + year={2014}, + organization={ACM} +} + +@mastersthesis{hambolu14, + title={Privacy Preserving Statistics}, + author={Oluwakemi Hambolu}, + year={2014}, + school={Clemson University, South Carolina, USA}, + type={{Master of Science in Computer Engineering}} +} + +@conference{ts11, + author = "Florian Tschorsch and Bj{\"o}rn Scheuermann", + title = "Distributed Privacy-Aware User Counting", + year = 2011, + booktitle = "HotPETs '11: 4th Workshop on Hot Topics in Privacy Enhancing Technologies", + url = "https://petsymposium.org/2011/papers/hotpets11-final5Tschorsch.pdf" +} + +@techreport{tr201501001, + author = {David Goulet and Aaron Johnson and George Kadianakis and Karsten Loesing}, + title = {Extrapolating network totals from hidden-service statistics}, + institution = {The Tor Project}, + number = {2015-01-001}, + year = {2015}, + url = {https://research.torproject.org/techreports/extrapolating-hidserv-stats-2015...
+} + +@techreport{tr201210001, + author = {Karsten Loesing}, + title = {Counting daily bridge users}, + institution = {The Tor Project}, + number = {2012-10-001}, + year = {2012}, + url = {https://research.torproject.org/techreports/counting-daily-bridge-users-2012... +} + +@techreport{tr201504001, + author = {George Kadianakis and Karsten Loesing}, + title = {Hidden-service statistics reported by relays}, + institution = {The Tor Project}, + number = {2015-04-001}, + year = {2015}, + url = {https://research.torproject.org/techreports/hidden-service-stats-2015-04-28...., +} + +@techreport{tr200908001, + author = {Karsten Loesing}, + title = {Analysis of Circuit Queues in Tor}, + institution = {The Tor Project}, + number = {2009-08-001}, + year = {2009}, + url = {https://research.torproject.org/techreports/bufferstats-2009-08-25.pdf}, +} + +@techreport{tr201109001, + author={George Danezis}, + number={2011-09-001}, + institution = {The Tor Project}, + title={An anomaly-based censorship-detection system for Tor}, + year={2011}, + url={https://research.torproject.org/techreports/detector-2011-09-09.pdf}, +} + +@misc{tortrac, + author = {{The Tor Project}}, + title = {{Tor Bugtracker}}, + howpublished = {\url{https://trac.torproject.org/}}, +} + +@inproceedings{privcount, + title = {Safely Measuring Tor}, + author = {Rob Jansen and Aaron Johnson}, + booktitle = {Proceedings of the 23rd ACM Conference on Computer and Communications Security (CCS '16)}, + pages= {1553--1567}, + doi = {10.1145/2976749.2978310}, + year = {2016}, + month = oct +} + +@inproceedings{privex, + author = {Elahi, Tariq and Danezis, George and Goldberg, Ian}, + title = {{PrivEx:} Private Collection of Traffic Statistics for Anonymous Communication Networks}, + booktitle = {Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security}, + series = {CCS '14}, + year = {2014}, + isbn = {978-1-4503-2957-6}, + location = {Scottsdale, Arizona, USA}, + pages =
{1068--1079}, + numpages = {12}, + url = {http://doi.acm.org/10.1145/2660267.2660280}, + doi = {10.1145/2660267.2660280}, +} diff --git a/2017/metrics-privacy/tortechrep.cls b/2017/metrics-privacy/tortechrep.cls new file mode 120000 index 0000000..4c24db2 --- /dev/null +++ b/2017/metrics-privacy/tortechrep.cls @@ -0,0 +1 @@ +../../tortechrep.cls \ No newline at end of file