commit 2e519c8da7946467e3615f05d0da1a12be920fe8 Author: Arturo Filastò arturo@filasto.net Date: Wed Jan 27 12:04:53 2016 +0100
Update to reflect normalisations performed in pipeline the specs of the base and the HTTP data formats --- data-formats/df-000-base.md | 166 +++++++++++----------- data-formats/df-001-httpt.md | 328 ++++++++++++++++++++++++++++--------------- 2 files changed, 301 insertions(+), 193 deletions(-)
diff --git a/data-formats/df-000-base.md b/data-formats/df-000-base.md index d0d5378..c1431e3 100644 --- a/data-formats/df-000-base.md +++ b/data-formats/df-000-base.md @@ -14,113 +14,116 @@ In this directory shall go only the data format specifications of Test templates. The Test specific data formats should go in the specification of the test.
-All data produced from ooniprobe tests is in YAML formatted. +Data produced by the ooniprobe client can be either in YAML or JSON format.
-Every test that is interested in reporting with ooniprobe MUST use such data -format. +YAML is used when writing reports to the user's filesystem, while JSON is used +as a format for the published processed reports.
# Base test data format
This specification is of the basic data format common to all ooniprobe test outputs.
-Data Format Version: df-000-base-001 +Every entry contains the following common fields. The `test_keys` key will +instead contain all the keys that are specific to the test in question. + +The JSON data format is made up of a series of JSON documents separated by +newline characters. + +Data Format Version: 0.2.0
## Specification
- ########################################### - # OONI Probe Report for HTTP Requests test - # Wed Jan 30 21:03:56 2013 - ########################################### - --- - options: - A dict containing the keys and values of options passed to the test. +``` +{ + "input": "If the test takes an input this will contain the value of it" + " these can be for example URLs, hostnames, IPs, etc.",
- probe_asn: - The AS Number of the probe (prefixed by AS, ex. AS1234) or null if includeasn is set to false. + "input_hashes": "A list of the SHA256 hashes, encoded " + "as hex, of the inputs to this test.",
- probe_cc: - The two letter country code of the probe or null if inlcudecc is set to false. + "id": "This is an identifier of this particular measurement",
- probe_ip: - The IPv4 address of the probe or null if includeip is set to false. + "bucket_date": "A date in the format of %Y-%m-%d that indicates " + "when the report was processed by the data pipeline",
- software_name: - The name of the software that has generated such report (ex. ooniprobe). + "data_format_version": "0.1.0|0.2.0",
- software_version: - The version of the software that has generated such report (ex. 0.0.10). + "report_filename": "{bucket_date}/{timestamp as '%Y%m%dT%H%M%SZ'}-{probe_cc}-{probe_asn}-{test_name}-{report_id}-{data_format_version}-{probe|backend}.json",
- start_time: - The time at which the test was started in seconds since epoch. + "options": ["A list of options passed to the test as command line arguments"],
- test_name: - The name of the test that such report is for (ex. HTTP Requests). + "probe_asn": "The AS Number of the probe (prefixed by AS, ex. AS1234) " + "or AS0 if includeasn is set to false.",
- test_version: - The version of the test that such report is for (ex. 0.0.10). + "probe_cc": "The two letter country code of the probe or ZZ if " + "includecountry is set to false.",
- data_format_version: - The version string of the data format being used by the test (ex. httpt-000) - - report_id: - A 64 character mixed case string that is generated by the client used to identify the report. + "probe_ip": "The IPv4 address of the probe or 127.0.0.1 if " + "includeip is set to false.",
- test_helpers: - A dictionary with as keys the names of the options and values the addresses of the test helpers used - ... + "probe_city": "The name of the city of the probe or null if " + "includecity is set to false.",
-# Example output + "report_id": "20140130T111423Z_ELNkuajQzUWfktBupbfZUxseQDczEvEaIhtciykhoLSuiNiCCV",
- ########################################### - # OONI Probe Report for HTTP Invalid Request Line test - # Mon Jan 28 21:33:59 2013 - ########################################### - --- - options: - collector: null - help: 0 - logfile: null - parallelism: '10' - pcapfile: null - reportfile: null - resume: 0 - subargs: [-b, 'http://93.95.227.200'] - test: nettests/manipulation/http_invalid_request_line.py - probe_asn: null - probe_cc: null - probe_ip: null - software_name: ooniprobe - software_version: 0.0.10 - start_time: 1359401639.0 - test_name: HTTP Invalid Request Line - test_version: 0.1.3 - test_helpers: {backend: "http://93.95.227.200%22%7D - report_id: xxxxxxxXXXxXXXxxxxxxxxxxxxXXXxXXXxsxxxXXXxXXXxxxXXXxXXXxxxXXXxX - ... - - -# Report Entry data format - -Every iteration over an input given to a test will produce a Report Entry. - -A Report Entry is a YAML Stream as specified here: -http://www.yaml.org/spec/1.2/spec.html#id2801681 - -Here are specified the keys that will always be present inside of every report -entry. + "software_name": "The name of the software that has generated " + "such report (ex. ooniprobe)",
-## Specification + "software_version": "The version of the software used to generate this report", + + "backend_version": "The version of the backend that collected this measurement", + + "test_helpers": null,
-input: - The item we this specific test instance is referring to. null in case no - input is being iterated over. + "test_name": "The name of the test that generated " + "this measurement (ex. http_requests)",
-test_runtime: - `float` the runtime of the test + "test_version": "",
-test_start_time: - `float` seconds since epoch from the starting of the test. + "test_runtime": null, + + "test_start_time": "Timestamp of when the measurement was performed in " + "UTC time coordinates (ex. 2015-08-24 12:02:23)", + + "test_keys": { + "The keys that are specific to the test" + } +} +``` + +# Example output + +``` +{ + "bucket_date": "2015-11-22", + "data_format_version": "0.2.0", + "id": "07873c37-9441-47e3-93b8-94db10444c64", + "input": "http://example.com/", + "options": [ + "-f", + "37e60e13536f6afe47a830bfb6b371b5cf65da66d7ad65137344679b24fdccd1" + ], + "probe_asn": "AS0", + "probe_cc": "CH", + "probe_ip": "127.0.0.1", + "report_filename": "2015-11-22/20151122T103202Z-CH-AS0-http_requests-XsQk40qrhgvJEdbXAUFzYjbbGCBuEsc1UV5RAAFXo4hysiUo3qyTfo4NTr7MjiwN-0.1.0-probe.json", + "report_id": "XsQk40qrhgvJEdbXAUFzYjbbGCBuEsc1UV5RAAFXo4hysiUo3qyTfo4NTr7MjiwN", + "software_name": "ooniprobe", + "software_version": "1.3.1", + "backend_version": "1.1.4", + "test_helpers": {}, + "input_hashes": [ + "37e60e13536f6afe47a830bfb6b371b5cf65da66d7ad65137344679b24fdccd1" + ], + "test_name": "http_requests", + "test_runtime": 0.1842639446, + "test_start_time": "2015-11-22 10:32:02", + "test_version": "0.2.4", + "test_keys": { + } +} +```
# Error strings
@@ -164,4 +167,3 @@ error_string: * This will be the error message if the task has timed out: `task_timed_out`
* Every other failure: 'unknown_failure %s' % str(failure.value) - diff --git a/data-formats/df-001-httpt.md b/data-formats/df-001-httpt.md index ed8fb63..8479a6d 100644 --- a/data-formats/df-001-httpt.md +++ b/data-formats/df-001-httpt.md @@ -1,6 +1,6 @@ # HTTPTest template data format
-Data Format Version: df-001-httpt-000 +Data Format Version: 0.2.0
This is the specification of the data format that every test that is based on ooni.templates.httpt.HTTPTest shall be using. @@ -10,116 +10,222 @@ data format.
## Specification
- --- - requests: - - request: - headers: - `dict` the headers of the request - body: - `string` the body of the response - - url: - `string` the URL of the request being made (if prefixed with 's' it means - the request was made via the Tor SOCKS proxy) - - method: - `string` the HTTP method being used - - response: - headers: - `dict` the headers of the response - body: - `string` the body of the response - - code: - `int` the response status code - - failure: - `string` (optional) this will be set if an error was returned. - For a list of error messages see the Error strings section of - df-000-base.md. - - - request: - etc. etc. - - socksproxy: - null if no socks proxy was used for this request or an IP port - combination (as a string) if a SOCKS proxy was used. - - agent: - either 'agent' if 30X redirects should not be followed or 'redirect' if - they should be followed. - - ... +``` +"agent": "agent|redirect depending on whether the client " + "will ignore 30X redirects or follow them.", + +"socksproxy": "null | IP:PORT of the socksproxy to be used to " + "perform the experiment requests on", + +"requests": [ + { + "failure": "This will contain an error string for why the " + "request failed or null if no failure occurred", + + "request": { + "body": "If the request of the client contains some payload it " + "will be in here, otherwise it is set to null", + + "headers": { + "Header-Name": "Header-Value" + }, + + "method": "GET|POST|PUT", + "tor": { + "exit_ip": "The address of the Tor exit used for the request or " + "null if Tor was not used or the test was run with an older version of ooniprobe.", + + "exit_name": "The name of the Tor exit used for the request or " + "null if Tor was not used or the test was run with an older version of ooniprobe.", + + "is_tor": "true|false depending on whether or not " + "this request was done over Tor." + }, + "url": "The URL of the request that has been performed."
+ }, + "response": { + "body": "The body of the response or null if no response was found. If the response is binary " + "then this will be a dictionary containing the format in which the binary data is encoded and " + "the encoded data (ex. {\"format\": \"base64\", \"data\": \"AQI=\"}). " + "Currently the only type of format supported is base64.", + + "headers": { + "Header-Name": "Header-Value" + } + }, + "response_length": null + } +] +```
## Example output
- input: http://google.com/ - agent: agent - requests: - - request: - body: null - headers: - - - User-Agent - - - &id001 [Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322), - 'Internet Explorer 5, Windows XP'] - method: GET - url: http://google.com/ - response: - body: '' - code: 301 - headers: - - - Content-Length - - ['219'] - - - X-XSS-Protection - - [1; mode=block] - - - Expires - - ['Tue, 29 Jan 2013 14:29:19 GMT'] - - - Server - - [gws] - - - Connection - - [close] - - - Location - - ['http://www.google.com/'] - - - Cache-Control - - ['public, max-age=2592000'] - - - Date - - ['Sun, 30 Dec 2012 14:29:19 GMT'] - - - X-Frame-Options - - [SAMEORIGIN] - - - Content-Type - - [text/html; charset=UTF-8] - - request: - body: null - headers: - - - User-Agent - - - *id001 - method: GET - url: shttp://google.com/ - response: - body: '' - code: 301 - headers: - - - Content-Length - - ['219'] - - - X-XSS-Protection - - [1; mode=block] - - - Expires - - ['Tue, 29 Jan 2013 14:29:20 GMT'] - - - Server - - [gws] - - - Connection - - [close] - - - Location - - ['http://www.google.com/'] - - - Cache-Control - - ['public, max-age=2592000'] - - - Date - - ['Sun, 30 Dec 2012 14:29:20 GMT'] - - - X-Frame-Options - - [SAMEORIGIN] - - - Content-Type - - [text/html; charset=UTF-8] - socksproxy: null - - +``` +{ + "bucket_date": "2015-11-22", + "data_format_version": "0.2.0", + "id": "07873c37-9441-47e3-93b8-94db10444c64", + "input": "http://googleusercontent.com/", + "options": [ + "-f", + "37e60e13536f6afe47a830bfb6b371b5cf65da66d7ad65137344679b24fdccd1" + ], + "probe_asn": "AS0", + "probe_cc": "CH", + "probe_ip": "127.0.0.1", + "report_filename": "2015-11-22/20151122T103202Z-CH-AS0-http_requests-XsQk40qrhgvJEdbXAUFzYjbbGCBuEsc1UV5RAAFXo4hysiUo3qyTfo4NTr7MjiwN-0.1.0-probe.json", + "report_id": "XsQk40qrhgvJEdbXAUFzYjbbGCBuEsc1UV5RAAFXo4hysiUo3qyTfo4NTr7MjiwN", + "software_name": "ooniprobe", + "software_version": "1.3.1", + "test_helpers": {}, + 
"backend_version": "1.1.4", + "input_hashes": [ + "37e60e13536f6afe47a830bfb6b371b5cf65da66d7ad65137344679b24fdccd1" + ], + "probe_city": null, + "test_name": "http_requests", + "test_runtime": 0.1842639446, + "test_start_time": "2015-11-22 10:32:02", + "test_version": "0.2.4", + "test_keys": { + "agent": "agent", + "body_length_match": null, + "body_proportion": null, + "control_failure": "socks_host_unreachable", + "experiment_failure": "dns_lookup_error", + "factor": 0.8, + "headers_diff": null, + "headers_match": null, + "requests": [ + { + "failure": "dns_lookup_error", + "request": { + "body": null, + "headers": { + "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; de; rv:1.9.2) Gecko/20100115 Firefox/3.6" + }, + "method": "GET", + "tor": { + "exit_ip": false, + "exit_name": false, + "is_tor": false + }, + "url": "http://googleusercontent.com/" + }, + "response": { + "body": null, + "headers": {} + }, + "response_length": null + }, + { + "failure": "socks_host_unreachable", + "request": { + "body": null, + "headers": { + "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7" + }, + "method": "GET", + "tor": { + "exit_ip": null, + "exit_name": null, + "is_tor": true + }, + "url": "http://googleusercontent.com/" + }, + "response": { + "body": null, + "headers": {} + }, + "response_length": null + }, + { + "failure": "dns_lookup_error", + "request": { + "body": null, + "headers": { + "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; de; rv:1.9.2) Gecko/20100115 Firefox/3.6" + }, + "method": "GET", + "tor": { + "exit_ip": null, + "exit_name": null, + "is_tor": false + }, + "url": "http://googleusercontent.com/" + }, + "response": { + "body": null, + "headers": {} + }, + "response_length": null + }, + { + "failure": "dns_lookup_error", + "request": { + "body": null, + "headers": { + "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; de; rv:1.9.2) Gecko/20100115 Firefox/3.6" + }, + "method": "GET", +
"tor": { + "exit_ip": null, + "exit_name": null, + "is_tor": false + }, + "url": "http://googleusercontent.com/" + }, + "response": { + "body": null, + "headers": {} + }, + "response_length": null + }, + { + "failure": "socks_host_unreachable", + "request": { + "body": null, + "headers": { + "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; de; rv:1.9.2) Gecko/20100115 Firefox/3.6" + }, + "method": "GET", + "tor": { + "exit_ip": null, + "exit_name": null, + "is_tor": true + }, + "url": "http://googleusercontent.com/" + }, + "response": { + "body": null, + "headers": {} + }, + "response_length": null + }, + { + "failure": "socks_host_unreachable", + "request": { + "body": null, + "headers": { + "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; de; rv:1.9.2) Gecko/20100115 Firefox/3.6" + }, + "method": "GET", + "tor": { + "exit_ip": null, + "exit_name": null, + "is_tor": true + }, + "url": "http://googleusercontent.com/" + }, + "response": { + "body": null, + "headers": {} + }, + "response_length": null + } + ], + "socksproxy": null, + "start_time": 1448184722.0 + } +} +```