
commit f7575c6a1fb8b755a744c20318195c659a85d060
Author: Damian Johnson <atagar@torproject.org>
Date:   Sun Jan 20 17:09:22 2019 -0800

    Parse BandwidthFile as streams

    String parsing was a great spot to start, but highly memory inefficient.
    Parsing internally created multiple copies of our bandwidth file content as
    processed rather than working from a single copy of the bytes.
---
 stem/descriptor/bandwidth_file.py | 50 ++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/stem/descriptor/bandwidth_file.py b/stem/descriptor/bandwidth_file.py
index 9c87385c..2e9fc216 100644
--- a/stem/descriptor/bandwidth_file.py
+++ b/stem/descriptor/bandwidth_file.py
@@ -15,6 +15,7 @@ Parsing for Bandwidth Authority metrics as described in Tor's
 """
 
 import datetime
+import io
 import time
 
 import stem.util.str_tools
@@ -91,16 +92,17 @@ def _parse_file(descriptor_file, validate = False, **kwargs):
 
 def _parse_header(descriptor, entries):
   header = {}
-  lines = str(descriptor).split('\n')
+  content = io.BytesIO(descriptor.get_bytes())
 
-  # skip the first line, which should be the timestamp
+  content.readline()  # skip the first line, which should be the timestamp
 
-  if lines and lines[0].isdigit():
-    lines = lines[1:]
+  while True:
+    line = content.readline().strip()
 
-  for line in lines:
-    if line == HEADER_DIV:
-      break
+    if not line:
+      break  # end of the content
+    elif line == HEADER_DIV:
+      break  # end of header
     elif line.startswith('node_id='):
       break  # version 1.0 measurement
 
@@ -117,7 +119,7 @@ def _parse_header(descriptor, entries):
 
 
 def _parse_timestamp(descriptor, entries):
-  first_line = str(descriptor).split('\n', 1)[0]
+  first_line = io.BytesIO(descriptor.get_bytes()).readline().strip()
 
   if first_line.isdigit():
     descriptor.timestamp = datetime.datetime.utcfromtimestamp(int(first_line))
@@ -129,30 +131,30 @@ def _parse_body(descriptor, entries):
   # In version 1.0.0 the body is everything after the first line. Otherwise
   # it's everything after the header's divider.
 
-  div = '\n' if descriptor.version == '1.0.0' else HEADER_DIV
+  content = io.BytesIO(descriptor.get_bytes())
 
-  if div in str(descriptor):
-    body = str(descriptor).split(div, 1)[1].strip()
+  if descriptor.version == '1.0.0':
+    content.readline()  # skip the first line
   else:
-    body = ''
+    while content.readline().strip() != HEADER_DIV:
+      pass  # skip the header
 
   measurements = {}
 
-  if body:
-    for line in body.split('\n'):
-      attr = dict(_mappings_for('measurement', line))
+  for line in content.readlines():
+    attr = dict(_mappings_for('measurement', line.strip()))
 
-      if 'node_id' not in attr:
-        raise ValueError("Every meaurement must include 'node_id': %s" % line)
-      elif attr['node_id'] in measurements:
-        # Relay is listed multiple times. This is a bug for the bandwidth
-        # authority that made this descriptor, but according to the spec
-        # should be ignored by parsers.
+    if 'node_id' not in attr:
+      raise ValueError("Every meaurement must include 'node_id': %s" % line.strip())
+    elif attr['node_id'] in measurements:
+      # Relay is listed multiple times. This is a bug for the bandwidth
+      # authority that made this descriptor, but according to the spec
+      # should be ignored by parsers.
 
-        continue
+      continue
 
-      fingerprint = attr['node_id'].lstrip('$')  # bwauths prefix fingerprints with '$'
-      measurements[fingerprint] = attr
+    fingerprint = attr['node_id'].lstrip('$')  # bwauths prefix fingerprints with '$'
+    measurements[fingerprint] = attr
 
   descriptor.measurements = measurements
 
participants (1)
-
atagar@torproject.org