[stem/master] Rewriting reader documentation

commit 02d51e37e54aeefc77d906792d12e1bb711b89c5 Author: Damian Johnson <atagar@torproject.org> Date: Fri Mar 9 19:51:38 2012 -0800 Rewriting reader documentation Moving all of the examples and substantial documentation up into the header, and minorly revising how I intend for the module to be used. Not all of the methods in the header have been added yet. --- stem/descriptor/reader.py | 112 +++++++++++++++++++++++++++------------------ 1 files changed, 67 insertions(+), 45 deletions(-) diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py index a3ad418..b0c8f9a 100644 --- a/stem/descriptor/reader.py +++ b/stem/descriptor/reader.py @@ -1,7 +1,8 @@ """ -Reads descriptors from local directories and archives. +Utilities for reading descriptors from local directories and archives. This is +mostly done through the DescriptorReader class, which is an iterator for the +descriptor data in a series of destinations. For example... -Example: my_descriptors = [ "/tmp/server-descriptors-2012-03.tar.bz2", "/tmp/archived_descriptors/", @@ -9,10 +10,53 @@ Example: reader = DescriptorReader(my_descriptors) + # prints the contents of all the descriptor files with reader: - # prints all of the descriptor contents for descriptor in reader: print descriptor + +This ignores files that cannot be processed due to read errors or unparsable +content. To be notified of skipped files you can register a listener with +register_skip_listener(). + +The DescriptorReader keeps track of the last modified timestamps for descriptor +files that it has read so it can skip unchanged files if ran again. This +listing of processed files can also be persisted and applied to other +DescriptorReaders. For instance, the following prints descriptors as they're +changed over the course of a minute, and picks up where it left off if ran +again... 
+ + reader = DescriptorReader(["/tmp/descriptor_data"]) + + try: + processed_files = load_processed_files("/tmp/used_descriptors") + reader.set_processed_files(processed_files) + except: pass # could not load, maybe this is the first run + + with reader: + start_time = time.time() + + while time.time() - start_time < 60: + # prints any descriptors that have changed since last checked + for descriptor in reader: + print descriptor + + time.sleep(1) + + save_processed_files(reader.get_processed_files(), "/tmp/used_descriptors") + + +load_processed_files - Loads a listing of processed files. +save_processed_files - Saves a listing of processed files. + +DescriptorReader - Iterator for descriptor data on the local file system. + |- get_processed_files - provides the listing of files that we've processed + |- set_processed_files - sets our tracking of the files we have processed + |- start - begins reading descriptor data + |- stop - stops reading descriptor data + |- join - joins on the thread used to process descriptor data + |- __enter__ / __exit__ - manages the descriptor reader thread in the context + +- __iter__ - iterates over descriptor data in unread files """ import os @@ -29,7 +73,7 @@ def load_processed_files(path): path (str) - location to load the processed files dictionary from Returns: - dict of 'path (str) => last modified timestamp (int)' mappings + dict of 'path (str) => last modified unix timestamp (int)' mappings Raises: IOError if unable to read the file @@ -90,32 +134,6 @@ class DescriptorReader(threading.Thread): """ Iterator for the descriptor data on the local file system. This can process text files, tarball archives (gzip or bzip2), or recurse directories. - - This keeps track of the last modified timestamps for descriptor files we have - used, and if you call restart() then this will only provide descriptors from - new files or files that have changed since then. 
- - You can also save this listing of processed files and later apply it to another - DescriptorReader. For instance, to only print the descriptors that have - changed since the last run... - - reader = DescriptorReader(["/tmp/descriptor_data"]) - - try: - processed_files = load_processed_files("/tmp/used_descriptors") - reader.set_processed_files(processed_files) - except: pass # could not load, maybe this is the first run - - # only prints descriptors that have changed since we last ran - with reader: - for descriptor in reader: - print descriptor - - save_processed_files(reader.get_processed_files(), "/tmp/used_descriptors") - - This ignores files that cannot be processed (either due to read errors or - because they don't contain descriptor data). The caller can be notified of - files that are skipped by registering a listener with register_skip_listener(). """ def __init__(self, targets): @@ -124,19 +142,12 @@ class DescriptorReader(threading.Thread): self.processed_files = {} self._stop_event = threading.Event() - def stop(self): - """ - Stops further reading of descriptors. - """ - - self._stop_event.set() - def get_processed_files(self): """ - For each file we have provided descriptor data for this provides a mapping - of the form... + For each file that we have read descriptor data from this provides a + mapping of the form... - absolute_path (str) => modified_time (int) + absolute path (str) => last modified unix timestamp (int) This includes entries set through the set_processed_files() method. @@ -149,17 +160,16 @@ class DescriptorReader(threading.Thread): def set_processed_files(self, processed_files): """ - Appends a dictionary of 'path => modified timestamp' mappings to our - listing of processed files. With the get_processed_files() method this can - be used to skip descriptors that we have already read. For instance... - + Sets the listing of the files we have processed. 
Most often this is useful + as a method for pre-populating the listing of descriptor files that we have + seen. Arguments: processed_files (dict) - mapping of absolute paths (str) to unix timestamps for the last modified time (int) """ - self.processed_files.update(processed_files) + self.processed_files = dict(processed_files) def register_skip_listener(self, listener): """ @@ -176,6 +186,13 @@ class DescriptorReader(threading.Thread): self.skip_listeners.append(listener) + def stop(self): + """ + Stops further reading of descriptor files. + """ + + self._stop_event.set() + def run(self): # os.walk(path, followlinks = True) # @@ -184,6 +201,11 @@ class DescriptorReader(threading.Thread): # # >>> mimetypes.guess_type("/home/atagar/Desktop/server-descriptors-2012-03.tar.bz2") # ('application/x-tar', 'bzip2') + # + # This only checks the file extension. To actually check the content (like + # the 'file' command) an option would be pymagic... + # https://github.com/cloudburst/pymagic + while not self._stop_event.isSet(): pass # TODO: implement
participants (1)
-
atagar@torproject.org