[tor-commits] [stem/master] Adding tarfile support to stem.descriptor.parse_file()

atagar at torproject.org atagar at torproject.org
Sat May 31 21:35:22 UTC 2014


commit a5596873fd544d79c53f0c0123caa89bdb5a9f72
Author: Damian Johnson <atagar at torproject.org>
Date:   Sat May 31 14:14:06 2014 -0700

    Adding tarfile support to stem.descriptor.parse_file()
    
    A while back Karsten tried to hand a tarfile to our parse_file() method and had
    confusing results...
    
      https://trac.torproject.org/projects/tor/ticket/10977
    
    Expanding our parse_file() function so it'll happily handle tarfiles and tar
    paths.
    
    Note that the DescriptorReader, which already had tar support, is keeping its
    own separate implementation. This is because using parse_file()'s tar
    support has a couple of drawbacks...
    
      1. The reader then couldn't stop in the middle of handling tarballs.
    
      2. If a tarball contains both descriptor and non-descriptor content then the
         DescriptorReader can handle that. parse_file(), however, raises an
         exception.
---
 docs/change_log.rst                        |    2 +
 stem/descriptor/__init__.py                |   55 +++++++++++++++++++++++++---
 stem/descriptor/reader.py                  |   24 +++---------
 stem/util/system.py                        |   34 ++++++++++++++++-
 test/integ/descriptor/server_descriptor.py |   31 ++++++++++++++++
 5 files changed, 121 insertions(+), 25 deletions(-)

diff --git a/docs/change_log.rst b/docs/change_log.rst
index 279f2bb..7f2982b 100644
--- a/docs/change_log.rst
+++ b/docs/change_log.rst
@@ -59,6 +59,7 @@ The following are only available within Stem's `git repository
 
  * **Descriptors**
 
+  * Added tarfile support to :func:`~stem.descriptor.__init__.parse_file` (:trac:`10977`)
   * Added microdescriptor's new identity and identity_type attributes (:spec:`22cda72`)
 
  * **Utilities**
@@ -116,6 +117,7 @@ and a myriad of smaller improvements and fixes.
   * Added :func:`stem.util.system.get_user`
   * Added :func:`stem.util.system.get_start_time`
   * Added :func:`stem.util.system.get_bsd_jail_path`
+  * Added :func:`stem.util.system.is_tarfile`
   * Added :func:`stem.util.connection.is_private_address`
 
  * **Website**
diff --git a/stem/descriptor/__init__.py b/stem/descriptor/__init__.py
index 2d7cc69..4270cb9 100644
--- a/stem/descriptor/__init__.py
+++ b/stem/descriptor/__init__.py
@@ -52,10 +52,12 @@ __all__ = [
 
 import os
 import re
+import tarfile
 
 import stem.prereq
 import stem.util.enum
 import stem.util.str_tools
+import stem.util.system
 
 try:
   # added in python 2.7
@@ -127,7 +129,7 @@ def parse_file(descriptor_file, descriptor_type = None, validate = True, documen
 
     my_descriptor_file = open(descriptor_path, 'rb')
 
-  :param str,file descriptor_file: path or opened file with the descriptor contents
+  :param str,file,tarfile descriptor_file: path, opened file, or tarfile with the descriptor contents
   :param str descriptor_type: `descriptor type <https://metrics.torproject.org/formats.html#descriptortypes>`_, this is guessed if not provided
   :param bool validate: checks the validity of the descriptor's content if
     **True**, skips these checks otherwise
@@ -143,14 +145,23 @@ def parse_file(descriptor_file, descriptor_type = None, validate = True, documen
     * **IOError** if unable to read from the descriptor_file
   """
 
-  # if we got a path then open that file for parsing
+  # Delegate to a helper if this is a path or tarfile.
+
+  handler = None
 
   if isinstance(descriptor_file, (bytes, unicode)):
-    with open(descriptor_file) as desc_file:
-      for desc in parse_file(desc_file, descriptor_type, validate, document_handler, **kwargs):
-        yield desc
+    if stem.util.system.is_tarfile(descriptor_file):
+      handler = _parse_file_for_tar_path
+    else:
+      handler = _parse_file_for_path
+  elif isinstance(descriptor_file, tarfile.TarFile):
+    handler = _parse_file_for_tarfile
+
+  if handler:
+    for desc in handler(descriptor_file, descriptor_type, validate, document_handler, **kwargs):
+      yield desc
 
-      return
+    return
 
   # The tor descriptor specifications do not provide a reliable method for
   # identifying a descriptor file's type and version so we need to guess
@@ -210,6 +221,38 @@ def parse_file(descriptor_file, descriptor_type = None, validate = True, documen
   raise TypeError("Unable to determine the descriptor's type. filename: '%s', first line: '%s'" % (filename, first_line))
 
 
+def _parse_file_for_path(descriptor_file, *args, **kwargs):
+  with open(descriptor_file, 'rb') as desc_file:
+    for desc in parse_file(desc_file, *args, **kwargs):
+      yield desc
+
+
+def _parse_file_for_tar_path(descriptor_file, *args, **kwargs):
+  # TODO: use 'with' for tarfile after dropping python 2.6 support
+  tar_file = tarfile.open(descriptor_file)
+
+  try:
+    for desc in parse_file(tar_file, *args, **kwargs):
+      desc._set_path(os.path.abspath(descriptor_file))
+      yield desc
+  finally:
+    if tar_file:
+      tar_file.close()
+
+
+def _parse_file_for_tarfile(descriptor_file, *args, **kwargs):
+  for tar_entry in descriptor_file:
+    if tar_entry.isfile():
+      entry = descriptor_file.extractfile(tar_entry)
+
+      try:
+        for desc in parse_file(entry, *args, **kwargs):
+          desc._set_archive_path(entry.name)
+          yield desc
+      finally:
+        entry.close()
+
+
 def _parse_metrics_file(descriptor_type, major_version, minor_version, descriptor_file, validate, document_handler, **kwargs):
   # Parses descriptor files from metrics, yielding individual descriptors. This
   # throws a TypeError if the descriptor_type or version isn't recognized.
diff --git a/stem/descriptor/reader.py b/stem/descriptor/reader.py
index 3fb4166..05c7533 100644
--- a/stem/descriptor/reader.py
+++ b/stem/descriptor/reader.py
@@ -85,6 +85,7 @@ import threading
 
 import stem.descriptor
 import stem.prereq
+import stem.util.system
 
 # flag to indicate when the reader thread is out of descriptor files to read
 FINISHED = 'DONE'
@@ -487,24 +488,10 @@ class DescriptorReader(object):
 
     target_type = mimetypes.guess_type(target)
 
-    # Checking if it's a tar file may fail due to permissions so failing back
-    # to the mime type...
-    #
-    #   IOError: [Errno 13] Permission denied: '/vmlinuz.old'
-    #
-    # With python 3 insuffient permissions raises an AttributeError instead...
-    #
-    #   http://bugs.python.org/issue17059
-
-    try:
-      is_tar = tarfile.is_tarfile(target)
-    except (IOError, AttributeError):
-      is_tar = target_type[0] == 'application/x-tar'
-
     if target_type[0] in (None, 'text/plain'):
       # either '.txt' or an unknown type
       self._handle_descriptor_file(target, target_type)
-    elif is_tar:
+    elif stem.util.system.is_tarfile(target):
       # handles gzip, bz2, and decompressed tarballs among others
       self._handle_archive(target)
     else:
@@ -529,9 +516,10 @@ class DescriptorReader(object):
       self._notify_skip_listeners(target, ReadFailed(exc))
 
   def _handle_archive(self, target):
-    # TODO: This would be nicer via the 'with' keyword, but tarfile's __exit__
-    # method was added sometime after python 2.5. We should change this when
-    # we drop python 2.5 support.
+    # TODO: When dropping python 2.6 support go back to using 'with' for
+    # tarfiles...
+    #
+    #   http://bugs.python.org/issue7232
 
     tar_file = None
 
diff --git a/stem/util/system.py b/stem/util/system.py
index 89317df..d24d34b 100644
--- a/stem/util/system.py
+++ b/stem/util/system.py
@@ -16,6 +16,8 @@ best-effort, providing **None** if the lookup fails.
 
   is_available - determines if a command is available on this system
   is_running - determines if a given process is running
+  call - runs the given system command and provides back the results
+
   get_name_by_pid - gets the name for a process by the given pid
   get_pid_by_name - gets the pid for a process by the given name
   get_pid_by_port - gets the pid for a process listening to a given port
@@ -25,9 +27,11 @@ best-effort, providing **None** if the lookup fails.
   get_start_time - provides the unix timestamp when the process started
   get_bsd_jail_id - provides the BSD jail id a given process is running within
   get_bsd_jail_path - provides the path of the given BSD jail
+
+  is_tarfile - checks if the given path is a tarball
   expand_path - expands relative paths and ~ entries
   files_with_suffix - provides files with the given suffix
-  call - runs the given system command and provides back the results
+
 
   get_process_name - provides our process' name
   set_process_name - changes our process' name
@@ -35,9 +39,11 @@ best-effort, providing **None** if the lookup fails.
 
 import ctypes
 import ctypes.util
+import mimetypes
 import os
 import platform
 import subprocess
+import tarfile
 import time
 
 import stem.util.proc
@@ -763,6 +769,32 @@ def get_bsd_jail_path(jid):
   return None
 
 
+def is_tarfile(path):
+  """
+  Checks whether the given path is a tarfile.
+
+  .. versionadded:: 1.2.0
+
+  :param str path: path to be checked
+
+  :returns: **True** if the path is a tarball, **False** otherwise
+  """
+
+  # Checking if it's a tar file may fail due to permissions, so fall back
+  # to the mime type...
+  #
+  #   IOError: [Errno 13] Permission denied: '/vmlinuz.old'
+  #
+  # With python 3 insufficient permissions raise an AttributeError instead...
+  #
+  #   http://bugs.python.org/issue17059
+
+  try:
+    return tarfile.is_tarfile(path)
+  except (IOError, AttributeError):
+    return mimetypes.guess_type(path)[0] == 'application/x-tar'
+
+
 def expand_path(path, cwd = None):
   """
   Provides an absolute path, expanding tildes with the user's home and
diff --git a/test/integ/descriptor/server_descriptor.py b/test/integ/descriptor/server_descriptor.py
index 7b4645c..6d16add 100644
--- a/test/integ/descriptor/server_descriptor.py
+++ b/test/integ/descriptor/server_descriptor.py
@@ -4,6 +4,7 @@ Integration tests for stem.descriptor.server_descriptor.
 
 import datetime
 import os
+import tarfile
 import unittest
 
 import stem.control
@@ -15,8 +16,38 @@ import test.runner
 
 from test.integ.descriptor import get_resource
 
+TARFILE_PATH = os.path.join(os.path.dirname(__file__), 'data', 'descriptor_archive.tar')
+TARFILE_FINGERPRINTS = set([
+  u'B6D83EC2D9E18B0A7A33428F8CFA9C536769E209',
+  u'E0BD57A11F00041A9789577C53A1B784473669E4',
+  u'1F43EE37A0670301AD9CB555D94AFEC2C89FDE86',
+])
+
 
 class TestServerDescriptor(unittest.TestCase):
+  def test_with_tarfile_path(self):
+    """
+    Fetch server descriptors via parse_file() for a tarfile path.
+    """
+
+    descriptors = list(stem.descriptor.parse_file(TARFILE_PATH))
+    self.assertEqual(3, len(descriptors))
+
+    fingerprints = set([desc.fingerprint for desc in descriptors])
+    self.assertEqual(TARFILE_FINGERPRINTS, fingerprints)
+
+  def test_with_tarfile_object(self):
+    """
+    Fetch server descriptors via parse_file() for a tarfile object.
+    """
+
+    with tarfile.open(TARFILE_PATH) as tar_file:
+      descriptors = list(stem.descriptor.parse_file(tar_file))
+      self.assertEqual(3, len(descriptors))
+
+      fingerprints = set([desc.fingerprint for desc in descriptors])
+      self.assertEqual(TARFILE_FINGERPRINTS, fingerprints)
+
   def test_metrics_descriptor(self):
     """
     Parses and checks our results against a server descriptor from metrics.



More information about the tor-commits mailing list