[tor-commits] [stem/master] Resume parsing proc connections via split

atagar at torproject.org atagar at torproject.org
Mon May 14 20:43:56 UTC 2018


commit d0f402c6f0220087320c58cecb25b53d2bd6a156
Author: Damian Johnson <atagar at torproject.org>
Date:   Mon May 14 09:43:03 2018 -0700

    Resume parsing proc connections via split
    
    While back I put quite a bit of elbow grease into speeding up connection
    resolution. To this end commit 8da78a9 squeezed out a 30% improvement by
    parsing proc contents using indices rather than split.
    
    I was torn at the time since this made our code quite a bit more verbose and
    fragile, but all things being equal seemed worth the speed boost. However, in
    digging into connection resolution discrepancies Toralf spotted that when a
    relay has at least 10,000 connections the first column (sl) gets wider,
    throwing our parser off.
    
    Proc is clearly intended to be parsed via delimiters, not static indices. As
    such largely reverting commit 8da78a9.
---
 stem/util/proc.py      | 62 +++++++++++--------------------------------------
 test/unit/util/proc.py | 63 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 68 insertions(+), 57 deletions(-)

diff --git a/stem/util/proc.py b/stem/util/proc.py
index 6a316634..9969b59c 100644
--- a/stem/util/proc.py
+++ b/stem/util/proc.py
@@ -379,65 +379,29 @@ def connections(pid = None, user = None):
         continue  # ipv6 proc contents are optional
 
       protocol = proc_file_path[10:].rstrip('6')  # 'tcp' or 'udp'
-      is_tcp, is_ipv6 = protocol == 'tcp', proc_file_path.endswith('6')
-      title = ''
+      is_ipv6 = proc_file_path.endswith('6')
 
       try:
         with open(proc_file_path, 'rb') as proc_file:
-          title = proc_file.readline()
-
-          if b'local_address' in title:
-            laddr_start = title.index(b'local_address')
-            laddr_end = laddr_start + (8 if not is_ipv6 else 32)
-
-            lport_start = laddr_end + 1
-            lport_end = lport_start + 4
-          else:
-            raise IOError("title line missing 'local_address', %s" % title)
-
-          if b'rem_address' in title or b'remote_address' in title:
-            raddr_start = title.index(b'rem_address') if b'rem_address' in title else title.index(b'remote_address')
-            raddr_end = raddr_start + (8 if not is_ipv6 else 32)
-
-            rport_start = raddr_end + 1
-            rport_end = rport_start + 4
-          else:
-            raise IOError("title line missing 'remote_address', %s" % title)
-
-          if b'st' in title:
-            status_start = title.index(b'st')
-            status_end = status_start + 2
-          else:
-            raise IOError("title line missing 'st', %s" % title)
-
-          if b'retrnsmt' in title and b'uid' in title:
-            # unlike the above fields uid is right aligned
-            uid_start = title.index(b'retrnsmt') + 9
-            uid_end = title.index(b'uid') + 3
-          elif b'retrnsmt' not in title:
-            raise IOError("title line missing 'retrnsmt', %s" % title)
-          else:
-            raise IOError("title line missing 'uid', %s" % title)
-
-          if b'timeout' in title:
-            # inodes can lack a header, and are a dynamic size
-            inode_start = title.index(b'timeout') + 8
-          else:
-            raise IOError("title line missing 'timeout', %s" % title)
+          proc_file.readline()  # skip the first line
 
           for line in proc_file:
-            if inodes and line[inode_start:].split(b' ', 1)[0] not in inodes:
+            _, l_dst, r_dst, status, _, _, _, uid, _, inode = line.split()[:10]
+
+            if inodes and inode not in inodes:
               continue
-            elif process_uid and line[uid_start:uid_end].strip() != process_uid:
+            elif process_uid and uid != process_uid:
               continue
-            elif is_tcp and line[status_start:status_end] != b'01':
+            elif protocol == 'tcp' and status != b'01':
               continue  # skip tcp connections that aren't yet established
 
-            l_addr = _unpack_addr(line[laddr_start:laddr_end])
-            l_port = int(line[lport_start:lport_end], 16)
+            div = l_dst.find(':')
+            l_addr = _unpack_addr(l_dst[:div])
+            l_port = int(l_dst[div + 1:], 16)
 
-            r_addr = _unpack_addr(line[raddr_start:raddr_end])
-            r_port = int(line[rport_start:rport_end], 16)
+            div = r_dst.find(':')
+            r_addr = _unpack_addr(r_dst[:div])
+            r_port = int(r_dst[div + 1:], 16)
 
             if r_addr == '0.0.0.0' or r_addr == '0000:0000:0000:0000:0000:0000':
               continue  # no address
diff --git a/test/unit/util/proc.py b/test/unit/util/proc.py
index 562d618a..7ee4ecda 100644
--- a/test/unit/util/proc.py
+++ b/test/unit/util/proc.py
@@ -33,6 +33,13 @@ TCP6_CONTENT = b"""\
   10: 0000000000000000FFFF00004B9E0905:1466 0000000000000000FFFF00002186364E:951E 01 00000000:00000000 02:00090E70 00000000   106        0 41512577 2 0000000000000000 26 4 31 10 -1
 """
 
+SL_WIDTH_CHANGE = b"""\
+  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
+   0: 11111111:1111 22222233:2244 01 44444444:44444444 55:55555555 66666666  1111        8 99999999
+   1: 22222222:2222 33333344:3355 01 44444444:44444444 55:55555555 66666666  1111        8 88888888
+12675: 33333333:3333 44444455:4466 01 44444444:44444444 55:55555555 66666666  1111        8 77777777
+""".rstrip()
+
 
 class TestProc(unittest.TestCase):
   @patch('stem.util.proc._get_line')
@@ -216,9 +223,7 @@ class TestProc(unittest.TestCase):
     udp = TITLE_LINE + b'\n A: BBBBBBBB:BBBB CCCCCCCC:CCCC DD EEEEEEEE:EEEEEEEE FF:FFFFFFFF GGGGGGGG 1111        H IIIIIIII'
 
     path_exists_mock.side_effect = lambda param: {
-      '/proc/net/tcp': True,
       '/proc/net/tcp6': False,
-      '/proc/net/udp': True,
       '/proc/net/udp6': False
     }[param]
 
@@ -227,12 +232,12 @@ class TestProc(unittest.TestCase):
       '/proc/net/udp': io.BytesIO(udp)
     }[param]
 
-    expected_results = [
+    expected = [
       Connection('17.17.17.17', 4369, '34.34.34.34', 8738, 'tcp', False),
       Connection('187.187.187.187', 48059, '204.204.204.204', 52428, 'udp', False),
     ]
 
-    self.assertEqual(expected_results, proc.connections(pid))
+    self.assertEqual(expected, proc.connections(pid))
 
   @patch('os.listdir')
   @patch('os.path.exists')
@@ -265,12 +270,12 @@ class TestProc(unittest.TestCase):
       '/proc/net/udp': io.BytesIO(TITLE_LINE),
     }[param]
 
-    expected_results = [
+    expected = [
       Connection('2a01:04f8:0190:514a:0000:0000:0000:0002', 443, '2001:0638:a000:4140:0000:0000:ffff:0189', 40435, 'tcp', True),
       Connection('2a01:04f8:0190:514a:0000:0000:0000:0002', 443, '2001:0858:0002:0002:aabb:0000:563b:1526', 44469, 'tcp', True),
     ]
 
-    self.assertEqual(expected_results, proc.connections(pid = pid))
+    self.assertEqual(expected, proc.connections(pid = pid))
 
   @patch('os.path.exists')
   @patch('pwd.getpwnam')
@@ -293,10 +298,52 @@ class TestProc(unittest.TestCase):
       '/proc/net/udp': io.BytesIO(TITLE_LINE),
     }[param]
 
-    expected_results = [
+    expected = [
       Connection('0000:0000:0000:0000:0000:ffff:0509:9e4b', 5222, '0000:0000:0000:0000:0000:ffff:4e36:8621', 38330, 'tcp', True),
       Connection('2a01:04f8:0190:514a:0000:0000:0000:0002', 5269, '2001:06f8:126f:0011:0000:0000:0000:0026', 50594, 'tcp', True),
       Connection('0000:0000:0000:0000:0000:ffff:0509:9e4b', 5222, '0000:0000:0000:0000:0000:ffff:4e36:8621', 38174, 'tcp', True),
     ]
 
-    self.assertEqual(expected_results, proc.connections(user = 'me'))
+    self.assertEqual(expected, proc.connections(user = 'me'))
+
+  @patch('os.listdir')
+  @patch('os.path.exists')
+  @patch('os.readlink')
+  @patch('stem.util.proc.open', create = True)
+  def test_high_connection_count(self, open_mock, readlink_mock, path_exists_mock, listdir_mock):
+    """
+    When we have over ten thousand connections the 'SL' column's width changes.
+    Checking that we account for this.
+    """
+
+    pid = 1111
+
+    listdir_mock.side_effect = lambda param: {
+      '/proc/%s/fd' % pid: ['1', '2', '3', '4'],
+    }[param]
+
+    readlink_mock.side_effect = lambda param: {
+      '/proc/%s/fd/1' % pid: 'socket:[99999999]',
+      '/proc/%s/fd/2' % pid: 'socket:[88888888]',
+      '/proc/%s/fd/3' % pid: 'socket:[77777777]',
+      '/proc/%s/fd/4' % pid: 'pipe:[30303]',
+      '/proc/%s/fd/5' % pid: 'pipe:[40404]',
+    }[param]
+
+    path_exists_mock.side_effect = lambda param: {
+      '/proc/net/tcp6': False,
+      '/proc/net/udp6': False
+    }[param]
+
+    open_mock.side_effect = lambda param, mode: {
+      '/proc/net/tcp': io.BytesIO(SL_WIDTH_CHANGE),
+      '/proc/net/udp': io.BytesIO(TITLE_LINE)
+    }[param]
+
+    expected = [
+      Connection('17.17.17.17', 4369, '51.34.34.34', 8772, 'tcp', False),
+      Connection('34.34.34.34', 8738, '68.51.51.51', 13141, 'tcp', False),
+      Connection('51.51.51.51', 13107, '85.68.68.68', 17510, 'tcp', False),
+    ]
+
+    self.assertEqual(expected, proc.connections(pid))



More information about the tor-commits mailing list