[or-cvs] r24005: {arm} When available using proc calls to speed up tor resource usa (in arm/trunk: . src src/interface src/interface/graphing src/util)

Damian Johnson atagar1 at gmail.com
Fri Dec 31 03:02:10 UTC 2010


Author: atagar
Date: 2010-12-31 03:02:10 +0000 (Fri, 31 Dec 2010)
New Revision: 24005

Added:
   arm/trunk/src/interface/graphing/resourceStats.py
Removed:
   arm/trunk/src/interface/graphing/psStats.py
Modified:
   arm/trunk/TODO
   arm/trunk/armrc.sample
   arm/trunk/src/interface/controller.py
   arm/trunk/src/interface/graphing/__init__.py
   arm/trunk/src/interface/headerPanel.py
   arm/trunk/src/settings.cfg
   arm/trunk/src/util/sysTools.py
   arm/trunk/src/util/torTools.py
   arm/trunk/src/util/uiTools.py
Log:
When available using proc calls to speed up tor resource usage samplings. The ps resolution was deeply rooted in the header panel and ps graph so this turned out to be a more invasive change than I thought. Hopefully there aren't any more buggy edge cases left...
- proc resolution for tor resource usage (cpu and memory usage)
- providing proper cpu samplings rather than an average over the life of the process
- incrementing the uptime field of the header panel every second
- including an estimate of system call usage in the arm cpu metric
- using exponential backoff of ps/proc resource resolutions when calls fail or tor isn't running
- dropping support for graphing of custom ps attributes (feature was never used, kinda pointless, and incompatible with this enhancement)
- defaulting GETINFO cache hits to not log anything



Modified: arm/trunk/TODO
===================================================================
--- arm/trunk/TODO	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/TODO	2010-12-31 03:02:10 UTC (rev 24005)
@@ -86,6 +86,11 @@
             - http://www.linuxjournal.com/article/5737
 
 - Bugs
+  * When attaching to a TBB instance arm is degraded in several noticeable
+    ways:
+    * unable to query man page results
+    * autodetection for relative cookie path is broken
+    * path prefix detection (for instance to find the torrc) is broken
   * The default resolver isn't configurable.
   * The cpu usage spikes for scrollable content when the key's held. Try
     coalescing the events.

Modified: arm/trunk/armrc.sample
===================================================================
--- arm/trunk/armrc.sample	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/armrc.sample	2010-12-31 03:02:10 UTC (rev 24005)
@@ -6,7 +6,7 @@
 startup.events N3
 
 # Seconds between querying information
-queries.ps.rate 5
+queries.resourceUsage.rate 5
 queries.connections.minRate 5
 queries.refreshRate.rate 5
 
@@ -153,18 +153,6 @@
 features.graph.bw.accounting.rate 10
 features.graph.bw.accounting.isTimeLong false
 
-# Parameters for graphing ps stats
-# --------------------------------
-# primary/secondaryStat
-#   any numeric field provided by the ps command
-# cachedOnly
-#   determines if the graph should query ps or rely on cached results (this
-#   lowers the call volume but limits the graph's granularity)
-
-features.graph.ps.primaryStat %cpu
-features.graph.ps.secondaryStat rss
-features.graph.ps.cachedOnly true
-
 # Thread pool size for hostname resolutions
 # Determines the maximum number of concurrent requests. Upping this to around
 # thirty or so seems to be problematic, causing intermittently seizing.
@@ -196,6 +184,7 @@
 log.configEntryTypeError NOTICE
 log.torCtlPortClosed NOTICE
 log.torGetInfo DEBUG
+log.torGetInfoCache NONE
 log.torGetConf DEBUG
 log.torSetConf INFO
 log.torEventTypeUnrecognized NOTICE
@@ -206,8 +195,6 @@
 log.sysCallFailed INFO
 log.sysCallCacheGrowing INFO
 log.panelRecreated DEBUG
-log.graph.ps.invalidStat WARN
-log.graph.ps.abandon WARN
 log.graph.bw.prepopulateSuccess NOTICE
 log.graph.bw.prepopulateFailure NOTICE
 log.logPanel.prepopulateSuccess INFO
@@ -234,4 +221,7 @@
 log.cursesColorSupport INFO
 log.bsdJailFound INFO
 log.unknownBsdJailId WARN
+log.stats.failedProcResolution DEBUG
+log.stats.procResolutionFailover INFO
+log.stats.failedPsResolution INFO
 

Modified: arm/trunk/src/interface/controller.py
===================================================================
--- arm/trunk/src/interface/controller.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/interface/controller.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -14,7 +14,6 @@
 import curses.textpad
 import socket
 from TorCtl import TorCtl
-from TorCtl import TorUtil
 
 import headerPanel
 import graphing.graphPanel
@@ -28,7 +27,7 @@
 from util import conf, log, connections, hostnames, panel, sysTools, torConfig, torTools, uiTools
 import graphing.bandwidthStats
 import graphing.connStats
-import graphing.psStats
+import graphing.resourceStats
 
 CONFIRM_QUIT = True
 REFRESH_RATE = 5        # seconds between redrawing screen
@@ -539,7 +538,7 @@
   connections.RESOLVER_FINAL_FAILURE_MSG += " (connection related portions of the monitor won't function)"
   
   panels = {
-    "header": headerPanel.HeaderPanel(stdscr, startTime, config),
+    "header": headerPanel.HeaderPanel(stdscr, startTime),
     "popup": Popup(stdscr, 9),
     "graph": graphing.graphPanel.GraphPanel(stdscr),
     "log": logPanel.LogPanel(stdscr, loggedEvents, config)}
@@ -561,7 +560,7 @@
   
   # statistical monitors for graph
   panels["graph"].addStats("bandwidth", graphing.bandwidthStats.BandwidthStats(config))
-  panels["graph"].addStats("system resources", graphing.psStats.PsStats(config))
+  panels["graph"].addStats("system resources", graphing.resourceStats.ResourceStats())
   if not isBlindMode: panels["graph"].addStats("connections", graphing.connStats.ConnStats())
   
   # sets graph based on config parameter
@@ -639,6 +638,8 @@
   isResourceWarningGiven = False
   lastResourceCheck = startTime
   
+  lastSize = None
+  
   # TODO: come up with a nice, clean method for other threads to immediately
   # terminate the draw loop and provide a stacktrace
   while True:
@@ -715,9 +716,13 @@
       for panelKey in (PAGE_S + PAGES[page]):
         # redrawing popup can result in display flicker when it should be hidden
         if panelKey != "popup":
+          newSize = stdscr.getmaxyx()
+          isResize = lastSize != newSize
+          lastSize = newSize
+          
           if panelKey in ("header", "graph", "log", "config", "torrc"):
             # revised panel (manages its own content refreshing)
-            panels[panelKey].redraw()
+            panels[panelKey].redraw(isResize)
           else:
             panels[panelKey].redraw(True)
       
@@ -726,9 +731,16 @@
       currentTime = time.time()
       if currentTime - lastPerformanceLog >= CONFIG["queries.refreshRate.rate"]:
         cpuTotal = sum(os.times()[:3])
-        cpuAvg = cpuTotal / (currentTime - startTime)
+        pythonCpuAvg = cpuTotal / (currentTime - startTime)
+        sysCallCpuAvg = sysTools.getSysCpuUsage()
+        totalCpuAvg = pythonCpuAvg + sysCallCpuAvg
         
-        log.log(CONFIG["log.refreshRate"], "refresh rate: %0.3f seconds, average cpu usage: %0.3f%%" % (currentTime - redrawStartTime, 100 * cpuAvg))
+        if sysCallCpuAvg > 0.00001:
+          log.log(CONFIG["log.refreshRate"], "refresh rate: %0.3f seconds, average cpu usage: %0.3f%% (python), %0.3f%% (system calls), %0.3f%% (total)" % (currentTime - redrawStartTime, 100 * pythonCpuAvg, 100 * sysCallCpuAvg, 100 * totalCpuAvg))
+        else:
+          # with the proc enhancements the sysCallCpuAvg is usually zero
+          log.log(CONFIG["log.refreshRate"], "refresh rate: %0.3f seconds, average cpu usage: %0.3f%%" % (currentTime - redrawStartTime, 100 * totalCpuAvg))
+        
         lastPerformanceLog = currentTime
         
         # once per minute check if the sustained cpu usage is above 5%, if so
@@ -736,8 +748,8 @@
         # TODO: disabling this for now (scrolling causes cpu spikes for quick
         # redraws, ie this is usually triggered by user input)
         if False and not isResourceWarningGiven and currentTime > (lastResourceCheck + 60):
-          if cpuAvg >= 0.05:
-            msg = "Arm's cpu usage is high (averaging %0.3f%%)." % (100 * cpuAvg)
+          if totalCpuAvg >= 0.05:
+            msg = "Arm's cpu usage is high (averaging %0.3f%%)." % (100 * totalCpuAvg)
             
             if not isBlindMode:
               msg += " You could lower it by dropping the connection data (running as \"arm -b\")."
@@ -785,20 +797,26 @@
         # this appears to be a python bug: http://bugs.python.org/issue3014
         # (haven't seen this is quite some time... mysteriously resolved?)
         
+        # stops panel daemons
+        panels["header"].stop()
+        panels["log"].stop()
+        
+        panels["header"].join()
+        panels["log"].join()
+        
         # joins on utility daemon threads - this might take a moment since
         # the internal threadpools being joined might be sleeping
+        conn = torTools.getConn()
+        myPid = conn.getMyPid()
+        
+        resourceTracker = sysTools.getResourceTracker(myPid) if (myPid and sysTools.isTrackerAlive(myPid)) else None
         resolver = connections.getResolver("tor") if connections.isResolverAlive("tor") else None
+        if resourceTracker: resourceTracker.stop()
         if resolver: resolver.stop()  # sets halt flag (returning immediately)
         hostnames.stop()              # halts and joins on hostname worker thread pool
+        if resourceTracker: resourceTracker.join()
         if resolver: resolver.join()  # joins on halted resolver
         
-        # stops panel daemons
-        panels["header"].stop()
-        panels["log"].stop()
-        
-        panels["header"].join()
-        panels["log"].join()
-        
         conn.close() # joins on TorCtl event thread
         break
     elif key == curses.KEY_LEFT or key == curses.KEY_RIGHT:

Modified: arm/trunk/src/interface/graphing/__init__.py
===================================================================
--- arm/trunk/src/interface/graphing/__init__.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/interface/graphing/__init__.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -2,5 +2,5 @@
 Panels, popups, and handlers comprising the arm user interface.
 """
 
-__all__ = ["graphPanel.py", "bandwidthStats", "connStats", "psStats"]
+__all__ = ["graphPanel.py", "bandwidthStats", "connStats", "resourceStats"]
 

Deleted: arm/trunk/src/interface/graphing/psStats.py
===================================================================
--- arm/trunk/src/interface/graphing/psStats.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/interface/graphing/psStats.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -1,141 +0,0 @@
-"""
-Tracks configured ps stats. If non-numeric then this fails, providing a blank
-graph. By default this provides the cpu and memory usage of the tor process.
-"""
-
-from interface.graphing import graphPanel
-from util import conf, log, sysTools, torTools, uiTools
-
-# number of subsequent failed queries before giving up
-FAILURE_THRESHOLD = 5
-
-# attempts to use cached results from the header panel's ps calls
-HEADER_PS_PARAM = ["%cpu", "rss", "%mem", "etime"]
-
-DEFAULT_CONFIG = {"features.graph.ps.primaryStat": "%cpu",
-                  "features.graph.ps.secondaryStat": "rss",
-                  "features.graph.ps.cachedOnly": True,
-                  "log.graph.ps.invalidStat": log.WARN,
-                  "log.graph.ps.abandon": log.WARN}
-
-class PsStats(graphPanel.GraphStats):
-  """
-  Tracks ps stats, defaulting to system resource usage (cpu and memory usage).
-  """
-  
-  def __init__(self, config=None):
-    graphPanel.GraphStats.__init__(self)
-    self.failedCount = 0      # number of subsequent failed queries
-    
-    self._config = dict(DEFAULT_CONFIG)
-    if config: config.update(self._config)
-    
-    self.queryPid = torTools.getConn().getMyPid()
-    self.queryParam = [self._config["features.graph.ps.primaryStat"], self._config["features.graph.ps.secondaryStat"]]
-    
-    # If we're getting the same stats as the header panel then issues identical
-    # queries to make use of cached results. If not, then disable cache usage.
-    if self.queryParam[0] in HEADER_PS_PARAM and self.queryParam[1] in HEADER_PS_PARAM:
-      self.queryParam = list(HEADER_PS_PARAM)
-    else: self._config["features.graph.ps.cachedOnly"] = False
-    
-    # strips any empty entries
-    while "" in self.queryParam: self.queryParam.remove("")
-    
-    self.cacheTime = 3600 if self._config["features.graph.ps.cachedOnly"] else 1
-  
-  def getTitle(self, width):
-    return "System Resources:"
-  
-  def getRefreshRate(self):
-    # provides the rate at which the panel has new stats to display
-    if self._config["features.graph.ps.cachedOnly"]:
-      return int(conf.getConfig("arm").get("queries.ps.rate", 5))
-    else: return 1
-  
-  def getHeaderLabel(self, width, isPrimary):
-    avg = (self.primaryTotal if isPrimary else self.secondaryTotal) / max(1, self.tick)
-    lastAmount = self.lastPrimary if isPrimary else self.lastSecondary
-    
-    if isPrimary: statName = self._config["features.graph.ps.primaryStat"]
-    else: statName = self._config["features.graph.ps.secondaryStat"]
-    
-    # provides nice labels for failures and common stats
-    if not statName or self.failedCount >= FAILURE_THRESHOLD or not statName in self.queryParam:
-      return ""
-    elif statName == "%cpu":
-      return "CPU (%s%%, avg: %0.1f%%):" % (lastAmount, avg)
-    elif statName in ("rss", "size"):
-      # memory sizes are converted from MB to B before generating labels
-      statLabel = "Memory" if statName == "rss" else "Size"
-      usageLabel = uiTools.getSizeLabel(lastAmount * 1048576, 1)
-      avgLabel = uiTools.getSizeLabel(avg * 1048576, 1)
-      return "%s (%s, avg: %s):" % (statLabel, usageLabel, avgLabel)
-    else:
-      # generic label (first letter of stat name is capitalized)
-      statLabel = statName[0].upper() + statName[1:]
-      return "%s (%s, avg: %s):" % (statLabel, lastAmount, avg)
-  
-  def isVisible(self):
-    """
-    Hides graph if unable to fetch stats.
-    """
-    
-    if self.queryPid and self.queryParam and self.failedCount < FAILURE_THRESHOLD:
-      return graphPanel.GraphStats.isVisible(self)
-    else: return False
-  
-  def eventTick(self):
-    """
-    Processes a ps event.
-    """
-    
-    psResults = {} # mapping of stat names to their results
-    if self.queryPid and self.queryParam and self.failedCount < FAILURE_THRESHOLD:
-      queryCmd = "ps -p %s -o %s" % (self.queryPid, ",".join(self.queryParam))
-      psCall = sysTools.call(queryCmd, self.cacheTime, True)
-      
-      if psCall and len(psCall) == 2:
-        # ps provided results (first line is headers, second is stats)
-        stats = psCall[1].strip().split()
-        
-        if len(self.queryParam) == len(stats):
-          # we have a result to match each stat - constructs mapping
-          psResults = dict([(self.queryParam[i], stats[i]) for i in range(len(stats))])
-          self.failedCount = 0 # had a successful call - reset failure count
-      
-      if not psResults:
-        # ps call failed, if we fail too many times sequentially then abandon
-        # listing (probably due to invalid ps parameters)
-        self.failedCount += 1
-        
-        if self.failedCount == FAILURE_THRESHOLD:
-          msg = "failed several attempts to query '%s', abandoning ps graph" % queryCmd
-          log.log(self._config["log.graph.ps.abandon"], msg)
-    
-    # if something fails (no pid, ps call failed, etc) then uses last results
-    primary, secondary = self.lastPrimary, self.lastSecondary
-    
-    for isPrimary in (True, False):
-      if isPrimary: statName = self._config["features.graph.ps.primaryStat"]
-      else: statName = self._config["features.graph.ps.secondaryStat"]
-      
-      if statName in psResults:
-        try:
-          result = float(psResults[statName])
-          
-          # The 'rss' and 'size' parameters provide memory usage in KB. This is
-          # scaled up to MB so the graph's y-high is a reasonable value.
-          if statName in ("rss", "size"): result /= 1024.0
-          
-          if isPrimary: primary = result
-          else: secondary = result
-        except ValueError:
-          if self.queryParam != HEADER_PS_PARAM:
-            # custom stat provides non-numeric results - give a warning and stop querying it
-            msg = "unable to use non-numeric ps stat '%s' for graphing" % statName
-            log.log(self._config["log.graph.ps.invalidStat"], msg)
-            self.queryParam.remove(statName)
-    
-    self._processEvent(primary, secondary)
-

Added: arm/trunk/src/interface/graphing/resourceStats.py
===================================================================
--- arm/trunk/src/interface/graphing/resourceStats.py	                        (rev 0)
+++ arm/trunk/src/interface/graphing/resourceStats.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -0,0 +1,47 @@
+"""
+Tracks the system resource usage (cpu and memory) of the tor process.
+"""
+
+from interface.graphing import graphPanel
+from util import sysTools, torTools, uiTools
+
+class ResourceStats(graphPanel.GraphStats):
+  """
+  System resource usage tracker.
+  """
+  
+  def __init__(self):
+    graphPanel.GraphStats.__init__(self)
+    self.queryPid = torTools.getConn().getMyPid()
+  
+  def getTitle(self, width):
+    return "System Resources:"
+  
+  def getHeaderLabel(self, width, isPrimary):
+    avg = (self.primaryTotal if isPrimary else self.secondaryTotal) / max(1, self.tick)
+    lastAmount = self.lastPrimary if isPrimary else self.lastSecondary
+    
+    if isPrimary:
+      return "CPU (%0.1f%%, avg: %0.1f%%):" % (lastAmount, avg)
+    else:
+      # memory sizes are converted from MB to B before generating labels
+      usageLabel = uiTools.getSizeLabel(lastAmount * 1048576, 1)
+      avgLabel = uiTools.getSizeLabel(avg * 1048576, 1)
+      return "Memory (%s, avg: %s):" % (usageLabel, avgLabel)
+  
+  def eventTick(self):
+    """
+    Fetch the cached measurement of resource usage from the ResourceTracker.
+    """
+    
+    primary, secondary = 0, 0
+    if self.queryPid:
+      resourceTracker = sysTools.getResourceTracker(self.queryPid)
+      
+      if not resourceTracker.lastQueryFailed():
+        primary, _, secondary, _ = resourceTracker.getResourceUsage()
+        primary *= 100        # decimal percentage to whole numbers
+        secondary /= 1048576  # translate size to MB so axis labels are short
+    
+    self._processEvent(primary, secondary)
+

Modified: arm/trunk/src/interface/headerPanel.py
===================================================================
--- arm/trunk/src/interface/headerPanel.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/interface/headerPanel.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -32,35 +32,34 @@
 VERSION_STATUS_COLORS = {"new": "blue", "new in series": "blue", "obsolete": "red", "recommended": "green",  
                          "old": "red",  "unrecommended": "red",  "unknown": "cyan"}
 
-DEFAULT_CONFIG = {"queries.ps.rate": 5}
-
 class HeaderPanel(panel.Panel, threading.Thread):
   """
   Top area contenting tor settings and system information. Stats are stored in
   the vals mapping, keys including:
     tor/  version, versionStatus, nickname, orPort, dirPort, controlPort,
           exitPolicy, isAuthPassword (bool), isAuthCookie (bool)
-          *address, *fingerprint, *flags
+          *address, *fingerprint, *flags, pid, startTime
     sys/  hostname, os, version
-    stat/ *%torCpu, *%armCpu, *rss, *%mem, pid, *etime
+    stat/ *%torCpu, *%armCpu, *rss, *%mem
   
   * volatile parameter that'll be reset on each update
   """
   
-  def __init__(self, stdscr, startTime, config=None):
+  def __init__(self, stdscr, startTime):
     panel.Panel.__init__(self, stdscr, "header", 0)
     threading.Thread.__init__(self)
     self.setDaemon(True)
     
     self._isTorConnected = True
     self._lastUpdate = -1       # time the content was last revised
-    self._isLastDrawWide = False
-    self._isChanged = False     # new stats to be drawn if true
     self._isPaused = False      # prevents updates if true
     self._halt = False          # terminates thread if true
     self._cond = threading.Condition()  # used for pausing the thread
-    self._config = dict(DEFAULT_CONFIG)
     
+    # Time when the panel was paused or tor was stopped. This is used to
+    # freeze the uptime statistic (uptime increments normally when None).
+    self._haltTime = None
+    
     # The last arm cpu usage sampling taken. This is a tuple of the form:
     # (total arm cpu time, sampling timestamp)
     # 
@@ -75,8 +74,9 @@
     
     self._armCpuSampling = (sum(os.times()[:3]), startTime)
     
-    if config:
-      config.update(self._config, {"queries.ps.rate": 1})
+    # Last sampling received from the ResourceTracker, used to detect when it
+    # changes.
+    self._lastResourceFetch = -1
     
     self.vals = {}
     self.valsLock = threading.RLock()
@@ -152,13 +152,21 @@
     
     # Line 3 / Line 1 Right (system usage info)
     y, x = (0, leftWidth) if isWide else (2, 0)
-    if self.vals["stat/rss"] != "0": memoryLabel = uiTools.getSizeLabel(int(self.vals["stat/rss"]) * 1024)
+    if self.vals["stat/rss"] != "0": memoryLabel = uiTools.getSizeLabel(int(self.vals["stat/rss"]))
     else: memoryLabel = "0"
     
+    uptimeLabel = ""
+    if self.vals["tor/startTime"]:
+      if self._haltTime:
+        # freeze the uptime when paused or the tor process is stopped
+        uptimeLabel = uiTools.getShortTimeLabel(self._haltTime - self.vals["tor/startTime"])
+      else:
+        uptimeLabel = uiTools.getShortTimeLabel(time.time() - self.vals["tor/startTime"])
+    
     sysFields = ((0, "cpu: %s%% tor, %s%% arm" % (self.vals["stat/%torCpu"], self.vals["stat/%armCpu"])),
                  (27, "mem: %s (%s%%)" % (memoryLabel, self.vals["stat/%mem"])),
-                 (47, "pid: %s" % (self.vals["stat/pid"] if self._isTorConnected else "")),
-                 (59, "uptime: %s" % self.vals["stat/etime"]))
+                 (47, "pid: %s" % (self.vals["tor/pid"] if self._isTorConnected else "")),
+                 (59, "uptime: %s" % uptimeLabel))
     
     for (start, label) in sysFields:
       if start + len(label) <= rightWidth: self.addstr(y, x + start, label)
@@ -211,39 +219,55 @@
       # TODO: not sure what information to provide here...
       pass
     
-    self._isLastDrawWide = isWide
-    self._isChanged = False
     self.valsLock.release()
   
-  def redraw(self, forceRedraw=False, block=False):
-    # determines if the content needs to be redrawn or not
-    isWide = self.getParent().getmaxyx()[1] >= MIN_DUAL_COL_WIDTH
-    panel.Panel.redraw(self, forceRedraw or self._isChanged or isWide != self._isLastDrawWide, block)
-  
   def setPaused(self, isPause):
     """
     If true, prevents updates from being presented.
     """
     
-    self._isPaused = isPause
+    if not self._isPaused == isPause:
+      self._isPaused = isPause
+      if self._isTorConnected:
+        if isPause: self._haltTime = time.time()
+        else: self._haltTime = None
+      
+      # Redraw now so we'll be displaying the state right when paused
+      # (otherwise the uptime might be off by a second, and change when
+      # the panel's redrawn for other reasons).
+      self.redraw(True)
   
   def run(self):
     """
     Keeps stats updated, querying new information at a set rate.
     """
     
+    lastDraw = time.time() - 1
     while not self._halt:
-      timeSinceReset = time.time() - self._lastUpdate
-      psRate = self._config["queries.ps.rate"]
+      currentTime = time.time()
       
-      if self._isPaused or timeSinceReset < psRate or not self._isTorConnected:
-        sleepTime = max(0.5, psRate - timeSinceReset)
+      if self._isPaused or currentTime - lastDraw < 1 or not self._isTorConnected:
         self._cond.acquire()
-        if not self._halt: self._cond.wait(sleepTime)
+        if not self._halt: self._cond.wait(0.2)
         self._cond.release()
       else:
-        self._update()
-        self.redraw()
+        # Update the volatile attributes (cpu, memory, flags, etc) if we have
+        # a new resource usage sampling (the most dynamic stat) or its been
+        # twenty seconds since last fetched (so we still refresh occasionally
+        # when resource fetches fail).
+        # 
+        # Otherwise, just redraw the panel to change the uptime field.
+        
+        isChanged = False
+        if self.vals["tor/pid"]:
+          resourceTracker = sysTools.getResourceTracker(self.vals["tor/pid"])
+          isChanged = self._lastResourceFetch != resourceTracker.getRunCount()
+        
+        if isChanged or currentTime - self._lastUpdate >= 20:
+          self._update()
+        
+        self.redraw(True)
+        lastDraw += 1
   
   def stop(self):
     """
@@ -266,10 +290,14 @@
     
     if eventType == torTools.TOR_INIT:
       self._isTorConnected = True
+      if self._isPaused: self._haltTime = time.time()
+      else: self._haltTime = None
+      
       self._update(True)
-      self.redraw()
+      self.redraw(True)
     elif eventType == torTools.TOR_CLOSED:
       self._isTorConnected = False
+      self._haltTime = time.time()
       self._update()
       self.redraw(True)
   
@@ -324,8 +352,11 @@
       self.vals["sys/version"] = unameVals[2]
       
       pid = conn.getMyPid()
-      self.vals["stat/pid"] = pid if pid else ""
+      self.vals["tor/pid"] = pid if pid else ""
       
+      startTime = conn.getStartTime()
+      self.vals["tor/startTime"] = startTime if startTime else ""
+      
       # reverts volatile parameters to defaults
       self.vals["tor/fingerprint"] = "Unknown"
       self.vals["tor/flags"] = []
@@ -333,59 +364,42 @@
       self.vals["stat/%armCpu"] = "0"
       self.vals["stat/rss"] = "0"
       self.vals["stat/%mem"] = "0"
-      self.vals["stat/etime"] = ""
     
     # sets volatile parameters
-    volatile = {}
-    
     # TODO: This can change, being reported by STATUS_SERVER -> EXTERNAL_ADDRESS
     # events. Introduce caching via torTools?
     if self.vals["tor/address"] == "Unknown":
-      volatile["tor/address"] = conn.getInfo("address", self.vals["tor/address"])
+      self.vals["tor/address"] = conn.getInfo("address", self.vals["tor/address"])
     
-    volatile["tor/fingerprint"] = conn.getInfo("fingerprint", self.vals["tor/fingerprint"])
-    volatile["tor/flags"] = conn.getMyFlags(self.vals["tor/flags"])
+    self.vals["tor/fingerprint"] = conn.getInfo("fingerprint", self.vals["tor/fingerprint"])
+    self.vals["tor/flags"] = conn.getMyFlags(self.vals["tor/flags"])
     
-    # ps derived stats
-    psParams = ["%cpu", "rss", "%mem", "etime"]
-    if self.vals["stat/pid"]:
-      # if call fails then everything except etime are zeroed out (most likely
-      # tor's no longer running)
-      volatile["stat/%torCpu"] = "0"
-      volatile["stat/rss"] = "0"
-      volatile["stat/%mem"] = "0"
+    # ps or proc derived resource usage stats
+    if self.vals["tor/pid"]:
+      resourceTracker = sysTools.getResourceTracker(self.vals["tor/pid"])
       
-      # the ps call formats results as:
-      # %CPU   RSS %MEM     ELAPSED
-      # 0.3 14096  1.3       29:51
-      psRate = self._config["queries.ps.rate"]
-      psCall = sysTools.call("ps -p %s -o %s" % (self.vals["stat/pid"], ",".join(psParams)), psRate, True)
-      
-      if psCall and len(psCall) >= 2:
-        stats = psCall[1].strip().split()
-        
-        if len(stats) == len(psParams):
-          for i in range(len(psParams)):
-            paramName = psParams[i]
-            if paramName == "%cpu": paramName = "%torCpu"
-            volatile["stat/" + paramName] = stats[i]
+      if resourceTracker.lastQueryFailed():
+        self.vals["stat/%torCpu"] = "0"
+        self.vals["stat/rss"] = "0"
+        self.vals["stat/%mem"] = "0"
+      else:
+        cpuUsage, _, memUsage, memUsagePercent = resourceTracker.getResourceUsage()
+        self._lastResourceFetch = resourceTracker.getRunCount()
+        self.vals["stat/%torCpu"] = "%0.1f" % (100 * cpuUsage)
+        self.vals["stat/rss"] = str(memUsage)
+        self.vals["stat/%mem"] = "%0.1f" % (100 * memUsagePercent)
     
     # determines the cpu time for the arm process (including user and system
     # time of both the primary and child processes)
     
-    currentTime, totalCpuTime = time.time(), sum(os.times()[:3])
-    cpuDelta = totalCpuTime - self._armCpuSampling[0]
-    timeDelta = currentTime - self._armCpuSampling[1]
-    self.vals["stat/%armCpu"] = "%0.1f" % (100 * cpuDelta / timeDelta)
-    self._armCpuSampling = (totalCpuTime, currentTime)
+    totalArmCpuTime, currentTime = sum(os.times()[:3]), time.time()
+    armCpuDelta = totalArmCpuTime - self._armCpuSampling[0]
+    armTimeDelta = currentTime - self._armCpuSampling[1]
+    pythonCpuTime = armCpuDelta / armTimeDelta
+    sysCallCpuTime = sysTools.getSysCpuUsage()
+    self.vals["stat/%armCpu"] = "%0.1f" % (100 * (pythonCpuTime + sysCallCpuTime))
+    self._armCpuSampling = (totalArmCpuTime, currentTime)
     
-    # checks if any changes have been made and merges volatile into vals
-    
-    self._isChanged |= setStatic
-    for key, val in volatile.items():
-      self._isChanged |= self.vals[key] != val
-      self.vals[key] = val
-    
     self._lastUpdate = currentTime
     self.valsLock.release()
 

Modified: arm/trunk/src/settings.cfg
===================================================================
--- arm/trunk/src/settings.cfg	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/settings.cfg	2010-12-31 03:02:10 UTC (rev 24005)
@@ -46,6 +46,7 @@
 # [ARM_DEBUG] redrawing the log panel with the corrected content height (estimat was off by 4)
 # [ARM_DEBUG] GETINFO accounting/bytes-left (runtime: 0.0006)
 # [ARM_DEBUG] GETCONF MyFamily (runtime: 0.0007)
+# [ARM_DEBUG] Unable to query process resource usage from ps, waiting 6.25 seconds (unrecognized output from ps: ...)
 
 msg.BW READ:
 msg.DEBUG connection_handle_write(): After TLS write of
@@ -83,6 +84,7 @@
 msg.ARM_DEBUG GETINFO accounting/interval-end
 msg.ARM_DEBUG GETINFO accounting/hibernating
 msg.ARM_DEBUG GETCONF
+msg.ARM_DEBUG Unable to query process resource usage from ps
 
 # some config options are fetched via special values
 torrc.map HiddenServiceDir => HiddenServiceOptions

Modified: arm/trunk/src/util/sysTools.py
===================================================================
--- arm/trunk/src/util/sysTools.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/util/sysTools.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -6,7 +6,7 @@
 import time
 import threading
 
-from util import log, procTools
+from util import log, procTools, uiTools
 
 # mapping of commands to if they're available or not
 CMD_AVAILABLE_CACHE = {}
@@ -17,16 +17,44 @@
 CALL_CACHE_LOCK = threading.RLock() # governs concurrent modifications of CALL_CACHE
 
 PROCESS_NAME_CACHE = {} # mapping of pids to their process names
+RESOURCE_TRACKERS = {}  # mapping of pids to their resource tracker instances
 
-CONFIG = {"cache.sysCalls.size": 600,
+# Runtimes for system calls, used to estimate cpu usage. Entries are tuples of
+# the form:
+# (time called, runtime)
+RUNTIMES = []
+SAMPLING_PERIOD = 5 # duration of the sampling period in seconds
+
+CONFIG = {"queries.resourceUsage.rate": 5,
+          "cache.sysCalls.size": 600,
           "log.sysCallMade": log.DEBUG,
           "log.sysCallCached": None,
           "log.sysCallFailed": log.INFO,
-          "log.sysCallCacheGrowing": log.INFO}
+          "log.sysCallCacheGrowing": log.INFO,
+          "log.stats.failedProcResolution": log.DEBUG,
+          "log.stats.procResolutionFailover": log.INFO,
+          "log.stats.failedPsResolution": log.INFO}
 
 def loadConfig(config):
   config.update(CONFIG)
 
+def getSysCpuUsage():
+  """
+  Provides an estimate of the cpu usage for system calls made through this
+  module, based on a sampling period of five seconds. The os.times() function,
+  unfortunately, doesn't seem to take popen calls into account. This returns a
+  float representing the percentage used.
+  """
+  
+  currentTime = time.time()
+  
+  # removes any runtimes outside of our sampling period
+  while RUNTIMES and currentTime - RUNTIMES[0][0] > SAMPLING_PERIOD:
+    RUNTIMES.pop(0)
+  
+  runtimeSum = sum([entry[1] for entry in RUNTIMES])
+  return runtimeSum / SAMPLING_PERIOD
+
 def isAvailable(command, cached=True):
   """
   Checks the current PATH to see if a command is available or not. If a full
@@ -230,9 +258,16 @@
     else: raise errorExc
   else:
     # log call information and if we're caching then save the results
-    msg = "system call: %s (runtime: %0.2f)" % (command, time.time() - startTime)
+    currentTime = time.time()
+    runtime = currentTime - startTime
+    msg = "system call: %s (runtime: %0.2f)" % (command, runtime)
     log.log(CONFIG["log.sysCallMade"], msg)
     
+    # append the runtime, and remove any outside of the sampling period
+    RUNTIMES.append((currentTime, runtime))
+    while RUNTIMES and currentTime - RUNTIMES[0][0] > SAMPLING_PERIOD:
+      RUNTIMES.pop(0)
+    
     if cacheAge > 0:
       CALL_CACHE_LOCK.acquire()
       CALL_CACHE[command] = (time.time(), results)
@@ -240,3 +275,218 @@
     
     return results
 
+def isTrackerAlive(pid):
+  """
+  Provides true if a running, singleton instance exists for the given pid,
+  false otherwise.
+  
+  Arguments:
+    pid - pid of the process being tracked
+  """
+  
+  if pid in RESOURCE_TRACKERS:
+    if RESOURCE_TRACKERS[pid].isAlive(): return True
+    else: del RESOURCE_TRACKERS[pid]
+  
+  return False
+
+def getResourceTracker(pid):
+  """
+  Provides a running singleton ResourceTracker instance for the given pid.
+  
+  Arguments:
+    pid - pid of the process being tracked
+  """
+  
+  if pid in RESOURCE_TRACKERS:
+    tracker = RESOURCE_TRACKERS[pid]
+    if tracker.isAlive(): return tracker
+    else: del RESOURCE_TRACKERS[pid]
+  
+  tracker = ResourceTracker(pid, CONFIG["queries.resourceUsage.rate"])
+  RESOURCE_TRACKERS[pid] = tracker
+  tracker.start()
+  return tracker
+
+class ResourceTracker(threading.Thread):
+  """
+  Periodically fetches the resource usage (cpu and memory usage) for a given
+  process.
+  """
+  
+  def __init__(self, processPid, resolveRate):
+    """
+    Initializes a new resolver daemon. When no longer needed it's suggested
+    that this is stopped.
+    
+    Arguments:
+      processPid  - pid of the process being tracked
+      resolveRate - time between resolving resource usage
+    """
+    
+    threading.Thread.__init__(self)
+    self.setDaemon(True)
+    
+    self.processPid = processPid
+    self.resolveRate = resolveRate
+    
+    self.cpuSampling = 0.0  # latest cpu usage sampling
+    self.cpuAvg = 0.0       # total average cpu usage
+    self.memUsage = 0       # last sampled memory usage in bytes
+    self.memUsagePercentage = 0.0 # percentage memory usage
+    
+    # resolves usage via proc results if true, ps otherwise
+    self._useProc = procTools.isProcAvailable()
+    
+    # used to get the deltas when querying cpu time
+    self._lastCpuTotal = 0
+    
+    self.lastLookup = -1
+    self._halt = False      # terminates thread if true
+    self._valLock = threading.RLock()
+    self._cond = threading.Condition()  # used for pausing the thread
+    
+    # number of successful calls we've made
+    self._runCount = 0
+    
+    # sequential times we've failed with this method of resolution
+    self._failureCount = 0
+  
+  def getResourceUsage(self):
+    """
+    Provides the last cached resource usage as a tuple of the form:
+    (cpuUsage_sampling, cpuUsage_avg, memUsage_bytes, memUsage_percent)
+    """
+    
+    self._valLock.acquire()
+    results = (self.cpuSampling, self.cpuAvg, self.memUsage, self.memUsagePercentage)
+    self._valLock.release()
+    
+    return results
+  
+  def getRunCount(self):
+    """
+    Provides the number of times we've successfully fetched the resource
+    usages.
+    """
+    
+    return self._runCount
+  
+  def lastQueryFailed(self):
+    """
+    Provides true if, since we fetched the currently cached results, we've
+    failed to get new results. False otherwise.
+    """
+    
+    return self._failureCount != 0
+  
+  def run(self):
+    while not self._halt:
+      timeSinceReset = time.time() - self.lastLookup
+      
+      if timeSinceReset < self.resolveRate:
+        sleepTime = max(0.2, self.resolveRate - timeSinceReset)
+        
+        self._cond.acquire()
+        if not self._halt: self._cond.wait(sleepTime)
+        self._cond.release()
+        
+        continue # done waiting, try again
+      
+      newValues = {}
+      try:
+        if self._useProc:
+          utime, stime, startTime = procTools.getStats(self.processPid, procTools.STAT_CPU_UTIME, procTools.STAT_CPU_STIME, procTools.STAT_START_TIME)
+          totalCpuTime = float(utime) + float(stime)
+          cpuDelta = totalCpuTime - self._lastCpuTotal
+          newValues["cpuSampling"] = cpuDelta / timeSinceReset
+          newValues["cpuAvg"] = totalCpuTime / (time.time() - float(startTime))
+          newValues["_lastCpuTotal"] = totalCpuTime
+          
+          memUsage = int(procTools.getMemoryUsage(self.processPid)[0])
+          totalMemory = procTools.getPhysicalMemory()
+          newValues["memUsage"] = memUsage
+          newValues["memUsagePercentage"] = float(memUsage) / totalMemory
+        else:
+          # the ps call formats results as:
+          # %CPU   RSS %MEM     ELAPSED
+          # 0.3 14096  1.3       29:51
+          psCall = call("ps -p %s -o cputime,etime,rss,%%mem" % self.processPid)
+          
+          isSuccessful = False
+          if psCall and len(psCall) >= 2:
+            stats = psCall[1].strip().split()
+            
+            if len(stats) == 4:
+              try:
+                totalCpuTime = uiTools.parseShortTimeLabel(stats[0])
+                uptime = uiTools.parseShortTimeLabel(stats[1])
+                cpuDelta = totalCpuTime - self._lastCpuTotal
+                newValues["cpuSampling"] = cpuDelta / timeSinceReset
+                newValues["cpuAvg"] = totalCpuTime / uptime if uptime else 0.0 # etime can be 00:00 for a new process
+                newValues["_lastCpuTotal"] = totalCpuTime
+                
+                newValues["memUsage"] = int(stats[2]) * 1024 # ps size is in kb
+                newValues["memUsagePercentage"] = float(stats[3]) / 100.0
+                isSuccessful = True
+              except ValueError, exc: pass
+          
+          if not isSuccessful:
+            raise IOError("unrecognized output from ps: %s" % psCall)
+      except IOError, exc:
+        self._failureCount += 1
+        
+        if self._useProc:
+          if self._failureCount >= 3:
+            # We've failed three times resolving via proc. Warn, and fall back
+            # to ps resolutions.
+            msg = "Failed three attempts to get process resource usage from proc, falling back to ps (%s)" % exc
+            log.log(CONFIG["log.stats.procResolutionFailover"], msg)
+            
+            self._useProc = False
+            self._failureCount = 1 # prevents lastQueryFailed() from thinking that we succeeded
+          else:
+            # wait a bit and try again
+            msg = "Unable to query process resource usage from proc (%s)" % exc
+            log.log(CONFIG["log.stats.failedProcResolution"], msg)
+            self._cond.acquire()
+            self._cond.wait(0.5)
+            self._cond.release()
+        else:
+          # exponential backoff on making failed ps calls
+          sleepTime = 0.01 * (2 ** self._failureCount) + self._failureCount
+          msg = "Unable to query process resource usage from ps, waiting %0.2f seconds (%s)" % (sleepTime, exc)
+          log.log(CONFIG["log.stats.failedPsResolution"], msg)
+          self._cond.acquire()
+          self._cond.wait(sleepTime)
+          self._cond.release()
+      
+      # If this is the first run then the cpuSampling stat is meaningless
+      # (there isn't a previous tick to sample from so it's zero at this
+      # point). Setting it to the average, which is a fairer estimate.
+      if newValues and self.lastLookup == -1:
+        newValues["cpuSampling"] = newValues["cpuAvg"]
+          
+      # sets the new values
+      if newValues:
+        self._valLock.acquire()
+        self.cpuSampling = newValues["cpuSampling"]
+        self.cpuAvg = newValues["cpuAvg"]
+        self.memUsage = newValues["memUsage"]
+        self.memUsagePercentage = newValues["memUsagePercentage"]
+        self._lastCpuTotal = newValues["_lastCpuTotal"]
+        self.lastLookup = time.time()
+        self._runCount += 1
+        self._failureCount = 0
+        self._valLock.release()
+  
+  def stop(self):
+    """
+    Halts further resolutions and terminates the thread.
+    """
+    
+    self._cond.acquire()
+    self._halt = True
+    self._cond.notifyAll()
+    self._cond.release()
+

Modified: arm/trunk/src/util/torTools.py
===================================================================
--- arm/trunk/src/util/torTools.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/util/torTools.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -17,7 +17,7 @@
 
 from TorCtl import TorCtl, TorUtil
 
-from util import log, sysTools
+from util import log, procTools, sysTools, uiTools
 
 # enums for tor's controller state:
 # TOR_INIT - attached to a new controller or restart/sighup signal received
@@ -42,7 +42,7 @@
 CACHE_ARGS = ("version", "config-file", "exit-policy/default", "fingerprint",
               "config/names", "info/names", "features/names", "events/names",
               "nsEntry", "descEntry", "bwRate", "bwBurst", "bwObserved",
-              "bwMeasured", "flags", "pid", "pathPrefix")
+              "bwMeasured", "flags", "pid", "pathPrefix", "startTime")
 
 TOR_CTL_CLOSE_MSG = "Tor closed control connection. Exiting event thread."
 UNKNOWN = "UNKNOWN" # value used by cached information if undefined
@@ -50,6 +50,7 @@
           "features.pathPrefix": "",
           "log.torCtlPortClosed": log.NOTICE,
           "log.torGetInfo": log.DEBUG,
+          "log.torGetInfoCache": None,
           "log.torGetConf": log.DEBUG,
           "log.torSetConf": log.INFO,
           "log.torPrefixPathInvalid": log.NOTICE,
@@ -353,9 +354,12 @@
     if not isFromCache and result and param in CACHE_ARGS:
       self._cachedParam[param] = result
     
-    runtimeLabel = "cache fetch" if isFromCache else "runtime: %0.4f" % (time.time() - startTime)
-    msg = "GETINFO %s (%s)" % (param, runtimeLabel)
-    log.log(CONFIG["log.torGetInfo"], msg)
+    if isFromCache:
+      msg = "GETINFO %s (cache fetch)" % param
+      log.log(CONFIG["log.torGetInfoCache"], msg)
+    else:
+      msg = "GETINFO %s (runtime: %0.4f)" % (param, time.time() - startTime)
+      log.log(CONFIG["log.torGetInfo"], msg)
     
     self.connLock.release()
     
@@ -631,6 +635,17 @@
     if result == UNKNOWN: return ""
     else: return result
   
+  def getStartTime(self):
+    """
+    Provides the unix time for when the tor process first started. If this
+    can't be determined then this provides None.
+    """
+    
+    result = self._getRelayAttr("startTime", None)
+    
+    if result == UNKNOWN: return None
+    else: return result
+  
   def getStatus(self):
     """
     Provides a tuple consisting of the control port's current status and unix
@@ -1063,6 +1078,20 @@
         
         self._pathPrefixLogging = False # prevents logging if fetched again
         result = prefixPath
+      elif key == "startTime":
+        myPid = self.getMyPid()
+        
+        if myPid:
+          try:
+            if procTools.isProcAvailable():
+              result = float(procTools.getStats(myPid, procTools.STAT_START_TIME)[0])
+            else:
+              psCall = sysTools.call("ps -p %s -o etime" % myPid)
+              
+              if psCall and len(psCall) >= 2:
+                etimeEntry = psCall[1].strip()
+                result = time.time() - uiTools.parseShortTimeLabel(etimeEntry)
+          except: pass
       
       # cache value
       if result: self._cachedParam[key] = result

Modified: arm/trunk/src/util/uiTools.py
===================================================================
--- arm/trunk/src/util/uiTools.py	2010-12-31 01:41:07 UTC (rev 24004)
+++ arm/trunk/src/util/uiTools.py	2010-12-31 03:02:10 UTC (rev 24005)
@@ -298,13 +298,74 @@
   
   timeLabels = []
   
-  for countPerUnit, shortLabel, longLabel in TIME_UNITS:
+  for countPerUnit, _, _ in TIME_UNITS:
     if seconds >= countPerUnit:
       timeLabels.append(_getLabel(TIME_UNITS, seconds, 0, isLong))
       seconds %= countPerUnit
   
   return timeLabels
 
+def getShortTimeLabel(seconds):
+  """
+  Provides a time in the following format:
+  [[dd-]hh:]mm:ss
+  
+  Arguments:
+    seconds - source number of seconds for conversion
+  """
+  
+  timeComp = {}
+  
+  for amount, _, label in TIME_UNITS:
+    count = int(seconds / amount)
+    seconds %= amount
+    timeComp[label.strip()] = count
+  
+  labelPrefix = ""
+  if timeComp["day"]:
+    labelPrefix = "%i-%02i:" % (timeComp["day"], timeComp["hour"])
+  elif timeComp["hour"]:
+    labelPrefix = "%02i:" % timeComp["hour"]
+  
+  return "%s%02i:%02i" % (labelPrefix, timeComp["minute"], timeComp["second"])
+
+def parseShortTimeLabel(timeEntry):
+  """
+  Provides the number of seconds corresponding to the formatting used for the
+  cputime and etime fields of ps:
+  [[dd-]hh:]mm:ss
+  
+  If the input entry is malformed then this raises a ValueError.
+  
+  Arguments:
+    timeEntry - formatted ps time entry
+  """
+  
+  days, hours, minutes, seconds = 0, 0, 0, 0
+  errorMsg = "invalidly formatted ps time entry: %s" % timeEntry
+  
+  dateDivider = timeEntry.find("-")
+  if dateDivider != -1:
+    days = int(timeEntry[:dateDivider])
+    timeEntry = timeEntry[dateDivider+1:]
+  
+  timeComp = timeEntry.split(":")
+  if len(timeComp) == 3:
+    hours, minutes, seconds = timeComp
+  elif len(timeComp) == 2:
+    minutes, seconds = timeComp
+  else:
+    raise ValueError(errorMsg)
+  
+  try:
+    timeSum = int(seconds)
+    timeSum += int(minutes) * 60
+    timeSum += int(hours) * 3600
+    timeSum += int(days) * 86400
+    return timeSum
+  except ValueError:
+    raise ValueError(errorMsg)
+
 class Scroller:
   """
   Tracks the scrolling position when there might be a visible cursor. This



More information about the tor-commits mailing list