[or-cvs] r18535: {torflow} Update SoupDiffer to be more like JSDiffer. This also makes it pickleable. (torflow/trunk/NetworkScanners)

mikeperry at seul.org mikeperry at seul.org
Sat Feb 14 00:33:38 UTC 2009


Author: mikeperry
Date: 2009-02-13 19:33:38 -0500 (Fri, 13 Feb 2009)
New Revision: 18535

Modified:
   torflow/trunk/NetworkScanners/libsoat.py
   torflow/trunk/NetworkScanners/soat.py
Log:

Update SoupDiffer to be more like JSDiffer. This also makes
it pickleable, which means we can store it and keep diffs
across many Non-Tor fetches.



Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py	2009-02-13 22:43:16 UTC (rev 18534)
+++ torflow/trunk/NetworkScanners/libsoat.py	2009-02-14 00:33:38 UTC (rev 18535)
@@ -370,35 +370,22 @@
             ret+=line+"\n"
 
       if soup and tor_soup and old_soup:
-        old_vs_new = SoupDiffer(old_soup, soup)
-        new_vs_old = SoupDiffer(soup, old_soup)
-        new_vs_tor = SoupDiffer(soup, tor_soup)
+        soupdiff = SoupDiffer(old_soup, soup)
 
-        # I'm an evil man and I'm going to CPU hell..
-        changed_tags = SoupDiffer.merge_tag_maps(
-                            old_vs_new.changed_tags_with_attrs(),
-                            new_vs_old.changed_tags_with_attrs())
+        more_tags = soupdiff.show_changed_tags(tor_soup)     
+        more_attrs = soupdiff.show_changed_attrs(tor_soup)
+        more_content = soupdiff.show_changed_content(tor_soup)
 
-        changed_attributes = SoupDiffer.merge_tag_maps(
-                                old_vs_new.changed_attributes_by_tag(),
-                                new_vs_old.changed_attributes_by_tag())
-
-        changed_content = bool(new_vs_old.changed_content() or old_vs_new.changed_content())
-
-        more_tags = new_vs_tor.more_changed_tags(changed_tags)     
-        more_attrs = new_vs_tor.more_changed_attrs(changed_attributes)
-        more_content = new_vs_tor.changed_content()
-
         if more_tags:
           ret += "\nTor changed tags:\n"
           ret += more_tags
         if more_attrs:
           ret += "\nTor changed attrs:\n"
           ret += more_attrs
-        if not changed_content and more_content:
+        if not soupdiff.content_changed and more_content:
           ret += "\nChanged Content:\n"
           ret += "\n".join(more_content)+"\n"
-        if (changed_content or not more_content) and not more_tags and not more_attrs:
+        if (soupdiff.content_changed or not more_content) and not more_tags and not more_attrs:
           ret += "\nSoupDiffer claims false positive.\n"
           jsdiff = JSSoupDiffer(old_soup)
           jsdiff.prune_differences(soup)
@@ -665,153 +652,119 @@
 class SoupDiffer:
   """ Diff two soup tag sets, optionally writing diffs to outfile. """
   def __init__(self, soup_old, soup_new):
-    self.soup_old = soup_old
-    self.soup_new = soup_new
+    tags_old = self._get_tags(soup_old)
+    tags_new = self._get_tags(soup_new)
+    self.tag_pool = tags_new | tags_old
+    self.changed_tag_map = {}
+    self._update_changed_tag_map(tags_old, tags_new)
+    self._update_changed_tag_map(tags_new, tags_old)
 
-  def changed_tags(self):
-    """ Return a list of tags changed or added to soup_new as strings """
-    tags_old = sets.Set(map(str, 
-           [tag for tag in self.soup_old.findAll() if isinstance(tag, Tag)]))
-    tags_new = sets.Set(map(str, 
-           [tag for tag in self.soup_new.findAll() if isinstance(tag, Tag)]))
-    ret = list(tags_new - tags_old)
-    ret.sort()
-    return ret
+    attrs_new = self._get_attributes(soup_new)
+    attrs_old = self._get_attributes(soup_old)
+    self.attr_pool = attrs_new | attrs_old
+    self.changed_attr_map = {}
+    self._update_changed_attr_map(attrs_new, attrs_old)
+    self._update_changed_attr_map(attrs_old, attrs_new)
 
-  def changed_tags_with_attrs(self):
+    cntnt_new = self._get_content(soup_new)
+    cntnt_old = self._get_content(soup_old)
+    self.content_pool = cntnt_new | cntnt_old
+    self.content_changed = bool(cntnt_new ^ cntnt_old) 
+    
+  def rebase(self, new_dir):
+    pass
+    # XXX
+
+  def _get_tags(self, soup):
+    return sets.Set(map(str, 
+           [tag for tag in soup.findAll() if isinstance(tag, Tag)]))
+
+  def _get_attributes(self, soup):
+    attr_soup = [(tag.name, tag.attrs) for tag in soup.findAll()]
+    attrs = sets.Set([])
+    for (tag, attr_list) in attr_soup:
+      for at in attr_list:
+        attrs.add((tag, at)) 
+    return attrs
+
+  def _get_content(self, soup):
+    return sets.Set(map(str, 
+      [tag for tag in soup.findAll() if not isinstance(tag, Tag)]))
+  
+  def _update_changed_tag_map(self, tags_old, tags_new):
     """ Create a map of changed tags to ALL attributes that tag
         has ever had (changed or not) """
-    changed_tags = {}
-    for tags in map(TheChosenSoup, self.changed_tags()):
+    changed_tags = list(tags_new - tags_old)
+    for tags in map(TheChosenSoup, changed_tags):
       for t in tags.findAll():
         if t.name not in changed_tags:
-          changed_tags[t.name] = sets.Set([])
+          self.changed_tag_map[t.name] = sets.Set([])
         for attr in t.attrs:
-          changed_tags[t.name].add(attr[0])
-    return changed_tags
+          self.changed_tag_map[t.name].add(attr[0])
 
+  def _update_changed_attr_map(self, attrs_old, attrs_new):
+    """ Transform the list of (tag, attribute) pairings for new/changed
+        attributes into a map. This allows us to quickly see
+        if any attributes changed for a specific tag. """
+    changed_attributes = list(attrs_new - attrs_old)
+    for (tag, attr) in changed_attributes:
+      if tag not in self.changed_attr_map:
+        self.changed_attr_map[tag] = sets.Set([])
+      self.changed_attr_map[tag].add(attr[0])
 
-  def has_more_changed_tags(self, tag_attr_map):
-    """ Returns true if we have additional tags with additional
-        attributes that were not present in tag_attr_map 
-        (returned from changed_tags_with_attrs) """
-    for tags in map(TheChosenSoup, self.changed_tags()):
-      for t in tags.findAll():
-        if t.name not in tag_attr_map:
-          return True
-        else:
-          for attr in t.attrs:
-            if attr[0] not in tag_attr_map[t.name] \
-                and attr[0] in attrs_to_check_map:
-              return True
-    return False
+  def _update_changed_content(self, content_old, content_new):
+    # FIXME: This could be tracked by parent tag+attr
+    if not self.content_changed:
+      self.content_changed = bool(content_old ^ content_new)
 
-  def more_changed_tags(self, tag_attr_map):
+  def prune_differences(self, soup):
+    tags = self._get_tags(soup)
+    attrs = self._get_attributes(soup)
+    cntnt = self._get_content(soup)
+
+    self._update_changed_tag_map(self.tag_pool, tags)
+    self._update_changed_attr_map(self.attr_pool, attrs)
+    self._update_changed_content(self.content_pool, cntnt)
+    self.tag_pool.union_update(tags)
+    self.attr_pool.union_update(attrs)
+    self.content_pool.union_update(cntnt)
+
+  def show_changed_tags(self, soup):
+    soup_tags = self._get_tags(soup)
+    new_tags = soup_tags - self.tag_pool
     ret = ""
-    for tags in map(TheChosenSoup, self.changed_tags()):
+    for tags in map(TheChosenSoup, new_tags):
       for t in tags.findAll():
-        if t.name not in tag_attr_map:
+        if t.name not in self.changed_attr_map:
           ret += " New Tag: "+str(t)+"\n"
         else:
           for attr in t.attrs:
-            if attr[0] not in tag_attr_map[t.name] \
+            if attr[0] not in self.changed_attr_map[t.name] \
                  and attr[0] in attrs_to_check_map:
               ret += " New Attr "+attr[0]+": "+str(t)+"\n"
     return ret
 
-  def _get_attributes(self):
-    attrs_old = [(tag.name, tag.attrs) for tag in self.soup_old.findAll()]
-    attrs_new = [(tag.name, tag.attrs) for tag in self.soup_new.findAll()]
-    attr_old = []
-    for (tag, attr_list) in attrs_old:
-      for attr in attr_list:
-        attr_old.append((tag, attr)) 
-    attr_new = []
-    for (tag, attr_list) in attrs_new:
-      for attr in attr_list:
-        attr_old.append((tag, attr)) 
-    return (attr_old, attr_new)
-    
-  def changed_attributes(self):
-    """ Return a list of attributes added to soup_new """
-    (attr_old, attr_new) = self._get_attributes()
-    ret = list(sets.Set(attr_new) - sets.Set(attr_old))
-    ret.sort()
-    return ret
-
-  def changed_attributes_by_tag(self):
-    """ Transform the list of (tag, attribute) pairings for new/changed
-        attributes into a map. This allows us to quickly see
-        if any attributes changed for a specific tag. """
-    changed_attributes = {}
-    for (tag, attr) in self.changed_attributes():
-      if tag not in changed_attributes:
-        changed_attributes[tag] = sets.Set([])
-      changed_attributes[tag].add(attr[0])
-    return changed_attributes 
-
-  def merge_tag_maps(tag_map1, tag_map2):
-    " Merges either two tag_attr_maps or two attrs_by_tag maps "
-    ret = copy.deepcopy(tag_map1)
-    for tag in tag_map2:
-      if tag not in ret:
-        ret[tag] = copy.deepcopy(tag_map2[tag])
-      else:
-        ret[tag].union_update(tag_map2[tag])
-    return ret
-  merge_tag_maps = Callable(merge_tag_maps)
-
-  def has_more_changed_attrs(self, attrs_by_tag):
-    """ Returns true if we have any tags with additional
-        changed attributes that were not present in attrs_by_tag
-        (returned from changed_attributes_by_tag) """
-    for (tag, attr) in self.changed_attributes():
-      if tag in attrs_by_tag:
-        if attr[0] not in attrs_by_tag[tag] \
-            and attr[0] in attrs_to_check_map:
-          return True
-      else:
-        return True
-    return False
-
-  def more_changed_attrs(self, attrs_by_tag):
+  def show_changed_attrs(self, soup):
+    soup_attrs = self._get_attributes(soup)
+    new_attrs = soup_attrs - self.attr_pool
     ret = ""
-    for (tag, attr) in self.changed_attributes():
-      if tag in attrs_by_tag:
-        if attr[0] not in attrs_by_tag[tag] \
+    for (tag, attr) in new_attrs:
+      if tag in self.changed_attr_map:
+        if attr[0] not in self.changed_attr_map[tag] \
             and attr[0] in attrs_to_check_map:
           ret += " New Attr "+attr[0]+": "+tag+" "+attr[0]+'="'+attr[1]+'"\n'
       else:
         ret += " New Tag: "+tag+" "+attr[0]+'="'+attr[1]+'"\n'
     return ret
 
-
-  def changed_content(self):
+  def show_changed_content(self, soup):
     """ Return a list of tag contents changed in soup_new """
-    tags_old = sets.Set(map(str, 
-      [tag for tag in self.soup_old.findAll() if not isinstance(tag, Tag)]))
-    tags_new = sets.Set(map(str, 
-      [tag for tag in self.soup_new.findAll() if not isinstance(tag, Tag)]))
-    ret = list(tags_new - tags_old)
+    content = self._get_content(soup)
+    ret = list(content - self.content_pool)
     ret.sort()
     return ret
 
-  def __str__(self):
-    tags = self.changed_tags()
-    out = "Tags:\n"+"\n".join(tags)
-    attrs = self.changed_attributes()
-    out += "\n\nAttrs:\n"
-    for (tag, a) in attrs:
-      out += a[0]+"="+a[1]+"\n"
-    content = self.changed_content()
-    out += "\n\nContent:\n"+"\n".join(map(str, content))
-    return out
 
-  def write_diff(self, outfile):
-    f = open(outfile, "w")
-    f.write(str(self))
-    f.close()
-
 class JSDiffer:
   def __init__(self, js_string):
     if HAVE_PYPY: self.ast_cnts = self._count_ast_elements(js_string)

Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-02-13 22:43:16 UTC (rev 18534)
+++ torflow/trunk/NetworkScanners/soat.py	2009-02-14 00:33:38 UTC (rev 18535)
@@ -195,8 +195,11 @@
       # Convert self.successes table from integers to sets.
       # Yes, this is a hack, and yes, it will bias results
       # away from the filter, but hey, at least it will still run.
+      self._pickle_revision = 1
+      
       for addr in self.successes.keys():
-        self.successes[addr] = sets.Set(xrange(0,self.successes[addr]))
+        if type(self.successes[addr]) == int:
+          self.successes[addr] = sets.Set(xrange(0,self.successes[addr]))
       plog("INFO", "Upgraded "+self.__class__.__name__+" to v1")
 
   def refill_targets(self):
@@ -1117,32 +1120,19 @@
     # 3. Compare list of changed tags for tor vs new and
     #    see if any extra tags changed or if new attributes
     #    were added to additional tags
-    old_vs_new = SoupDiffer(orig_soup, new_soup)
-    new_vs_old = SoupDiffer(new_soup, orig_soup)
-    new_vs_tor = SoupDiffer(new_soup, tor_soup)
+    soupdiff = SoupDiffer(orig_soup, new_soup)
+    
+    more_tags = soupdiff.show_changed_tags(tor_soup)     
+    more_attrs = soupdiff.show_changed_attrs(tor_soup)
+    more_content = soupdiff.show_changed_content(tor_soup)
 
-    # I'm an evil man and I'm going to CPU hell..
-    changed_tags = SoupDiffer.merge_tag_maps(
-                        old_vs_new.changed_tags_with_attrs(),
-                        new_vs_old.changed_tags_with_attrs())
-
-    changed_attributes = SoupDiffer.merge_tag_maps(
-                            old_vs_new.changed_attributes_by_tag(),
-                            new_vs_old.changed_attributes_by_tag())
-
-    changed_content = bool(new_vs_old.changed_content() or old_vs_new.changed_content())
-
-    more_tags = new_vs_tor.more_changed_tags(changed_tags)     
-    more_attrs = new_vs_tor.more_changed_attrs(changed_attributes)
-    more_content = new_vs_tor.changed_content()
- 
     # Verify all of our changed tags are present here 
-    if more_tags or more_attrs or (more_content and not changed_content):
+    if more_tags or more_attrs or (more_content and not soupdiff.content_changed):
       false_positive = False
       plog("NOTICE", "SoupDiffer finds differences for "+address)
       plog("NOTICE", "New Tags:\n"+more_tags)
       plog("NOTICE", "New Attrs:\n"+more_attrs)
-      if more_content and not changed_content:
+      if more_content and not soupdiff.content_changed:
         plog("NOTICE", "New Content:\n"+more_content)
     else:
       plog("INFO", "SoupDiffer predicts false_positive")



More information about the tor-commits mailing list