[tor-commits] [metrics-tasks/master] Add scripts written by nickm to analyze consensus diff sizes (#7009).

karsten at torproject.org karsten at torproject.org
Mon Jul 1 06:34:25 UTC 2013


commit 869036cceac9468db6daa7e3c9d878b41432a6a7
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Mon Jul 1 08:33:39 2013 +0200

    Add scripts written by nickm to analyze consensus diff sizes (#7009).
---
 task-7009/README              |    2 +
 task-7009/build_md_table_2.py |   20 +++++++
 task-7009/cmds5               |   65 +++++++++++++++++++++++
 task-7009/condiff.py          |   91 ++++++++++++++++++++++++++++++++
 task-7009/condiff2.py         |  115 +++++++++++++++++++++++++++++++++++++++++
 task-7009/cy.py               |  104 +++++++++++++++++++++++++++++++++++++
 task-7009/individually.py     |   43 +++++++++++++++
 task-7009/md-kludge.py        |   56 ++++++++++++++++++++
 task-7009/pairwise.py         |   86 ++++++++++++++++++++++++++++++
 task-7009/wround.py           |   39 ++++++++++++++
 10 files changed, 621 insertions(+)

diff --git a/task-7009/README b/task-7009/README
new file mode 100644
index 0000000..a5caa39
--- /dev/null
+++ b/task-7009/README
@@ -0,0 +1,2 @@
+Scripts written by nickm to analyze consensus diff sizes.
+
diff --git a/task-7009/build_md_table_2.py b/task-7009/build_md_table_2.py
new file mode 100755
index 0000000..9ab4051
--- /dev/null
+++ b/task-7009/build_md_table_2.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+"""Map the fourth token of each "r" line to the digest of a later "m" line.
+
+Usage: build_md_table_2.py FILE...
+
+For every "m" line accepted by is_selected_m_line(), the third token of that
+line (which must start with "sha256=") is stored under the fourth token of
+the closest preceding "r" line.  The table is printed as "KEY sha256=VALUE"
+lines, which is the line shape md-kludge.py's read_table() parses.
+"""
+import sys
+
+
+def is_selected_m_line(line):
+    """Return True for an "m" line that starts with "m 8" or contains ",8".
+
+    NOTE(review): presumably this picks the lines that list method 8 -- confirm.
+    """
+    return line.startswith('m 8') or (line.startswith('m ') and ",8" in line)
+
+
+def scan(fname, result):
+    """Record every selected digest found in fname in the result dict."""
+    # Reset per file so an "m" line can never pair with an "r" line that was
+    # read from a previous file.
+    last_r = None
+    with open(fname, 'r') as f:
+        for line in f:
+            if line.startswith('r '):
+                last_r = line
+            if is_selected_m_line(line):
+                if last_r is None:
+                    raise ValueError("%s: 'm' line before any 'r' line" % fname)
+                sha = line.split()[2]
+                if not sha.startswith("sha256="):
+                    raise ValueError("%s: unexpected digest %r" % (fname, sha))
+                result[last_r.split()[3]] = sha
+
+
+def main(fnames):
+    """Scan every file, then print the combined table (later files win)."""
+    result = {}
+    for fname in fnames:
+        scan(fname, result)
+    for desc_id, sha in result.items():
+        print("%s %s" % (desc_id, sha))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/task-7009/cmds5 b/task-7009/cmds5
new file mode 100644
index 0000000..e2df77a
--- /dev/null
+++ b/task-7009/cmds5
@@ -0,0 +1,65 @@
+ ./individually.py gz */* >> RESULTS5
+ ./individually.py bz2 */* >> RESULTS5
+ ./individually.py xz */* >> RESULTS5
+ ./individually.py uncompressed */* >> RESULTS5
+
+ ./pairwise.py diff_gz 1 */* >> RESULTS5
+ ./pairwise.py diff_gz 2 */* >> RESULTS5
+ ./pairwise.py diff_gz 4 */* >> RESULTS5
+ ./pairwise.py diff_gz 6 */* >> RESULTS5
+ ./pairwise.py diff_gz 8 */* >> RESULTS5
+
+ ./pairwise.py diff_bz2 1 */* >> RESULTS5
+ ./pairwise.py diff_bz2 2 */* >> RESULTS5
+ ./pairwise.py diff_bz2 4 */* >> RESULTS5
+ ./pairwise.py diff_bz2 6 */* >> RESULTS5
+ ./pairwise.py diff_bz2 8 */* >> RESULTS5
+
+ ./pairwise.py diff_u_gz 1 */* >> RESULTS5
+ ./pairwise.py diff_u_gz 2 */* >> RESULTS5
+ ./pairwise.py diff_u_gz 4 */* >> RESULTS5
+ ./pairwise.py diff_u_gz 6 */* >> RESULTS5
+ ./pairwise.py diff_u_gz 8 */* >> RESULTS5
+
+ ./pairwise.py diff_u_bz2 1 */* >> RESULTS5
+ ./pairwise.py diff_u_bz2 2 */* >> RESULTS5
+ ./pairwise.py diff_u_bz2 4 */* >> RESULTS5
+ ./pairwise.py diff_u_bz2 6 */* >> RESULTS5
+ ./pairwise.py diff_u_bz2 8 */* >> RESULTS5
+
+ ./pairwise.py diff_e_gz 1 */* >> RESULTS5
+ ./pairwise.py diff_e_gz 2 */* >> RESULTS5
+ ./pairwise.py diff_e_gz 4 */* >> RESULTS5
+ ./pairwise.py diff_e_gz 6 */* >> RESULTS5
+ ./pairwise.py diff_e_gz 8 */* >> RESULTS5
+
+ ./pairwise.py diff_e_bz2 1 */* >> RESULTS5
+ ./pairwise.py diff_e_bz2 2 */* >> RESULTS5
+ ./pairwise.py diff_e_bz2 4 */* >> RESULTS5
+ ./pairwise.py diff_e_bz2 6 */* >> RESULTS5
+ ./pairwise.py diff_e_bz2 8 */* >> RESULTS5
+
+ ./pairwise.py condiff_gz 1 */* >> RESULTS5
+ ./pairwise.py condiff_gz 2 */* >> RESULTS5
+ ./pairwise.py condiff_gz 4 */* >> RESULTS5
+ ./pairwise.py condiff_gz 6 */* >> RESULTS5
+ ./pairwise.py condiff_gz 8 */* >> RESULTS5
+
+ ./pairwise.py condiff_bz2 1 */* >> RESULTS5
+ ./pairwise.py condiff_bz2 2 */* >> RESULTS5
+ ./pairwise.py condiff_bz2 4 */* >> RESULTS5
+ ./pairwise.py condiff_bz2 6 */* >> RESULTS5
+ ./pairwise.py condiff_bz2 8 */* >> RESULTS5
+
+ ./pairwise.py condiff2_gz 1 */* >> RESULTS5
+ ./pairwise.py condiff2_gz 2 */* >> RESULTS5
+ ./pairwise.py condiff2_gz 4 */* >> RESULTS5
+ ./pairwise.py condiff2_gz 6 */* >> RESULTS5
+ ./pairwise.py condiff2_gz 8 */* >> RESULTS5
+
+ ./pairwise.py condiff2_bz2 1 */* >> RESULTS5
+ ./pairwise.py condiff2_bz2 2 */* >> RESULTS5
+ ./pairwise.py condiff2_bz2 4 */* >> RESULTS5
+ ./pairwise.py condiff2_bz2 6 */* >> RESULTS5
+ ./pairwise.py condiff2_bz2 8 */* >> RESULTS5
+
diff --git a/task-7009/condiff.py b/task-7009/condiff.py
new file mode 100755
index 0000000..23c8a75
--- /dev/null
+++ b/task-7009/condiff.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+"""Write a compact router-by-router diff of two files to stdout.
+
+Usage: condiff.py OLD NEW
+
+Both files are split into a header, a list of router entries (an "r " line
+plus the lines that follow it) and a footer that starts at "directory-footer".
+Entries are matched on the third token of their "r" line by a single merge
+pass, so both files must list entries in ascending order of that token.
+
+Output, in order:
+  * the header of NEW, verbatim;
+  * "-" for an entry that is only in OLD;
+  * "* " followed by the entry for an entry that is only in NEW;
+  * "** " followed by the NEW entry when both files have the entry but with
+    a different number of lines (both versions are also dumped to stderr);
+  * for an entry with the same number of lines: "." if the "r" line is
+    unchanged, otherwise the NEW "r" line, then every changed line of NEW;
+  * nothing for an entry that is identical in both files;
+  * the footer of NEW, verbatim.
+"""
+import sys
+
+
+class Router:
+    """One router entry: its "r" line and the lines up to the next "r" line."""
+
+    def __init__(self, r_line):
+        assert r_line.startswith("r ")
+        # Third token of the "r" line; entries are matched and ordered on it.
+        self.ident = r_line.split()[2]
+        self.lines = [r_line]
+
+    def append(self, line):
+        self.lines.append(line)
+
+
+def splitfile(f):
+    """Split f into (header lines, Router list, footer lines).
+
+    The header is everything before the first "r " line.  The footer starts
+    at the first "directory-footer" line after that and runs to end of file.
+    Raises ValueError if no footer is found.
+    """
+    header, body, footer = [], [], []
+    section = header
+    for line in f:
+        if section is header and line.startswith("r "):
+            section = body
+        elif section is body and line.startswith("directory-footer"):
+            section = footer
+
+        if section is not body:
+            section.append(line)
+        elif line.startswith("r "):
+            body.append(Router(line))
+        else:
+            body[-1].append(line)
+
+    if section is not footer:
+        raise ValueError("no directory-footer line found")
+    return header, body, footer
+
+
+def write_entry(prefix, router):
+    """Write prefix followed by every line of router to stdout."""
+    sys.stdout.write(prefix)
+    sys.stdout.writelines(router.lines)
+
+
+def main(f1, f2):
+    """Diff the old file object f1 against the new file object f2."""
+    _, body1, _ = splitfile(f1)
+    header2, body2, footer2 = splitfile(f2)
+
+    out = sys.stdout
+    out.writelines(header2)
+
+    # Walk both lists with indexes; deleting from the front of a list on
+    # every step would make the merge quadratic in the number of entries.
+    i = j = 0
+    while i < len(body1) and j < len(body2):
+        old, new = body1[i], body2[j]
+        if old.ident < new.ident:
+            out.write("-\n")
+            i += 1
+            continue
+        if old.ident > new.ident:
+            write_entry("* ", new)
+            j += 1
+            continue
+
+        # Same router in both files.
+        i += 1
+        j += 1
+        if old.lines == new.lines:
+            continue
+        if len(old.lines) != len(new.lines):
+            sys.stderr.write("<<%s>><<%s>>\n" % (old.lines, new.lines))
+            write_entry("** ", new)
+            continue
+        if old.lines[0] == new.lines[0]:
+            out.write(".\n")
+        else:
+            out.write(new.lines[0])
+        for l1, l2 in zip(old.lines[1:], new.lines[1:]):
+            if l1 != l2:
+                out.write(l2)
+
+    # Entries left in NEW have no counterpart in OLD.  Entries left in OLD
+    # produce no output.
+    for router in body2[j:]:
+        write_entry("* ", router)
+
+    out.writelines(footer2)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.exit("usage: condiff.py OLD NEW")
+    with open(sys.argv[1]) as f1:
+        with open(sys.argv[2]) as f2:
+            main(f1, f2)
diff --git a/task-7009/condiff2.py b/task-7009/condiff2.py
new file mode 100755
index 0000000..c93ab9f
--- /dev/null
+++ b/task-7009/condiff2.py
@@ -0,0 +1,154 @@
+#!/usr/bin/python
+"""Write a compact router-by-router diff of two files to stdout.
+
+Usage: condiff2.py OLD NEW
+
+Both files are split into a header, a list of router entries (an "r " line
+plus the lines that follow it) and a footer that starts at "directory-footer".
+Entries are matched on the third token of their "r" line by a single merge
+pass, so both files must list entries in ascending order of that token.
+
+Output, in order:
+  * the header of NEW, verbatim;
+  * "-" for an entry that is only in OLD;
+  * "* " followed by the entry for an entry that is only in NEW;
+  * "** " followed by the NEW entry when both files have the entry but with
+    a different number of lines (both versions are also dumped to stderr);
+  * for an entry with the same number of lines: "." if the "r" line is
+    unchanged, otherwise its rdiff() summary; then, for every changed line,
+    its sdiff() summary if both versions are "s " lines, else the NEW line;
+  * nothing for an entry that is identical in both files;
+  * the footer of NEW, verbatim.
+"""
+import sys
+
+
+class Router:
+    """One router entry: its "r" line and the lines up to the next "r" line."""
+
+    def __init__(self, r_line):
+        assert r_line.startswith("r ")
+        # Third token of the "r" line; entries are matched and ordered on it.
+        self.ident = r_line.split()[2]
+        self.lines = [r_line]
+
+    def append(self, line):
+        self.lines.append(line)
+
+
+def rdiff(r1, r2):
+    """Summarise how the tokens of line r2 differ from those of line r1.
+
+    The result is "r" followed by the index of every changed token, a space,
+    and the new values of those tokens separated by spaces; for example
+    rdiff("r a b c", "r a x c") == "r2 x".
+    """
+    old, new = r1.split(), r2.split()
+    changed = [i for i, token in enumerate(old) if token != new[i]]
+    indexes = "".join(str(i) for i in changed)
+    return "r" + indexes + " " + " ".join(new[i] for i in changed)
+
+
+def sdiff(s1, s2):
+    """Summarise the token changes between "s" lines s1 and s2.
+
+    The result is "s", then "-TOKEN" for every token only in s1, then
+    "+TOKEN" for every token only in s2, each group sorted; for example
+    sdiff("s Fast Running", "s Running Stable") == "s -Fast +Stable".
+    """
+    old, new = set(s1.split()[1:]), set(s2.split()[1:])
+    minus = sorted("-%s" % token for token in old - new)
+    plus = sorted("+%s" % token for token in new - old)
+    return " ".join(["s"] + minus + plus)
+
+
+def splitfile(f):
+    """Split f into (header lines, Router list, footer lines).
+
+    The header is everything before the first "r " line.  The footer starts
+    at the first "directory-footer" line after that and runs to end of file.
+    Raises ValueError if no footer is found.
+    """
+    header, body, footer = [], [], []
+    section = header
+    for line in f:
+        if section is header and line.startswith("r "):
+            section = body
+        elif section is body and line.startswith("directory-footer"):
+            section = footer
+
+        if section is not body:
+            section.append(line)
+        elif line.startswith("r "):
+            body.append(Router(line))
+        else:
+            body[-1].append(line)
+
+    if section is not footer:
+        raise ValueError("no directory-footer line found")
+    return header, body, footer
+
+
+def write_entry(prefix, router):
+    """Write prefix followed by every line of router to stdout."""
+    sys.stdout.write(prefix)
+    sys.stdout.writelines(router.lines)
+
+
+def main(f1, f2):
+    """Diff the old file object f1 against the new file object f2."""
+    _, body1, _ = splitfile(f1)
+    header2, body2, footer2 = splitfile(f2)
+
+    out = sys.stdout
+    out.writelines(header2)
+
+    # Walk both lists with indexes; deleting from the front of a list on
+    # every step would make the merge quadratic in the number of entries.
+    i = j = 0
+    while i < len(body1) and j < len(body2):
+        old, new = body1[i], body2[j]
+        if old.ident < new.ident:
+            out.write("-\n")
+            i += 1
+            continue
+        if old.ident > new.ident:
+            write_entry("* ", new)
+            j += 1
+            continue
+
+        # Same router in both files.
+        i += 1
+        j += 1
+        if old.lines == new.lines:
+            continue
+        if len(old.lines) != len(new.lines):
+            sys.stderr.write("<<%s>><<%s>>\n" % (old.lines, new.lines))
+            write_entry("** ", new)
+            continue
+        if old.lines[0] == new.lines[0]:
+            out.write(".\n")
+        else:
+            out.write(rdiff(old.lines[0], new.lines[0]) + "\n")
+        for l1, l2 in zip(old.lines[1:], new.lines[1:]):
+            if l1 == l2:
+                continue
+            if l1.startswith("s ") and l2.startswith("s "):
+                out.write(sdiff(l1, l2) + "\n")
+            else:
+                out.write(l2)
+
+    # Entries left in NEW have no counterpart in OLD.  Entries left in OLD
+    # produce no output.
+    for router in body2[j:]:
+        write_entry("* ", router)
+
+    out.writelines(footer2)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.exit("usage: condiff2.py OLD NEW")
+    with open(sys.argv[1]) as f1:
+        with open(sys.argv[2]) as f2:
+            main(f1, f2)
diff --git a/task-7009/cy.py b/task-7009/cy.py
new file mode 100755
index 0000000..940d917
--- /dev/null
+++ b/task-7009/cy.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python
+"""Write a compact router-by-router diff of two files to stdout.
+
+Usage: cy.py OLD NEW
+
+Both files are split into a header, a list of router entries (an "r " line
+plus the lines that follow it) and a footer that starts at "directory-footer".
+Entries are matched on the third token of their "r" line by a single merge
+pass, so both files must list entries in ascending order of that token.
+
+Output, in order:
+  * the header of NEW, verbatim;
+  * "-" for an entry that is only in OLD;
+  * "* " followed by the entry for an entry that is only in NEW;
+  * "** " followed by the NEW entry when both files have the entry but with
+    a different number of lines (both versions are also dumped to stderr);
+  * for an entry with the same number of lines: "." if the "r" line is
+    unchanged, otherwise its rdiff() summary; then every changed line of
+    NEW;
+  * nothing for an entry that is identical in both files;
+  * the footer of NEW, verbatim.
+"""
+import sys
+
+
+class Router:
+    """One router entry: its "r" line and the lines up to the next "r" line."""
+
+    def __init__(self, r_line):
+        assert r_line.startswith("r ")
+        # Third token of the "r" line; entries are matched and ordered on it.
+        self.ident = r_line.split()[2]
+        self.lines = [r_line]
+
+    def append(self, line):
+        self.lines.append(line)
+
+
+def rdiff(r1, r2):
+    """Summarise how the tokens of line r2 differ from those of line r1.
+
+    The result is "r" followed by the index of every changed token, a space,
+    and the new values of those tokens separated by spaces; for example
+    rdiff("r a b c", "r a x c") == "r2 x".
+    """
+    old, new = r1.split(), r2.split()
+    changed = [i for i, token in enumerate(old) if token != new[i]]
+    indexes = "".join(str(i) for i in changed)
+    return "r" + indexes + " " + " ".join(new[i] for i in changed)
+
+
+def splitfile(f):
+    """Split f into (header lines, Router list, footer lines).
+
+    The header is everything before the first "r " line.  The footer starts
+    at the first "directory-footer" line after that and runs to end of file.
+    Raises ValueError if no footer is found.
+    """
+    header, body, footer = [], [], []
+    section = header
+    for line in f:
+        if section is header and line.startswith("r "):
+            section = body
+        elif section is body and line.startswith("directory-footer"):
+            section = footer
+
+        if section is not body:
+            section.append(line)
+        elif line.startswith("r "):
+            body.append(Router(line))
+        else:
+            body[-1].append(line)
+
+    if section is not footer:
+        raise ValueError("no directory-footer line found")
+    return header, body, footer
+
+
+def write_entry(prefix, router):
+    """Write prefix followed by every line of router to stdout."""
+    sys.stdout.write(prefix)
+    sys.stdout.writelines(router.lines)
+
+
+def main(f1, f2):
+    """Diff the old file object f1 against the new file object f2."""
+    _, body1, _ = splitfile(f1)
+    header2, body2, footer2 = splitfile(f2)
+
+    out = sys.stdout
+    out.writelines(header2)
+
+    # Walk both lists with indexes; deleting from the front of a list on
+    # every step would make the merge quadratic in the number of entries.
+    i = j = 0
+    while i < len(body1) and j < len(body2):
+        old, new = body1[i], body2[j]
+        if old.ident < new.ident:
+            out.write("-\n")
+            i += 1
+            continue
+        if old.ident > new.ident:
+            write_entry("* ", new)
+            j += 1
+            continue
+
+        # Same router in both files.
+        i += 1
+        j += 1
+        if old.lines == new.lines:
+            continue
+        if len(old.lines) != len(new.lines):
+            sys.stderr.write("<<%s>><<%s>>\n" % (old.lines, new.lines))
+            write_entry("** ", new)
+            continue
+        if old.lines[0] == new.lines[0]:
+            out.write(".\n")
+        else:
+            out.write(rdiff(old.lines[0], new.lines[0]) + "\n")
+        for l1, l2 in zip(old.lines[1:], new.lines[1:]):
+            if l1 != l2:
+                out.write(l2)
+
+    # Entries left in NEW have no counterpart in OLD.  Entries left in OLD
+    # produce no output.
+    for router in body2[j:]:
+        write_entry("* ", router)
+
+    out.writelines(footer2)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.exit("usage: cy.py OLD NEW")
+    with open(sys.argv[1]) as f1:
+        with open(sys.argv[2]) as f2:
+            main(f1, f2)
diff --git a/task-7009/individually.py b/task-7009/individually.py
new file mode 100755
index 0000000..0868ef6
--- /dev/null
+++ b/task-7009/individually.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python
+"""Print the mean and median size of the given files under one size method.
+
+Usage: individually.py {uncompressed|gz|bz2|xz} FILE...
+
+The compressing methods run the external gzip, bzip2 or xz tool at level 9
+and measure its output, which is written to the scratch file out.tmp in the
+current directory.
+"""
+
+import os
+import sys
+import subprocess
+
+SCRATCH = "out.tmp"
+
+
+def uncompressed(fname):
+    """Return the size of fname in bytes."""
+    return os.stat(fname).st_size
+
+
+def _compressed_size(tool, fname):
+    """Return the size of the output of `tool -c -9 fname`.
+
+    The file name is passed as its own argument instead of being pasted into
+    a shell command line, so names with spaces or shell metacharacters work.
+    Opening the scratch file for writing truncates it, so no earlier result
+    can be measured by mistake and nothing needs to be unlinked first.
+    Raises subprocess.CalledProcessError if the tool exits non-zero.
+    """
+    with open(SCRATCH, "wb") as out:
+        subprocess.check_call([tool, "-c", "-9", fname], stdout=out)
+    return os.stat(SCRATCH).st_size
+
+
+def gz(fname):
+    """Return the size of fname after gzip -9."""
+    return _compressed_size("gzip", fname)
+
+
+def bz2(fname):
+    """Return the size of fname after bzip2 -9."""
+    return _compressed_size("bzip2", fname)
+
+
+def xz(fname):
+    """Return the size of fname after xz -9."""
+    return _compressed_size("xz", fname)
+
+
+METHODS = {"uncompressed": uncompressed, "gz": gz, "bz2": bz2, "xz": xz}
+
+
+def main(argv):
+    """Measure every file named in argv[2:] with the method named argv[1]."""
+    if len(argv) < 3 or argv[1] not in METHODS:
+        sys.exit("usage: individually.py {uncompressed|gz|bz2|xz} FILE...")
+    measure = METHODS[argv[1]]
+    sizes = sorted(measure(fname) for fname in argv[2:])
+    count = len(sizes)
+    # Integer mean and upper median, the same figures the script always printed.
+    print("%s: mean %s. median %s"
+          % (argv[1], sum(sizes) // count, sizes[count // 2]))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/task-7009/md-kludge.py b/task-7009/md-kludge.py
new file mode 100755
index 0000000..6f18c8b
--- /dev/null
+++ b/task-7009/md-kludge.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+"""Rewrite files in place, replacing a field of each "r" line by an "m" line.
+
+Usage: md-kludge.py FILE...
+
+The lookup table is read from table.txt in the current directory.  For every
+"r" line the fourth token is removed and an "m" line is added after it that
+carries the table value for that token, or the token itself when the table
+has no entry for it.  All other lines are copied unchanged.
+"""
+
+import sys
+import re
+import os
+
+TABLE_LINE = re.compile(r'^([^ ]*) sha256=([^ \n]*)')
+
+
+def read_table(fn):
+    """Return a dict built from the "KEY sha256=VALUE" lines of fn.
+
+    Lines that do not match are counted, and the count is printed.
+    """
+    bad = 0
+    t = {}
+    with open(fn) as table_file:
+        for line in table_file:
+            m = TABLE_LINE.match(line)
+            if m:
+                t[m.group(1)] = m.group(2)
+            else:
+                bad += 1
+    print("%s bad entries in %s" % (bad, fn))
+    return t
+
+
+def process(fn, t):
+    """Rewrite fn in place using table t; return (hits, misses).
+
+    The result is written to fn + ".tmp" and renamed over fn only after both
+    files have been closed.
+    """
+    tmp = fn + ".tmp"
+    hits = misses = 0
+    with open(fn, 'r') as inp:
+        with open(tmp, 'w') as out:
+            for line in inp:
+                if not line.startswith('r '):
+                    out.write(line)
+                    continue
+                r = line.split()
+                desc_id = r.pop(3)
+                out.write(" ".join(r) + "\n")
+                if desc_id in t:
+                    md_id = t[desc_id]
+                    hits += 1
+                else:
+                    # No table entry: reuse the removed token as a stand-in.
+                    md_id = desc_id
+                    misses += 1
+                out.write("m %s\n" % md_id)
+    os.rename(tmp, fn)
+    return hits, misses
+
+
+def main(fnames):
+    """Process every file and print the total hit and miss counts."""
+    table = read_table("table.txt")
+    hit = miss = 0
+    for fn in fnames:
+        h, m = process(fn, table)
+        hit += h
+        miss += m
+    print("%s %s" % (hit, miss))
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/task-7009/pairwise.py b/task-7009/pairwise.py
new file mode 100755
index 0000000..30a498d
--- /dev/null
+++ b/task-7009/pairwise.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+"""Print the mean and median compressed size of diffs between lagged files.
+
+Usage: pairwise.py METHOD LAG FILE...
+
+Every file is compared with the file LAG positions after it on the command
+line.  METHOD is "<differ>_<compressor>", where <differ> is one of diff,
+diff_u, diff_e, condiff or condiff2 and <compressor> is gz or bz2; "echo"
+only prints each pair.  The compressed diff is written to the scratch file
+out.tmp in the current directory and its size is measured.
+"""
+
+import os
+import sys
+import subprocess
+
+SCRATCH = "out.tmp"
+
+# Command prefix that produces the diff; the two file names are appended.
+DIFFERS = {
+    "diff": ["diff"],
+    "diff_u": ["diff", "-u"],
+    "diff_e": ["diff", "-e"],
+    "condiff": ["./condiff.py"],
+    "condiff2": ["./condiff2.py"],
+}
+
+# Command that compresses stdin to stdout.
+COMPRESSORS = {
+    "gz": ["gzip", "-9", "-c"],
+    "bz2": ["bzip2", "-9", "-c"],
+}
+
+
+def echo(a, b):
+    """Print the pair and report a size of 0."""
+    print("%s %s" % (a, b))
+    return 0
+
+
+def piped_size(differ, compressor, fn, fn2):
+    """Return the size of `differ fn fn2 | compressor`.
+
+    The commands are run without a shell, so file names need no quoting.
+    Opening the scratch file for writing truncates it, so a stale result can
+    never be measured.  The differ's exit status is not checked because diff
+    exits with status 1 whenever the files differ; a failing compressor
+    raises subprocess.CalledProcessError.
+    """
+    with open(SCRATCH, "wb") as out:
+        producer = subprocess.Popen(differ + [fn, fn2], stdout=subprocess.PIPE)
+        consumer = subprocess.Popen(compressor, stdin=producer.stdout,
+                                    stdout=out)
+        # Drop our copy of the pipe so the differ sees SIGPIPE if the
+        # compressor exits early.
+        producer.stdout.close()
+        status = consumer.wait()
+        producer.wait()
+    if status != 0:
+        raise subprocess.CalledProcessError(status, compressor)
+    return os.stat(SCRATCH).st_size
+
+
+def _make_method(differ, compressor):
+    """Return a two-argument size function for one differ/compressor pair."""
+    def method(fn, fn2):
+        return piped_size(differ, compressor, fn, fn2)
+    return method
+
+
+METHODS = {"echo": echo}
+for _dname, _dcmd in DIFFERS.items():
+    for _cname, _ccmd in COMPRESSORS.items():
+        METHODS["%s_%s" % (_dname, _cname)] = _make_method(_dcmd, _ccmd)
+
+
+def pairwise(it, lag):
+    """Return the pairs (it[i], it[i + lag]) for every valid i."""
+    items = list(it)
+    return list(zip(items, items[lag:]))
+
+
+def main(argv):
+    """Measure every lagged pair of argv[3:] with the method named argv[1]."""
+    usage = "usage: pairwise.py METHOD LAG FILE..."
+    if len(argv) < 3 or argv[1] not in METHODS:
+        sys.exit(usage)
+    lag = int(argv[2])
+    if lag < 1:
+        sys.exit("LAG must be at least 1")
+    pairs = pairwise(argv[3:], lag)
+    if not pairs:
+        sys.exit("need more than %d files for lag %d" % (lag, lag))
+    measure = METHODS[argv[1]]
+    sizes = sorted(measure(fn, fn2) for fn, fn2 in pairs)
+    count = len(sizes)
+    # Integer mean and upper median, the same figures the script always printed.
+    print("%s: lag %s: mean %s. median %s"
+          % (argv[1], lag, sum(sizes) // count, sizes[count // 2]))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/task-7009/wround.py b/task-7009/wround.py
new file mode 100755
index 0000000..3444950
--- /dev/null
+++ b/task-7009/wround.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+"""Round the value of every "w Bandwidth=" line in the given files, in place.
+
+Usage: wround.py FILE...
+"""
+import sys
+import re
+import os
+
+BANDWIDTH_LINE = re.compile(r'w Bandwidth=(\d+)(.*)')
+
+
+def wround(v):
+    """Round v down so that only its leading bits are kept.
+
+    Values below 8 become 8.  Otherwise the lower bits are cleared until one
+    significant bit is left (v < 128), two (v < 1024) or three (larger v).
+    """
+    if v < 8:
+        return 8
+    if v < 128:
+        downto = 1
+    elif v < 1024:
+        downto = 3
+    else:
+        downto = 7
+    shift = 0
+    while v > downto:
+        v >>= 1
+        shift += 1
+    return v << shift
+
+
+def rewrite(fname):
+    """Rewrite fname with every bandwidth value passed through wround().
+
+    Text after the number is kept, minus trailing whitespace.  The result is
+    written to fname + ".tmp" and renamed over fname once both are closed.
+    """
+    tmp = fname + ".tmp"
+    with open(fname) as inp:
+        with open(tmp, 'w') as out:
+            for line in inp:
+                m = BANDWIDTH_LINE.match(line)
+                if m:
+                    rounded = wround(int(m.group(1)))
+                    out.write("w Bandwidth=%s%s\n"
+                              % (rounded, m.group(2).rstrip()))
+                else:
+                    out.write(line)
+    os.rename(tmp, fname)
+
+
+if __name__ == "__main__":
+    for fn in sys.argv[1:]:
+        rewrite(fn)



More information about the tor-commits mailing list