commit 869036cceac9468db6daa7e3c9d878b41432a6a7
Author: Karsten Loesing <karsten.loesing(a)gmx.net>
Date: Mon Jul 1 08:33:39 2013 +0200
Add scripts written by nickm to analyze consensus diff sizes (#7009).
---
task-7009/README | 2 +
task-7009/build_md_table_2.py | 20 +++++++
task-7009/cmds5 | 65 +++++++++++++++++++++++
task-7009/condiff.py | 91 ++++++++++++++++++++++++++++++++
task-7009/condiff2.py | 115 +++++++++++++++++++++++++++++++++++++++++
task-7009/cy.py | 104 +++++++++++++++++++++++++++++++++++++
task-7009/individually.py | 43 +++++++++++++++
task-7009/md-kludge.py | 56 ++++++++++++++++++++
task-7009/pairwise.py | 86 ++++++++++++++++++++++++++++++
task-7009/wround.py | 39 ++++++++++++++
10 files changed, 621 insertions(+)
diff --git a/task-7009/README b/task-7009/README
new file mode 100644
index 0000000..a5caa39
--- /dev/null
+++ b/task-7009/README
@@ -0,0 +1,2 @@
+Scripts written by nickm to analyze consensus diff sizes.
+
diff --git a/task-7009/build_md_table_2.py b/task-7009/build_md_table_2.py
new file mode 100755
index 0000000..9ab4051
--- /dev/null
+++ b/task-7009/build_md_table_2.py
@@ -0,0 +1,20 @@
+#!/usr/bin/python
+import sys
+
+# Build a table mapping the fourth whitespace-separated field of each "r"
+# line to the "sha256=..." token of a following "m" line.  Only "m" lines
+# that start with "m 8" or contain ",8" are used (presumably: lines that
+# apply to consensus method 8 -- TODO confirm against dir-spec).
+result = {}
+
+for fname in sys.argv[1:]:
+    f = open(fname, 'r')
+    try:
+        # Reset per file so an "m" line can never be paired with an "r"
+        # line left over from the previous file.
+        lastR = None
+        for line in f:
+            if line.startswith('r '):
+                lastR = line
+            if line.startswith('m 8') or (line.startswith('m ') and ",8" in line):
+                if lastR is None:
+                    # No "r" line seen yet in this file: nothing to key on.
+                    continue
+                sha = line.split()[2]
+                assert sha.startswith("sha256=")
+                descID = lastR.split()[3]
+                result[descID] = sha
+    finally:
+        # Close the input even if a malformed line raises.
+        f.close()
+
+# One "KEY sha256=VALUE" pair per line, in dictionary order.
+for k,v in result.iteritems():
+    print k,v
+
diff --git a/task-7009/cmds5 b/task-7009/cmds5
new file mode 100644
index 0000000..e2df77a
--- /dev/null
+++ b/task-7009/cmds5
@@ -0,0 +1,65 @@
+ # Size of each file on its own: gzip, bzip2, xz and uncompressed.
+ for mode in gz bz2 xz uncompressed; do
+     ./individually.py $mode */* >> RESULTS5
+ done
+
+ # Size of the compressed difference between files that are 1, 2, 4, 6
+ # and 8 positions apart in the argument list, for every diff method
+ # (plain diff, diff -u, diff -e, condiff.py, condiff2.py) and both
+ # compressors.
+ for method in diff diff_u diff_e condiff condiff2; do
+     for comp in gz bz2; do
+         for lag in 1 2 4 6 8; do
+             ./pairwise.py ${method}_${comp} $lag */* >> RESULTS5
+         done
+     done
+ done
+
diff --git a/task-7009/condiff.py b/task-7009/condiff.py
new file mode 100755
index 0000000..23c8a75
--- /dev/null
+++ b/task-7009/condiff.py
@@ -0,0 +1,91 @@
+#!/usr/bin/python
+import sys
+
+class Router:
+    """One router entry: its "r" line plus the lines that follow it."""
+
+    def __init__(self, r_line):
+        """Start an entry from its "r" line; the third field becomes ident."""
+        assert r_line.startswith("r ")
+        fields = r_line.split()
+        self.ident = fields[2]
+        self.lines = [r_line]
+
+    def append(self, line):
+        """Attach one more line to this entry."""
+        self.lines += [line]
+
+def splitfile(f):
+    """Split an open consensus file into (header, body, footer).
+
+    header is the list of lines before the first line starting with "r ".
+    body is a list of Router objects, one per "r " line, each holding that
+    line and the lines up to the next "r " line.  footer is the list of
+    lines from the first "directory-footer" line onwards.
+
+    Asserts that a footer was found.
+    """
+    HEADER, BODY, FOOTER = range(3)
+    state = HEADER
+    header, body, footer = [], [], []
+    router = None
+    for line in f.readlines():
+        # State transitions happen on the line that opens the new section,
+        # so that line is already filed under the new section below.
+        if state == HEADER and line.startswith("r "):
+            state = BODY
+        elif state == BODY and line.startswith("directory-footer"):
+            state = FOOTER
+
+        if state == HEADER:
+            header.append(line)
+        elif state == BODY:
+            if line.startswith("r "):
+                router = Router(line)
+                body.append(router)
+            else:
+                router.append(line)
+        else:
+            footer.append(line)
+
+    assert state == FOOTER
+    return header, body, footer
+
+
+def main(f1, f2):
+ # Write a compact description of how consensus f2 differs from f1 to
+ # stdout.  f1 and f2 are open files.  Only the router entries of f1 are
+ # used; the header and footer of f2 are copied through unchanged.
+ #
+ # The two router lists are walked in step by comparing ident strings,
+ # which presumably relies on both files listing routers in the same
+ # sorted order -- TODO confirm.
+ #
+ # NOTE(review): indentation was lost in this copy of the patch, so the
+ # exact nesting inside the "same router" branch cannot be read off the
+ # text below; check the repository before relying on it.
+ _, body1, _ = splitfile(f1)
+ header2, body2, footer2 = splitfile(f2)
+ assert footer2
+
+ for h in header2:
+ sys.stdout.write(h)
+
+ while body1 and body2:
+ if body1[0].ident < body2[0].ident:
+ # Front entry of f1 has no counterpart in f2: emit a bare "-".
+ print "-"
+ del body1[0]
+ elif body1[0].ident > body2[0].ident:
+ # Front entry of f2 has no counterpart in f1: emit it in full
+ # after a "* " prefix.
+ sys.stdout.write("* ")
+ for b in body2[0].lines:
+ sys.stdout.write(b)
+ del body2[0]
+ else: # same router
+ if body1[0].lines != body2[0].lines:
+ # The zip() further down pairs lines by position, so entries with a
+ # different number of lines are handled separately here.
+ if len(body1[0].lines) != len(body2[0].lines):
+ print >>sys.stderr, "<<%s>><<%s>>"%(body1[0].lines, body2[0].lines)
+ # "** " introduces the f2 entry written out in full.
+ sys.stdout.write("** ")
+ for b in body2[0].lines:
+ sys.stdout.write(b)
+ del body2[0]
+ del body1[0]
+ continue
+
+ # An unchanged "r" line is abbreviated to "."; a changed one is
+ # written out as it appears in f2.
+ if body1[0].lines[0] == body2[0].lines[0]:
+ print "."
+ else:
+ sys.stdout.write(body2[0].lines[0])
+ # Of the remaining lines, only those that differ from the line at the
+ # same position in the f1 entry are written.
+ for l1,l2 in zip(body1[0].lines[1:], body2[0].lines[1:]):
+ if l1 != l2: sys.stdout.write(l2)
+ del body1[0]
+ del body2[0]
+
+ # Entries left over in f2 are written in full after "* ".  Entries left
+ # over in f1 produce no output.
+ for r in body2:
+ sys.stdout.write("* ")
+ for l in r.lines:
+ sys.stdout.write(l)
+
+ for f in footer2:
+ sys.stdout.write(f)
+
+# Usage: condiff.py OLD NEW
+# Open both consensus files and make sure they are closed again even if
+# diffing fails part-way through.
+f1 = open(sys.argv[1])
+try:
+    f2 = open(sys.argv[2])
+    try:
+        main(f1,f2)
+    finally:
+        f2.close()
+finally:
+    f1.close()
diff --git a/task-7009/condiff2.py b/task-7009/condiff2.py
new file mode 100755
index 0000000..c93ab9f
--- /dev/null
+++ b/task-7009/condiff2.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+import sys
+
+class Router:
+    """A router entry: the "r" line that opens it and the lines after it."""
+
+    def __init__(self, r_line):
+        """Create the entry from its "r" line; ident is the third field."""
+        assert r_line.startswith("r ")
+        fields = r_line.split()
+        self.ident = fields[2]
+        self.lines = [r_line]
+
+    def append(self, line):
+        """Add a further line to the entry."""
+        self.lines += [line]
+
+
+def rdiff(r1,r2):
+    """Describe which whitespace-separated fields differ between two lines.
+
+    The result is "r" followed by the indices of the fields that changed
+    (concatenated without separators), a space, and the new values of
+    those fields joined by spaces.  Fields are compared by position over
+    the length of r1; r2 must have at least as many fields.
+    """
+    new_fields = r2.split()
+    positions = []
+    replacements = []
+    for idx, old in enumerate(r1.split()):
+        new = new_fields[idx]
+        if old != new:
+            positions.append(str(idx))
+            replacements.append(new)
+    return "r%s %s" % ("".join(positions), " ".join(replacements))
+
+def sdiff(s1, s2):
+    """Describe the change between two lines as removed and added words.
+
+    The first word of each line is ignored and the rest are treated as
+    sets.  Returns "s", then "-word" for every word only in s1 (sorted),
+    then "+word" for every word only in s2 (sorted), joined by spaces.
+    """
+    old = set(s1.split()[1:])
+    new = set(s2.split()[1:])
+    removed = ["-%s" % word for word in sorted(old - new)]
+    added = ["+%s" % word for word in sorted(new - old)]
+    return " ".join(["s"] + removed + added)
+
+def splitfile(f):
+    """Split an open consensus file into (header, body, footer).
+
+    header is the list of lines before the first line starting with "r ".
+    body is a list of Router objects, one per "r " line, each holding that
+    line and the lines up to the next "r " line.  footer is the list of
+    lines from the first "directory-footer" line onwards.
+
+    Asserts that a footer was found.
+    """
+    HEADER, BODY, FOOTER = range(3)
+    state = HEADER
+    header, body, footer = [], [], []
+    router = None
+    for line in f.readlines():
+        # State transitions happen on the line that opens the new section,
+        # so that line is already filed under the new section below.
+        if state == HEADER and line.startswith("r "):
+            state = BODY
+        elif state == BODY and line.startswith("directory-footer"):
+            state = FOOTER
+
+        if state == HEADER:
+            header.append(line)
+        elif state == BODY:
+            if line.startswith("r "):
+                router = Router(line)
+                body.append(router)
+            else:
+                router.append(line)
+        else:
+            footer.append(line)
+
+    assert state == FOOTER
+    return header, body, footer
+
+
+def main(f1, f2):
+ # Write a compact description of how consensus f2 differs from f1 to
+ # stdout.  f1 and f2 are open files.  Only the router entries of f1 are
+ # used; the header and footer of f2 are copied through unchanged.
+ #
+ # The two router lists are walked in step by comparing ident strings,
+ # which presumably relies on both files listing routers in the same
+ # sorted order -- TODO confirm.
+ #
+ # NOTE(review): indentation was lost in this copy of the patch, so the
+ # exact nesting inside the "same router" branch cannot be read off the
+ # text below; check the repository before relying on it.
+ _, body1, _ = splitfile(f1)
+ header2, body2, footer2 = splitfile(f2)
+ assert footer2
+
+ for h in header2:
+ sys.stdout.write(h)
+
+ while body1 and body2:
+ if body1[0].ident < body2[0].ident:
+ # Front entry of f1 has no counterpart in f2: emit a bare "-".
+ print "-"
+ del body1[0]
+ elif body1[0].ident > body2[0].ident:
+ # Front entry of f2 has no counterpart in f1: emit it in full
+ # after a "* " prefix.
+ sys.stdout.write("* ")
+ for b in body2[0].lines:
+ sys.stdout.write(b)
+ del body2[0]
+ else: # same router
+ if body1[0].lines != body2[0].lines:
+ # The zip() further down pairs lines by position, so entries with a
+ # different number of lines are handled separately here.
+ if len(body1[0].lines) != len(body2[0].lines):
+ print >>sys.stderr, "<<%s>><<%s>>"%(body1[0].lines, body2[0].lines)
+ # "** " introduces the f2 entry written out in full.
+ sys.stdout.write("** ")
+ for b in body2[0].lines:
+ sys.stdout.write(b)
+ del body2[0]
+ del body1[0]
+ continue
+
+ # An unchanged "r" line is abbreviated to "."; a changed one is
+ # reduced by rdiff() to the fields that differ.
+ if body1[0].lines[0] == body2[0].lines[0]:
+ print "."
+ else:
+ print rdiff(body1[0].lines[0],body2[0].lines[0])
+ # Of the remaining lines, only those that differ from the line at the
+ # same position in the f1 entry are emitted.
+ for l1,l2 in zip(body1[0].lines[1:], body2[0].lines[1:]):
+ if l1 != l2:
+ # A changed "s" line is reduced by sdiff() to the words removed and
+ # added; any other changed line is written as it appears in f2.
+ if l2.startswith('s ') and l1.startswith('s '):
+ print sdiff(l1, l2)
+ else:
+ sys.stdout.write(l2)
+ del body1[0]
+ del body2[0]
+
+ # Entries left over in f2 are written in full after "* ".  Entries left
+ # over in f1 produce no output.
+ for r in body2:
+ sys.stdout.write("* ")
+ for l in r.lines:
+ sys.stdout.write(l)
+
+ for f in footer2:
+ sys.stdout.write(f)
+
+# Usage: condiff2.py OLD NEW
+# Open both consensus files and make sure they are closed again even if
+# diffing fails part-way through.
+f1 = open(sys.argv[1])
+try:
+    f2 = open(sys.argv[2])
+    try:
+        main(f1,f2)
+    finally:
+        f2.close()
+finally:
+    f1.close()
diff --git a/task-7009/cy.py b/task-7009/cy.py
new file mode 100755
index 0000000..940d917
--- /dev/null
+++ b/task-7009/cy.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+import sys
+
+class Router:
+    """Lines belonging to one router, starting with its "r" line."""
+
+    def __init__(self, r_line):
+        """Begin an entry with r_line; its third field is kept as ident."""
+        assert r_line.startswith("r ")
+        fields = r_line.split()
+        self.ident = fields[2]
+        self.lines = [r_line]
+
+    def append(self, line):
+        """Record another line of this entry."""
+        self.lines += [line]
+
+
+def rdiff(r1,r2):
+    """Describe which whitespace-separated fields differ between two lines.
+
+    The result is "r" followed by the indices of the fields that changed
+    (concatenated without separators), a space, and the new values of
+    those fields joined by spaces.  Fields are compared by position over
+    the length of r1; r2 must have at least as many fields.
+    """
+    new_fields = r2.split()
+    positions = []
+    replacements = []
+    for idx, old in enumerate(r1.split()):
+        new = new_fields[idx]
+        if old != new:
+            positions.append(str(idx))
+            replacements.append(new)
+    return "r%s %s" % ("".join(positions), " ".join(replacements))
+
+def splitfile(f):
+    """Split an open consensus file into (header, body, footer).
+
+    header is the list of lines before the first line starting with "r ".
+    body is a list of Router objects, one per "r " line, each holding that
+    line and the lines up to the next "r " line.  footer is the list of
+    lines from the first "directory-footer" line onwards.
+
+    Asserts that a footer was found.
+    """
+    HEADER, BODY, FOOTER = range(3)
+    state = HEADER
+    header, body, footer = [], [], []
+    router = None
+    for line in f.readlines():
+        # State transitions happen on the line that opens the new section,
+        # so that line is already filed under the new section below.
+        if state == HEADER and line.startswith("r "):
+            state = BODY
+        elif state == BODY and line.startswith("directory-footer"):
+            state = FOOTER
+
+        if state == HEADER:
+            header.append(line)
+        elif state == BODY:
+            if line.startswith("r "):
+                router = Router(line)
+                body.append(router)
+            else:
+                router.append(line)
+        else:
+            footer.append(line)
+
+    assert state == FOOTER
+    return header, body, footer
+
+
+def main(f1, f2):
+ # Write a compact description of how consensus f2 differs from f1 to
+ # stdout.  f1 and f2 are open files.  Only the router entries of f1 are
+ # used; the header and footer of f2 are copied through unchanged.
+ #
+ # The two router lists are walked in step by comparing ident strings,
+ # which presumably relies on both files listing routers in the same
+ # sorted order -- TODO confirm.
+ #
+ # NOTE(review): indentation was lost in this copy of the patch, so the
+ # exact nesting inside the "same router" branch cannot be read off the
+ # text below; check the repository before relying on it.
+ _, body1, _ = splitfile(f1)
+ header2, body2, footer2 = splitfile(f2)
+ assert footer2
+
+ for h in header2:
+ sys.stdout.write(h)
+
+ while body1 and body2:
+ if body1[0].ident < body2[0].ident:
+ # Front entry of f1 has no counterpart in f2: emit a bare "-".
+ print "-"
+ del body1[0]
+ elif body1[0].ident > body2[0].ident:
+ # Front entry of f2 has no counterpart in f1: emit it in full
+ # after a "* " prefix.
+ sys.stdout.write("* ")
+ for b in body2[0].lines:
+ sys.stdout.write(b)
+ del body2[0]
+ else: # same router
+ if body1[0].lines != body2[0].lines:
+ # The zip() further down pairs lines by position, so entries with a
+ # different number of lines are handled separately here.
+ if len(body1[0].lines) != len(body2[0].lines):
+ print >>sys.stderr, "<<%s>><<%s>>"%(body1[0].lines, body2[0].lines)
+ # "** " introduces the f2 entry written out in full.
+ sys.stdout.write("** ")
+ for b in body2[0].lines:
+ sys.stdout.write(b)
+ del body2[0]
+ del body1[0]
+ continue
+
+ # An unchanged "r" line is abbreviated to "."; a changed one is
+ # reduced by rdiff() to the fields that differ.
+ if body1[0].lines[0] == body2[0].lines[0]:
+ print "."
+ else:
+ print rdiff(body1[0].lines[0],body2[0].lines[0])
+ # Of the remaining lines, only those that differ from the line at the
+ # same position in the f1 entry are written.
+ for l1,l2 in zip(body1[0].lines[1:], body2[0].lines[1:]):
+ if l1 != l2: sys.stdout.write(l2)
+ del body1[0]
+ del body2[0]
+
+ # Entries left over in f2 are written in full after "* ".  Entries left
+ # over in f1 produce no output.
+ for r in body2:
+ sys.stdout.write("* ")
+ for l in r.lines:
+ sys.stdout.write(l)
+
+ for f in footer2:
+ sys.stdout.write(f)
+
+# Usage: cy.py OLD NEW
+# Open both consensus files and make sure they are closed again even if
+# diffing fails part-way through.
+f1 = open(sys.argv[1])
+try:
+    f2 = open(sys.argv[2])
+    try:
+        main(f1,f2)
+    finally:
+        f2.close()
+finally:
+    f1.close()
diff --git a/task-7009/individually.py b/task-7009/individually.py
new file mode 100755
index 0000000..0868ef6
--- /dev/null
+++ b/task-7009/individually.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python
+
+import os
+import sys
+import subprocess
+
+def uncompressed(fname):
+    """Return the size in bytes of fname as stored on disk."""
+    return os.stat(fname).st_size
+
+def _compressed_size(command, fname):
+    """Run "command fname > out.tmp" in the shell; return out.tmp's size.
+
+    An out.tmp left over from an earlier call is removed first.  A missing
+    out.tmp is not an error: without this guard bz2() and xz() raised
+    OSError whenever no earlier run had left the file behind.
+    """
+    try:
+        os.unlink("out.tmp")
+    except OSError:
+        pass
+    os.system("%s %s > out.tmp" % (command, fname))
+    return os.stat("out.tmp").st_size
+
+def gz(fname):
+    """Return the size in bytes of fname compressed with gzip -9."""
+    return _compressed_size("gzip -c -9", fname)
+
+def bz2(fname):
+    """Return the size in bytes of fname compressed with bzip2 -9."""
+    return _compressed_size("bzip2 -c -9", fname)
+
+def xz(fname):
+    """Return the size in bytes of fname compressed with xz -9."""
+    return _compressed_size("xz -c -9", fname)
+
+# Usage: individually.py FUNCTION FILE...
+# FUNCTION is the name of one of the size functions in this file.
+if len(sys.argv) < 3:
+    # Without at least one file the mean and median below are undefined.
+    sys.exit("usage: %s {uncompressed|gz|bz2|xz} FILE..." % sys.argv[0])
+
+func = globals()[sys.argv[1]]
+
+allvals = []
+total = 0L
+N = 0
+
+for fname in sys.argv[2:]:
+    n = func(fname)
+    N += 1
+    total += n
+    allvals.append(n)
+
+allvals.sort()
+# The reported "median" is the element at index N//2 of the sorted sizes,
+# i.e. the upper of the two middle values when N is even.
+print "%s: mean %s. median %s"%(sys.argv[1], total//N, allvals[N//2])
+
+
diff --git a/task-7009/md-kludge.py b/task-7009/md-kludge.py
new file mode 100755
index 0000000..6f18c8b
--- /dev/null
+++ b/task-7009/md-kludge.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+
+import sys
+import re
+import os
+
+def read_table(fn):
+    """Return a dict of KEY -> VALUE for each "KEY sha256=VALUE" line of fn.
+
+    Lines that do not match are counted, and the count is printed to
+    stdout once the whole file has been read.
+    """
+    p = re.compile(r'^([^ ]*) sha256=([^ \n]*)')
+    bad = 0
+    t = {}
+    f = open(fn)
+    try:
+        for line in f:
+            m = p.match(line)
+            if not m:
+                bad += 1
+                continue
+            t[m.group(1)] = m.group(2)
+    finally:
+        # Close explicitly instead of leaving it to garbage collection.
+        f.close()
+    print bad, "bad entries in", fn
+    return t
+
+
+def process(fn, t):
+    """Rewrite file fn in place, moving a field of each "r" line to an "m" line.
+
+    For every line starting with "r " the fourth whitespace-separated field
+    is removed and a line "m <id>" is written directly after it, where <id>
+    is t[field] if the table has an entry and the removed field itself
+    otherwise.  All other lines are copied unchanged.  Output goes to
+    fn + ".tmp", which is renamed over fn once both files are closed.
+
+    Returns (hits, misses): how many "r" lines were and were not found in t.
+    """
+    tmp = fn+".tmp"
+    h = m = 0
+    inp = open(fn, 'r')
+    try:
+        out = open(tmp, 'w')
+        try:
+            for line in inp:
+                if line.startswith('r '):
+                    r = line.split()
+                    desc_id = r[3]
+                    del r[3]
+                    print >>out, " ".join(r)
+                    try:
+                        md_id = t[desc_id]
+                        h += 1
+                    except KeyError:
+                        md_id = desc_id #kluuuuuuuuudge!!!!!
+                        m += 1
+                    print >>out, "m",md_id
+                else:
+                    out.write(line)
+        finally:
+            # Close both files even if a short "r" line raises IndexError;
+            # the original is only replaced after a complete pass.
+            out.close()
+    finally:
+        inp.close()
+    os.rename(tmp, fn)
+    return h, m
+
+# Rewrite every file named on the command line using the table read from
+# table.txt, then print the total number of table hits and misses.
+table = read_table("table.txt")
+
+hit = miss = 0
+for fn in sys.argv[1:]:
+    counts = process(fn, table)
+    hit, miss = hit + counts[0], miss + counts[1]
+
+print hit, miss
diff --git a/task-7009/pairwise.py b/task-7009/pairwise.py
new file mode 100755
index 0000000..30a498d
--- /dev/null
+++ b/task-7009/pairwise.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+
+import os
+import sys
+import subprocess
+
+def echo(a,b):
+    """Print the two arguments on one line and return 0."""
+    sys.stdout.write("%s %s\n" % (a, b))
+    return 0
+
+def _piped_size(producer, compressor, fn, fn2):
+    """Return the size of "producer fn fn2 | compressor -9 -c" in bytes.
+
+    The pipeline's output goes to out.tmp, which is unlinked first and
+    therefore has to exist before the call.
+    """
+    os.unlink("out.tmp")
+    os.system("%s %s %s | %s -9 -c > out.tmp" % (producer, fn, fn2, compressor))
+    return os.stat("out.tmp").st_size
+
+# Plain, unified (-u) and ed-script (-e) diff output, gzip-compressed.
+def diff_gz(fn, fn2):
+    return _piped_size("diff", "gzip", fn, fn2)
+def diff_u_gz(fn, fn2):
+    return _piped_size("diff -u", "gzip", fn, fn2)
+def diff_e_gz(fn, fn2):
+    return _piped_size("diff -e", "gzip", fn, fn2)
+
+# The same three diff formats, bzip2-compressed.
+def diff_bz2(fn, fn2):
+    return _piped_size("diff", "bzip2", fn, fn2)
+def diff_u_bz2(fn, fn2):
+    return _piped_size("diff -u", "bzip2", fn, fn2)
+def diff_e_bz2(fn, fn2):
+    return _piped_size("diff -e", "bzip2", fn, fn2)
+
+# Output of ./condiff.py, compressed either way.
+def condiff_gz(fn, fn2):
+    return _piped_size("./condiff.py", "gzip", fn, fn2)
+def condiff_bz2(fn, fn2):
+    return _piped_size("./condiff.py", "bzip2", fn, fn2)
+
+# Output of ./condiff2.py, compressed either way.
+def condiff2_gz(fn, fn2):
+    return _piped_size("./condiff2.py", "gzip", fn, fn2)
+def condiff2_bz2(fn, fn2):
+    return _piped_size("./condiff2.py", "bzip2", fn, fn2)
+
+# The size functions in this file unlink out.tmp before writing it, so
+# create a placeholder for the first call to remove.
+f = open("out.tmp", 'w')
+f.write("xyz")
+f.close()
+
+# Command line: pairwise.py FUNCTION LAG FILE...
+func = globals()[sys.argv[1]]
+
+# Running statistics over all measured sizes.
+N = 0
+total = 0L
+allvals = []
+
+# How many positions apart the two files of each compared pair are.
+lag = int(sys.argv[2])
+
+def pairwise(it):
+    """Yield (earlier, later) pairs of items that are `lag` positions apart.
+
+    `lag` is the module-level value taken from the command line.  The
+    first `lag` items only prime the window; every later item is paired
+    with the item `lag` positions before it.
+    """
+    source = iter(it)
+    window = []
+    while len(window) < lag:
+        window.append(source.next())
+    for item in source:
+        yield window[0], item
+        # Slide the window forward by one item.
+        window = window[1:] + [item]
+
+# Measure every pair of input files that are `lag` positions apart.
+for fname, fname2 in pairwise(sys.argv[3:]):
+    n = func(fname, fname2)
+    N += 1
+    total += n
+    allvals.append(n)
+
+if N == 0:
+    # Too few input files to form a single pair: the mean and median
+    # below would otherwise fail with a division by zero.
+    sys.exit("%s: lag %s: need more than %s input files" % (sys.argv[1], lag, lag))
+
+allvals.sort()
+# The reported "median" is the element at index N//2 of the sorted sizes,
+# i.e. the upper of the two middle values when N is even.
+print "%s: lag %s: mean %s. median %s"%(sys.argv[1], lag, total//N, allvals[N//2])
+
+
diff --git a/task-7009/wround.py b/task-7009/wround.py
new file mode 100755
index 0000000..3444950
--- /dev/null
+++ b/task-7009/wround.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+import sys
+import re
+import os
+def wround(v):
+    """Round v down so that only its leading bits remain set.
+
+    Anything below 8 becomes 8.  Otherwise the low bits of v are cleared,
+    keeping one significant bit for values below 128, two for values below
+    1024 and three from 1024 upwards.
+    """
+    if v < 8:
+        return 8
+    # Largest value the shifted-down number may have: 1, 2 or 3 bits wide.
+    if v >= 1024:
+        ceiling = 7
+    elif v >= 128:
+        ceiling = 3
+    else:
+        ceiling = 1
+    dropped = 0
+    while v > ceiling:
+        v >>= 1
+        dropped += 1
+    return v << dropped
+
+# A "w Bandwidth=" line: group 1 is the number, group 2 the rest of the line.
+p = re.compile(r'w Bandwidth=(\d+)(.*)')
+
+def rewrite(fname):
+    """Round the Bandwidth value of every matching line of fname, in place.
+
+    Lines matching "w Bandwidth=<number>..." get their number replaced by
+    wround(number); trailing whitespace of the remainder is stripped and a
+    newline appended.  All other lines are copied unchanged.  Output goes
+    to fname + ".tmp", which is renamed over fname once both files are
+    closed.
+    """
+    tmpname = fname+".tmp"
+    inp = open(fname)
+    try:
+        out = open(tmpname, 'w')
+        try:
+            for line in inp:
+                m = p.match(line)
+                if not m:
+                    out.write(line)
+                    continue
+                v = wround(int(m.group(1)))
+                out.write("w Bandwidth=%s%s\n"%(v,m.group(2).rstrip()))
+        finally:
+            # Close both files even if a line fails; the original is only
+            # replaced after a complete pass.
+            out.close()
+    finally:
+        inp.close()
+    os.rename(tmpname, fname)
+
+# Rewrite every file named on the command line in place.
+for path in sys.argv[1:]:
+    rewrite(path)
+