commit 704c053fbae93353976e3f7abf585b6283edb3a9 Author: Zack Weinberg zackw@panix.com Date: Sun Feb 12 20:36:56 2012 -0800
Rough draft benchmarking tools. --- scripts/benchmark-plot.R | 7 + scripts/benchmark.py | 410 +++++++++++++++++++++++++++++++++++ scripts/bm-fixedrate-cgi.c | 158 ++++++++++++++ scripts/bm-genfiles.py | 162 ++++++++++++++ scripts/bm-mcurl.c | 196 +++++++++++++++++ scripts/tool_urlglob.c | 516 ++++++++++++++++++++++++++++++++++++++++++++ scripts/tool_urlglob.h | 69 ++++++ 7 files changed, 1518 insertions(+), 0 deletions(-)
diff --git a/scripts/benchmark-plot.R b/scripts/benchmark-plot.R new file mode 100644 index 0000000..a58ae0a --- /dev/null +++ b/scripts/benchmark-plot.R @@ -0,0 +1,7 @@ +#! /usr/bin/Rscript + +suppressPackageStartupMessages({ + library(ggplot2) +}) + +lf.direct <- read.csv("bench-lf-direct.tab", header=TRUE) diff --git a/scripts/benchmark.py b/scripts/benchmark.py new file mode 100755 index 0000000..c6487b6 --- /dev/null +++ b/scripts/benchmark.py @@ -0,0 +1,410 @@ +#! /usr/bin/python + +# Stegotorus benchmarking script. +# Several different computers are involved: +# +# - the "client" is the machine you run this script on; the workload +# generator will run there, as will the StegoTorus and Tor clients. +# +# - the "proxy" is a machine that you can ssh to with no password. +# It will run the StegoTorus and Tor bridge servers. +# +# - the "target" is the HTTP server that will be contacted in various ways. +# +# bm-genfiles.py must have been run on this server to create file +# trees named 'fixed' and 'pareto' which appear as direct children +# of the root URL. bm-fixedrate-cgi.c must have been compiled for +# that server and appear as /bm-fixedrate.cgi. +# +# Software you need on the client machine: +# +# bwm-ng: http://www.gropp.org/?id=projects&sub=bwm-ng +# curl: http://curl.haxx.se/ +# httperf: http://www.hpl.hp.com/research/linux/httperf/ +# tsocks: http://tsocks.sourceforge.net/about.php +# tor: https://torproject.org/ +# stegotorus: you already have it :) +# +# Software you need on the proxy machine: +# +# nylon: http://monkey.org/~marius/pages/?page=nylon +# tor, stegotorus +# +# You configure this script by setting variables below. + +# Client host + +CLIENT_IP = "99.113.33.155" +CLIENT_IFACE = "eth0" + +# Proxy host + +PROXY = "sandbox03.sv.cmu.edu" +PROXY_IP = "209.129.244.30" # some things won't do DNS for this +PROXY_PORT = "1080" +PROXY_SSH_CMD = ("ssh", PROXY) + +# Target + +TARGET = "storustest.nfshost.com" + +# Fudge factors. For some reason, bm-fixedrate generates data a +# linear factor slower than it was meant to; this is the quick fix. + +FUDGE_FIXEDRATE = 2.5 + +# Programs we need to run. Change these if any binary is not in the +# default path or hasn't got the default name. +# C_ - for the client. P_ - for the proxy. +# You can NOT specify arguments here - if you need to do any +# setup, write a wrapper script. + +C_bwm = "bwm-ng" +C_curl = "curl" +C_httperf = "httperf" +C_storus = "stegotorus-wrapper" +C_tor = "/usr/sbin/tor" +C_tsocks = "/usr/lib/libtsocks.so" + +P_nylon = "nylon" +P_storus = "stegotorus-wrapper" +P_tor = "tor" +P_python = "/usr/local/bin/python" # this must be an absolute path, + # it goes on a shebang line + +# ACTUAL PROGRAM STARTS HERE + +from types import MethodType +import os +import os.path +import pickle +import subprocess +import sys +import time + +def monitor(report, label, period): + """Monitor network utilization (bytes/sec up and down) for a + period of PERIOD seconds, writing the report to REPORT, labeling + each line with LABEL.""" + + bwm = subprocess.Popen((C_bwm, "-o", "csv", "-c", str(period), "-t", "1000", + "-u", "bytes", "-T", "rate", "-I", CLIENT_IFACE), + stdout=subprocess.PIPE, + universal_newlines=True) + try: + n = 1 + for line in bwm.stdout: + (stamp, iface, upbytes, dnbytes, rest) = line.split(';', 4) + if iface == 'total': continue + + # convert to most compact possible form + upbytes = str(float(upbytes)) + dnbytes = str(float(dnbytes)) + + report.write("%s,%d,%s,%s\n" % (label,n,upbytes,dnbytes)) + n += 1 + except: + bwm.terminate() + raise + finally: + bwm.wait() + +class ProxyProcess(object): + """A process running on the proxy host. It has a command line and + an optional config file. It is not expected to produce any output + (if it does, it will get dumped to this script's stdout/stderr) or + require any input (input is redirected from /dev/null). It is + expected to run until it is killed.""" + + @staticmethod + def prepare_remote(): + remote_driver=r"""#! %s +import pickle +import signal +import subprocess +import sys +import traceback + +wrote_rpid = False + +# Remote driver for proxy processes. +try: + data = pickle.load(sys.stdin) + sys.stdin.close() + if data['cfgname']: + f = open(data['cfgname'], "w") + f.write(data['cfgdata']) + f.close() + proc = subprocess.Popen(data['args'], stdin=open("/dev/null", "r"), + stdout=2) # redirect child stdout to our stderr + sys.stdout.write(str(proc.pid) + "\n") + wrote_rpid = True + sys.stdout.close() + proc.wait() + + # the process being killed by SIGTERM is normal + if proc.returncode != 0 and proc.returncode != -signal.SIGTERM: + raise subprocess.CalledProcessError(proc.returncode, data['args'][0]) +except: + traceback.print_exc() + if not wrote_rpid: sys.stdout.write("X\n") + sys.exit(1) + +sys.exit(0) +""" % P_python + remote_setup=r"""newdriver=`mktemp ./driver.py.XXXXXX` || exit 1 +cat > "$newdriver" +if cmp -s "$newdriver" driver.py +then rm -f "$newdriver" +else set -e; mv -f "$newdriver" driver.py; chmod +x driver.py +fi +""" + prep_worker = subprocess.Popen(PROXY_SSH_CMD + (remote_setup,), + stdin=subprocess.PIPE, + stdout=2) + prep_worker.communicate(remote_driver) + if prep_worker.returncode != 0: + raise subprocess.CalledProcessError(prep_worker.returncode, + 'remote_setup script') + + def __init__(self, args, cfgname=None, cfgdata=None): + if ((cfgname is None or cfgdata is None) and + (cfgname is not None or cfgdata is not None)): + raise TypeError("either both or neither of cfgname and cfgdata" + " must be specified") + + self._rpid = "X" + + ProxyProcess.prepare_remote() + self._proc = subprocess.Popen(PROXY_SSH_CMD + ("./driver.py",), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + close_fds=True) + pickle.dump({ 'args' : args, + 'cfgname' : cfgname, + 'cfgdata' : cfgdata }, + self._proc.stdin, 2) + self._proc.stdin.close() + self._rpid = self._proc.stdout.readline().strip() + if self._rpid == "X" or self._rpid == "": + self._rpid = "X" + self._proc.wait() + raise RuntimeError("failed to execute '%s' on proxy host" + % " ".join(args)) + + def terminate(self): + if self._rpid == "X": return + subprocess.check_call(PROXY_SSH_CMD + ("kill", self._rpid)) + + def kill(self): + if self._rpid == "X": return + subprocess.check_call(PROXY_SSH_CMD + ("kill", "-9", self._rpid)) + + # forward everything else to _proc; logic copied verbatim from + # http://code.activestate.com/recipes/519639- + # true-lieberman-style-delegation-in-python/ + def __getattr__(self, aname): + target = self._proc + f = getattr(target, aname) + if isinstance(f, MethodType): + return MethodType(f.im_func, self, target.__class__) + else: + return f + +# Individual proxy-side test runners. +def p_nylon(): + return ProxyProcess((P_nylon, "-f", "-c", "nylon.conf"), + "nylon.conf", + """\ +[General] +No-Simultaneous-Conn=10 +Log=0 +Verbose=0 +PIDfile=nylon.pid + +[Server] +Port=%s +Allow-IP=%s/32 +""" % (PROXY_PORT, CLIENT_IP)) + +def p_tor_direct(): + return ProxyProcess((P_tor, "--quiet", "-f", "tor-direct.conf"), + "tor-direct.conf", + """\ +ORPort %s +SocksPort 0 +BridgeRelay 1 +PublishServerDescriptor 0 +ExitPolicy reject *:* +DataDirectory . +Log err stderr +# unfortunately there doesn't seem to be any way to tell Tor to accept +# OR connections from specific IP addresses only. +""" % PROXY_PORT) + +class ClientProcess(subprocess.Popen): + """A process running on the local machine. This is probably doing + the meat of the work of some benchmark. Basically a shim around + subprocess.Popen to fix constructor arguments.""" + + def __init__(self, argv, envp=None): + if envp is not None: + env = os.environ.copy() + env.update(envp) + subprocess.Popen.__init__(self, argv, + stdin=open("/dev/null", "r"), + stdout=open("/dev/null", "w"), + stderr=subprocess.STDOUT, env=env) + else: + subprocess.Popen.__init__(self, argv, + stdin=open("/dev/null", "r"), + stdout=2) + +def c_tor_direct(): + fp = open("tor-direct-client.conf", "w") + fp.write("""\ +ORPort 0 +SocksPort %s +DataDirectory . +Log err stderr +Bridge %s:%s +UseBridges 1 +SafeSocks 0 +""" % (PROXY_PORT, PROXY_IP, PROXY_PORT)) + fp.close() + return ClientProcess((C_tor, "--quiet", "-f", "tor-direct-client.conf")) + +def c_curl(url, proxyhost): + return ClientProcess((C_curl, "-s", "--socks5-hostname", + proxyhost + ":" + PROXY_PORT, + url, "-o", "/dev/null")) + +def c_httperf(prefix, rate, proxyhost): + fp = open("tsocks.conf", "w") + fp.write("""\ +server = %s +local = %s/255.255.255.255 +server_port = %s +server_type = 5 +""" % (proxyhost, proxyhost, PROXY_PORT)) + fp.close() + return ClientProcess((C_httperf, "--hog", + "--server=" + TARGET, + "--uri=" + prefix, + "--period=" + str(rate), + "--num-calls=5", "--num-conns=2000", + "--wset=10000,1"), + { 'LD_PRELOAD' : C_tsocks, + 'TSOCKS_CONF_FILE' : + os.path.join(os.getcwd(), "tsocks.conf") }) + +# Benchmarks. + +def bench_fixedrate_direct(report): + client = None + proxy = None + try: + proxy = p_nylon() + + for cap in range(10, 810, 10): + sys.stderr.write("fixedrate,direct,%d\n" % (cap * 1000)) + try: + client = c_curl('http://' + TARGET + '/bm-fixedrate.cgi/' + + str(int(cap * 1000 * FUDGE_FIXEDRATE)), + PROXY) + monitor(report, "fixedrate,direct,%d" % (cap * 1000), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + +def bench_fixedrate_tor(report): + client = None + proxy = None + proxyl = None + try: + proxy = p_tor_direct() + proxyl = c_tor_direct() + time.sleep(5) # tor startup is slow + + for cap in range(10,810,10): + sys.stderr.write("fixedrate,tor,%d\n" % (cap * 1000)) + try: + client = c_curl('http://' + TARGET + '/bm-fixedrate.cgi/' + + str(int(cap * 1000 * FUDGE_FIXEDRATE)), + '127.0.0.1') + monitor(report, "fixedrate,tor,%d" % (cap * 1000), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + if proxyl is not None: + proxyl.terminate() + proxyl.wait() + +def bench_files_direct(report, prefix): + client = None + proxy = None + try: + proxy = p_nylon() + + for cps in range(1,81): + sys.stderr.write("files.%s,direct,%d\n" % (prefix, cps)) + try: + client = c_httperf(prefix, 1./cps, PROXY_IP) + monitor(report, "files.%s,direct,%d" % (prefix, cps), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + +def bench_files_tor(report, prefix): + client = None + proxy = None + proxyl = None + try: + proxy = p_tor_direct() + proxyl = c_tor_direct() + time.sleep(5) # tor startup is slow + + for cps in range(1,81): + sys.stderr.write("files.%s,tor,%d\n" % (prefix, cps)) + try: + client = c_httperf(prefix, 1./cps, '127.0.0.1') + monitor(report, "files.%s,tor,%d" % (prefix, cps), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + if proxyl is not None: + proxyl.terminate() + proxyl.wait() + +if __name__ == '__main__': + sys.stdout.write("benchmark,relay,cap,obs,up,down\n") + bench_fixedrate_direct(sys.stdout) + bench_fixedrate_tor(sys.stdout) + bench_files_direct(sys.stdout, "fixed") + bench_files_tor(sys.stdout, "fixed") + bench_files_direct(sys.stdout, "pareto") + bench_files_tor(sys.stdout, "pareto") diff --git a/scripts/bm-fixedrate-cgi.c b/scripts/bm-fixedrate-cgi.c new file mode 100644 index 0000000..2b48f98 --- /dev/null +++ b/scripts/bm-fixedrate-cgi.c @@ -0,0 +1,158 @@ +#define _XOPEN_SOURCE 600 +#define _POSIX_C_SOURCE 200112 + +#include <stdbool.h> +#include <stddef.h> + +#include <errno.h> +#include <math.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +/* 1400 bytes is a safe figure for per-packet transmissible payload. */ +#define BLOCKSZ 1400 + + +#if __GNUC__ >= 3 +#define NORETURN void __attribute__((noreturn)) +#else +#define NORETURN void +#endif + +extern char **environ; + +static NORETURN +error_400(const char *msg) +{ + char **p; + printf("Status: 400 Bad Request\nContent-Type: text/plain\n\n" + "400 Bad Request (%s)\nCGI environment dump follows:\n\n", msg); + for (p = environ; *p; p++) + puts(*p); + exit(0); +} + +static NORETURN +error_500(const char *syscall) +{ + printf("Status: 500 Internal Server Error\nContent-Type:text/plain\n\n" + "500 Internal Server Error: %s: %s\n", + syscall, strerror(errno)); + exit(0); +} + +static void +generate(unsigned long rate, bool dryrun) +{ + double interval; + timer_t timerid; + struct sigevent sev; + struct itimerspec its; + sigset_t mask; + int sig; + char *data; + size_t bufsz = BLOCKSZ; + + /* You send data at R bytes per second in 1400-byte blocks by + calling write() every 1/(R/1400) second. However, despite our + use of the high-resolution interval timers, we cannot count on + being scheduled more often than every 1/CLOCKS_PER_SEC seconds, + so if we need to send data faster than that, bump up the block + size instead. */ + interval = 1./(rate/(double)BLOCKSZ); + + if (interval < 1./CLOCKS_PER_SEC) { + interval = 1./CLOCKS_PER_SEC; + bufsz = rate / CLOCKS_PER_SEC; + } + + its.it_value.tv_sec = lrint(floor(interval)); + its.it_value.tv_nsec = lrint((interval - its.it_value.tv_sec) * 1e9); + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + if (dryrun) { + printf("Content-Type: text/plain\n\n" + "Goal %lu bytes per second:\n" + "would send %lu bytes every %f seconds\n" + " " " " " " %lu sec + %lu nsec\n", + rate, bufsz, interval, + (unsigned long)its.it_value.tv_sec, + (unsigned long)its.it_value.tv_nsec); + return; + } + + data = malloc(bufsz); + if (!data) + error_500("malloc"); + memset(data, 0, bufsz); + + fflush(stdout); + setvbuf(stdout, 0, _IONBF, 0); + fputs("Content-Type: application/octet-stream\n" + "Cache-Control: no-store,no-cache\n\n", stdout); + + sigemptyset(&mask); + sigaddset(&mask, SIGRTMIN); + if (sigprocmask(SIG_SETMASK, &mask, 0)) + error_500("sigprocmask"); + + memset(&sev, 0, sizeof sev); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGRTMIN; + sev.sigev_value.sival_ptr = &timerid; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + error_500("timer_create"); + + if (timer_settime(timerid, 0, &its, 0)) + error_500("timer_settime"); + + do { + size_t r, n = bufsz; + char *p = data; + do { + r = fwrite(p, 1, n, stdout); + if (r == 0) + exit(1); + n -= r; + p += r; + } while (n > 0); + } while (sigwait(&mask, &sig) == 0); +} + +int +main(void) +{ + unsigned long rate; + char *endp; + bool dryrun; + char *request_method = getenv("REQUEST_METHOD"); + char *query_string = getenv("QUERY_STRING"); + char *path_info = getenv("PATH_INFO"); + + if (!request_method || strcmp(request_method, "GET")) + error_400("method not supported"); + if (query_string && strcmp(query_string, "")) + error_400("no query parameters accepted"); + + if (!path_info || path_info[0] != '/') + error_400("malformed or missing PATH_INFO"); + + rate = strtoul(path_info+1, &endp, 10); + if (endp == path_info+1) + error_400("missing rate (specify bytes per second)"); + + if (endp[0] == '\0') + dryrun = false; + else if (endp[0] == ';' && endp[1] == 'd' && endp[2] == '\0') + dryrun = true; + else + error_400("unrecognized extra arguments"); + + generate(rate, dryrun); + return 0; +} diff --git a/scripts/bm-genfiles.py b/scripts/bm-genfiles.py new file mode 100755 index 0000000..dcd1030 --- /dev/null +++ b/scripts/bm-genfiles.py @@ -0,0 +1,162 @@ +#! /usr/bin/python + +"""Generate files for network performance testing. + +The default behavior is to generate 10,000 files all of which are +exactly 3584 bytes long, because that is approximately how big +Flickr's 75x75px JPEG thumbnails are. You can request a different +size, or you can request that the file sizes instead follow a bounded +Pareto distribution with tunable alpha. + +The files have names compatible with httperf's --wset mode. Since +it insists on .html as a file suffix, the files are syntactically +valid HTML. Their contents are word salad. + +There is one mandatory command line argument: the path to the root +of the tree of files to generate. It is created if it doesn't +already exist. If it already exists, its contents will be erased! +(so don't use '.')""" + +from __future__ import division + +import argparse +import errno +import math +import os +import os.path +import random +import shutil +import sys +import textwrap + +def ensure_empty_dir(dpath): + todelete = [] + try: + todelete = os.listdir(dpath) + except OSError, e: + # Don't delete a _file_ that's in the way. + # Don't try to create parent directories that are missing. + if e.errno != errno.ENOENT: + raise + os.mkdir(dpath) + return + for f in todelete: + p = os.path.join(dpath, f) + try: + os.remove(p) + except OSError, e: + if e.errno != errno.EISDIR and e.errno != errno.EPERM: + raise + shutil.rmtree(p) + +def ensure_parent_directories(path): + try: + os.makedirs(os.path.dirname(path)) + except OSError, e: + if e.errno != errno.EEXIST: + raise + +def word_salad(f, words, seed, maxlen): + rng = random.Random(seed) + salad = [] + slen = 0 + while slen < maxlen - 1: + nl = rng.randint(1, min((maxlen - 1) - slen, len(words))) - 1 + w = rng.choice(words[nl]) + salad.append(w) + slen += len(w) + 1 + salad = textwrap.fill(" ".join(salad), 78) + while len(salad) < maxlen-1: + salad += '.' + salad += '\n' + f.write(salad) + +def load_words(): + words = [ [] for _ in xrange(15) ] + for w in open('/usr/share/dict/words'): + w = w.strip() + if w.endswith("'s"): continue + if len(w) > 15 or len(w) < 2: continue + words[len(w)-1].append(w) + # special case words[0] as dictfiles often have every single single letter + words[0].extend(('a','I')) + return words + +FILE_PREFIX = '<!doctype html>\n<title>{0}</title>\n<p>\n' +FILE_SUFFIX = '</p>\n' + +def create_one(parent, ctr, digits, words, filesize, seed, resume, progress): + label = format(ctr, '0'+str(digits)+'d') + fname = os.path.join(parent, *label) + '.html' + ensure_parent_directories(fname) + + if os.path.exists(fname): + if not resume: raise RuntimeError('{0} already exists'.format(fname)) + return + + prefix = FILE_PREFIX.format(label) + suffix = FILE_SUFFIX + limit = filesize - (len(prefix) + len(suffix)) + if limit <= 0: + raise TypeError("{0} bytes is too small to generate (minimum {1})" + .format(filesize, len(prefix)+len(suffix))) + + if progress: + sys.stderr.write(fname + '\n') + + f = open(fname, "w") + f.write(prefix) + word_salad(f, words, ctr+seed, limit) + f.write(suffix) + +def bounded_pareto(rng, alpha, L, H): + while True: + U = rng.random() + if U < 1: break + Ha = H**alpha + La = L**alpha + return int(round((-(U*Ha - U*La - Ha)/(Ha * La)) ** (-1/alpha))) + +if __name__ == '__main__': + + default_filesize = 3584 + default_filecount = 10000 # 0/0/0/0.html through 9/9/9/9.html + + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('directory', + help='directory to populate with files') + parser.add_argument('-c', '--count', type=int, default=default_filecount, + help='number of files to generate') + sg = parser.add_mutually_exclusive_group() + sg.add_argument('-s', '--size', type=int, default=default_filesize, + help='all files will be exactly SIZE bytes long') + sg.add_argument('-p', '--pareto', type=float, + metavar='ALPHA', + help='file sizes will follow a bounded Pareto distribution' + ' with parameter ALPHA') + parser.add_argument('-m', '--minsize', type=int, default=512, + help='minimum file size (only useful with -p)') + parser.add_argument('-M', '--maxsize', type=int, default=2*1024*1024, + help='maximum file size (only useful with -p)') + parser.add_argument('-S', '--seed', type=int, default=719, + help='seed for random number generator') + parser.add_argument('--resume', action='store_true', + help='resume an interrupted run where it left off') + parser.add_argument('--progress', action='store_true', + help='report progress') + + args = parser.parse_args() + digits = len(str(args.count - 1)) + rng = random.Random(args.seed) + + words = load_words() + if not args.resume: + ensure_empty_dir(args.directory) + + size = args.size + for i in xrange(args.count): + if args.pareto is not None: + size = bounded_pareto(rng, args.pareto, args.minsize, args.maxsize) + create_one(args.directory, i, digits, words, size, args.seed, + args.resume, args.progress) diff --git a/scripts/bm-mcurl.c b/scripts/bm-mcurl.c new file mode 100644 index 0000000..ac24f3a --- /dev/null +++ b/scripts/bm-mcurl.c @@ -0,0 +1,196 @@ +/* Use libcurl to retrieve many URLs, according to a wildcard pattern, + starting new connections at a constant rate until we hit a limit. + + Command line arguments -- all are required, but 'proxy' may be an + empty string if you want direct connections: + + bm-mcurl [-v] rate limit proxy url-pattern [url-pattern ...] + + There is no output; it is assumed that you are monitoring traffic + externally. Passing -v turns on CURLOPT_VERBOSE debugging spew. + */ + +#define _XOPEN_SOURCE 600 + +#include <stdbool.h> +#include <stddef.h> + +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include <curl/curl.h> +#include "tool_urlglob.h" + +#define NORETURN __attribute__((noreturn)) + +static bool verbose = false; + +static size_t +discard_data(char *ptr, size_t size, size_t nmemb, void *userdata) +{ + return size * nmemb; +} + +static size_t +read_abort(void *ptr, size_t size, size_t nmemb, void *userdata) +{ + /* we don't do anything that should require this to be called, + so if it does get called, something is wrong */ + return CURL_READFUNC_ABORT; +} + +static CURL * +setup_curl_easy_handle(char *proxy) +{ + CURL *h = curl_easy_init(); + if (!h) abort(); + +#define SET_OR_CRASH(h, opt, param) \ + do { if (curl_easy_setopt(h, opt, param)) abort(); } while (0) + + SET_OR_CRASH(h, CURLOPT_VERBOSE, (unsigned long)verbose); + SET_OR_CRASH(h, CURLOPT_NOPROGRESS, 1L); + SET_OR_CRASH(h, CURLOPT_FAILONERROR, 1L); + SET_OR_CRASH(h, CURLOPT_USERAGENT, "bm-mcurl/0.1"); + SET_OR_CRASH(h, CURLOPT_ACCEPT_ENCODING, ""); + SET_OR_CRASH(h, CURLOPT_AUTOREFERER, 1L); + SET_OR_CRASH(h, CURLOPT_FOLLOWLOCATION, 1L); + SET_OR_CRASH(h, CURLOPT_MAXREDIRS, 30L); + + SET_OR_CRASH(h, CURLOPT_WRITEFUNCTION, discard_data); + SET_OR_CRASH(h, CURLOPT_WRITEDATA, NULL); + SET_OR_CRASH(h, CURLOPT_READFUNCTION, read_abort); + SET_OR_CRASH(h, CURLOPT_READDATA, NULL); + + if (proxy && proxy[0]) { + SET_OR_CRASH(h, CURLOPT_PROXY, proxy); + SET_OR_CRASH(h, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME); + } +#undef SET_OR_CRASH +} + +static bool +process_events_once(CURLM *multi, unsigned long timeout_max) +{ + struct timeval tv; + int rc; /* select() return code */ + + fd_set fdread; + fd_set fdwrite; + fd_set fdexcept; + int maxfd = -1; + + unsigned long timeout = 1000000; /* one second - ultimate default */ + long curl_tout_ms = -1; + + /* get fd sets for all pending transfers */ + FD_ZERO(&fdread); + FD_ZERO(&fdwrite); + FD_ZERO(&fdexcept); + curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcept, &maxfd); + + /* timeout */ + if (timeout_max > 0 && timeout_max < timeout) + timeout = timeout_max; + + curl_multi_timeout(multi_handle, &curl_tout_ms); + + if (curl_tout_ms >= 0) { + unsigned long curl_tout_us = ((unsigned long)curl_tout_ms) * 1000; + if (timeout > curl_tout_us) + timeout = curl_tout_us; + } + + tv.tv_sec = timeout / 1000000; + if(tv.tv_sec >= 1) + tv.tv_sec = 1; + else + tv.tv_usec = timeout % 1000000; + + do { + rc = select(maxfd+1, &fdread, &fdwrite, &fdexcept, &tv); + } while (rc == -1 && errno == EINTR); + + if (rc > 0) { + int still_running; + curl_multi_perform(multi_handle, &still_running); + return !!still_running; + } else + abort(); +} + +/* Note: this function must not return until we are ready to start + another connection. */ +static void +queue_one(CURLM *multi, unsigned long rate, unsigned long limit, + char *proxy, char *url) +{ + +} + +static void +run(unsigned long rate, unsigned long limit, char *proxy, char **urls) +{ + CURLM *multi; + curl_global_init(); + multi = curl_multi_init(); + if (!multi) abort(); + + for (char **upat = urls; *upat; url++) { + URLGlob *uglob; + int *n; + if (glob_url(&uglob, *upat, &n, stderr)) + continue; + do { + char *url; + if (glob_next_url(&url, uglob)) abort(); + queue_one(multi, rate, limit, proxy, url); /* takes ownership */ + } while (--n); + glob_cleanup(uglob); + } + + /* spin the event loop until all outstanding transfers complete */ + while (process_events_once(multi, 0)); + + curl_multi_cleanup(multi); +} + +static NORETURN +usage(const char *av0, const char *complaint) +{ + fprintf(stderr, + "%s\nusage: %s [-v] rate limit proxy url [url...]\n", + complaint, av0); + exit(2); +} + +int +main(int argc, char **argv) +{ + unsigned long rate; + unsigned long limit; + char *endp; + + if (argv[1] && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--verbose"))) { + verbose = true; + argc--; + argv[1] = argv[0]; + argv++; + } + + if (argc < 5) + usage("not enough arguments"); + + rate = strtoul(argv[1], &endp, 10); + if (endp == argv[1] || *endp) + usage("rate must be a positive integer (connections per second)"); + + limit = strtoul(argv[2], &endp, 10); + if (endp == argv[2] || *endp) + usage("limit must be a positive integer (max outstanding requests)"); + + run(rate, limit, argv[3], argv+4); + return 0; +} diff --git a/scripts/tool_urlglob.c b/scripts/tool_urlglob.c new file mode 100644 index 0000000..d714971 --- /dev/null +++ b/scripts/tool_urlglob.c @@ -0,0 +1,516 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ | | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * ___|___/|_| ______| + * + * Copyright (C) 1998 - 2011, Daniel Stenberg, daniel@haxx.se, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <curl/curl.h> + +#include "tool_urlglob.h" + +typedef enum { + GLOB_OK, + GLOB_NO_MEM, + GLOB_ERROR +} GlobCode; + +/* + * glob_word() + * + * Input a full globbed string, set the forth argument to the amount of + * strings we get out of this. Return GlobCode. + */ +static GlobCode glob_word(URLGlob *, /* object anchor */ + char *, /* globbed string */ + size_t, /* position */ + int *); /* returned number of strings */ + +static GlobCode glob_set(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a set expression with the point behind the opening '{' + ','-separated elements are collected until the next closing '}' + */ + URLPattern *pat; + GlobCode res; + bool done = false; + char* buf = glob->glob_buffer; + + pat = &glob->pattern[glob->size / 2]; + /* patterns 0,1,2,... correspond to size=1,3,5,... */ + pat->type = UPTSet; + pat->content.Set.size = 0; + pat->content.Set.ptr_s = 0; + pat->content.Set.elements = NULL; + + ++glob->size; + + while(!done) { + switch (*pattern) { + case '\0': /* URL ended while set was still open */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "unmatched brace at pos %zu\n", pos); + return GLOB_ERROR; + + case '{': + case '[': /* no nested expressions at this time */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "nested braces not supported at pos %zu\n", pos); + return GLOB_ERROR; + + case ',': + case '}': /* set element completed */ + *buf = '\0'; + if(pat->content.Set.elements) { + char **new_arr = realloc(pat->content.Set.elements, + (pat->content.Set.size + 1) * sizeof(char*)); + if(!new_arr) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + } + pat->content.Set.elements = new_arr; + } + else + pat->content.Set.elements = malloc(sizeof(char*)); + if(!pat->content.Set.elements) { + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + pat->content.Set.elements[pat->content.Set.size] = + strdup(glob->glob_buffer); + if(!pat->content.Set.elements[pat->content.Set.size]) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + ++pat->content.Set.size; + + if(*pattern == '}') { + /* entire set pattern completed */ + int wordamount; + + /* always check for a literal (may be "") between patterns */ + res = glob_word(glob, ++pattern, ++pos, &wordamount); + if(res) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + return res; + } + + *amount = pat->content.Set.size * wordamount; + + done = true; + continue; + } + + buf = glob->glob_buffer; + ++pattern; + ++pos; + break; + + case ']': /* illegal closing bracket */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "illegal pattern at pos %zu\n", pos); + return GLOB_ERROR; + + case '\': /* escaped character, skip '' */ + if(pattern[1]) { + ++pattern; + ++pos; + } + /* intentional fallthrough */ + default: + *buf++ = *pattern++; /* copy character to set element */ + ++pos; + } + } + return GLOB_OK; +} + +static GlobCode glob_range(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a range expression with the point behind the opening '[' + - char range: e.g. "a-z]", "B-Q]" + - num range: e.g. "0-9]", "17-2000]" + - num range with leading zeros: e.g. "001-999]" + expression is checked for well-formedness and collected until the next ']' + */ + URLPattern *pat; + char *c; + char sep; + char sep2; + int step; + int rc; + GlobCode res; + int wordamount = 1; + + pat = &glob->pattern[glob->size / 2]; + /* patterns 0,1,2,... correspond to size=1,3,5,... */ + ++glob->size; + + if(ISALPHA(*pattern)) { + /* character range detected */ + char min_c; + char max_c; + + pat->type = UPTCharRange; + + rc = sscanf(pattern, "%c-%c%c%d%c", &min_c, &max_c, &sep, &step, &sep2); + + if((rc < 3) || (min_c >= max_c) || ((max_c - min_c) > ('z' - 'a'))) { + /* the pattern is not well-formed */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: bad range specification after pos %zu\n", pos); + return GLOB_ERROR; + } + + /* check the (first) separating character */ + if((sep != ']') && (sep != ':')) { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: unsupported character (%c) after range at pos %zu\n", + sep, pos); + return GLOB_ERROR; + } + + /* if there was a ":[num]" thing, use that as step or else use 1 */ + pat->content.CharRange.step = + ((sep == ':') && (rc == 5) && (sep2 == ']')) ? step : 1; + + pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c; + pat->content.CharRange.max_c = max_c; + } + else if(ISDIGIT(*pattern)) { + /* numeric range detected */ + int min_n; + int max_n; + + pat->type = UPTNumRange; + pat->content.NumRange.padlength = 0; + + rc = sscanf(pattern, "%d-%d%c%d%c", &min_n, &max_n, &sep, &step, &sep2); + + if((rc < 2) || (min_n > max_n)) { + /* the pattern is not well-formed */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: bad range specification after pos %zu\n", pos); + return GLOB_ERROR; + } + pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n; + pat->content.NumRange.max_n = max_n; + + /* if there was a ":[num]" thing, use that as step or else use 1 */ + pat->content.NumRange.step = + ((sep == ':') && (rc == 5) && (sep2 == ']')) ? step : 1; + + if(*pattern == '0') { + /* leading zero specified */ + c = pattern; + while(ISDIGIT(*c)) { + c++; + ++pat->content.NumRange.padlength; /* padding length is set for all + instances of this pattern */ + } + } + } + else { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "illegal character in range specification at pos %zu\n", pos); + return GLOB_ERROR; + } + + c = (char*)strchr(pattern, ']'); /* continue after next ']' */ + if(c) + c++; + else { + snprintf(glob->errormsg, sizeof(glob->errormsg), "missing ']'"); + return GLOB_ERROR; /* missing ']' */ + } + + /* always check for a literal (may be "") between patterns */ + + res = glob_word(glob, c, pos + (c - pattern), &wordamount); + if(res == GLOB_ERROR) { + wordamount = 1; + res = GLOB_OK; + } + + if(!res) { + if(pat->type == UPTCharRange) + *amount = wordamount * (pat->content.CharRange.max_c - + pat->content.CharRange.min_c + 1); + else + *amount = wordamount * (pat->content.NumRange.max_n - + pat->content.NumRange.min_n + 1); + } + + return res; /* GLOB_OK or GLOB_NO_MEM */ +} + +static GlobCode glob_word(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a literal string component of a URL + special characters '{' and '[' branch to set/range processing functions + */ + char* buf = glob->glob_buffer; + size_t litindex; + GlobCode res = GLOB_OK; + + *amount = 1; /* default is one single string */ + + while(*pattern != '\0' && *pattern != '{' && *pattern != '[') { + if(*pattern == '}' || *pattern == ']') { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "unmatched close brace/bracket at pos %zu\n", pos); + return GLOB_ERROR; + } + + /* only allow \ to escape known "special letters" */ + if(*pattern == '\' && + (*(pattern+1) == '{' || *(pattern+1) == '[' || + *(pattern+1) == '}' || *(pattern+1) == ']') ) { + + /* escape character, skip '' */ + ++pattern; + ++pos; + } + *buf++ = *pattern++; /* copy character to literal */ + ++pos; + } + *buf = '\0'; + litindex = glob->size / 2; + /* literals 0,1,2,... correspond to size=0,2,4,... */ + glob->literal[litindex] = strdup(glob->glob_buffer); + if(!glob->literal[litindex]) { + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + ++glob->size; + + switch (*pattern) { + case '\0': + /* singular URL processed */ + break; + + case '{': + /* process set pattern */ + res = glob_set(glob, ++pattern, ++pos, amount); + break; + + case '[': + /* process range pattern */ + res = glob_range(glob, ++pattern, ++pos, amount); + break; + } + + if(res) + Curl_safefree(glob->literal[litindex]); + + return res; +} + +int glob_url(URLGlob** glob, char* url, int *urlnum, FILE *error) +{ + /* + * We can deal with any-size, just make a buffer with the same length + * as the specified URL! + */ + URLGlob *glob_expand; + int amount; + char *glob_buffer; + GlobCode res; + + *glob = NULL; + + glob_buffer = malloc(strlen(url) + 1); + if(!glob_buffer) + return CURLE_OUT_OF_MEMORY; + + glob_expand = calloc(1, sizeof(URLGlob)); + if(!glob_expand) { + Curl_safefree(glob_buffer); + return CURLE_OUT_OF_MEMORY; + } + glob_expand->size = 0; + glob_expand->urllen = strlen(url); + glob_expand->glob_buffer = glob_buffer; + glob_expand->beenhere = 0; + + res = glob_word(glob_expand, url, 1, &amount); + if(!res) + *urlnum = amount; + else { + if(error && glob_expand->errormsg[0]) { + /* send error description to the error-stream */ + fprintf(error, "curl: (%d) [globbing] %s", + (res == GLOB_NO_MEM) ? CURLE_OUT_OF_MEMORY : CURLE_URL_MALFORMAT, + glob_expand->errormsg); + } + /* it failed, we cleanup */ + Curl_safefree(glob_buffer); + Curl_safefree(glob_expand); + *urlnum = 1; + return (res == GLOB_NO_MEM) ? CURLE_OUT_OF_MEMORY : CURLE_URL_MALFORMAT; + } + + *glob = glob_expand; + return CURLE_OK; +} + +void glob_cleanup(URLGlob* glob) +{ + size_t i; + int elem; + + for(i = glob->size - 1; i < glob->size; --i) { + if(!(i & 1)) { /* even indexes contain literals */ + Curl_safefree(glob->literal[i/2]); + } + else { /* odd indexes contain sets or ranges */ + if((glob->pattern[i/2].type == UPTSet) && + (glob->pattern[i/2].content.Set.elements)) { + for(elem = glob->pattern[i/2].content.Set.size - 1; + elem >= 0; + --elem) { + Curl_safefree(glob->pattern[i/2].content.Set.elements[elem]); + } + Curl_safefree(glob->pattern[i/2].content.Set.elements); + } + } + } + Curl_safefree(glob->glob_buffer); + Curl_safefree(glob); +} + +int glob_next_url(char **globbed, URLGlob *glob) +{ + URLPattern *pat; + char *lit; + size_t i; + size_t j; + size_t len; + size_t buflen = glob->urllen + 1; + char *buf = glob->glob_buffer; + + *globbed = NULL; + + if(!glob->beenhere) + glob->beenhere = 1; + else { + bool carry = true; + + /* implement a counter over the index ranges of all patterns, + starting with the rightmost pattern */ + for(i = glob->size / 2 - 1; carry && (i < glob->size); --i) { + carry = false; + pat = &glob->pattern[i]; + switch (pat->type) { + case UPTSet: + if((pat->content.Set.elements) && + (++pat->content.Set.ptr_s == pat->content.Set.size)) { + pat->content.Set.ptr_s = 0; + carry = true; + } + break; + case UPTCharRange: + pat->content.CharRange.ptr_c = (char)(pat->content.CharRange.step + + (int)((unsigned char)pat->content.CharRange.ptr_c)); + if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) { + pat->content.CharRange.ptr_c = pat->content.CharRange.min_c; + carry = true; + } + break; + case UPTNumRange: + pat->content.NumRange.ptr_n += pat->content.NumRange.step; + if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) { + pat->content.NumRange.ptr_n = pat->content.NumRange.min_n; + carry = true; + } + break; + default: + printf("internal error: invalid pattern type (%d)\n", (int)pat->type); + return CURLE_FAILED_INIT; + } + } + if(carry) { /* first pattern ptr has run into overflow, done! */ + /* TODO: verify if this should actally return CURLE_OK. */ + return CURLE_OK; /* CURLE_OK to match previous behavior */ + } + } + + for(j = 0; j < glob->size; ++j) { + if(!(j&1)) { /* every other term (j even) is a literal */ + lit = glob->literal[j/2]; + len = snprintf(buf, buflen, "%s", lit); + buf += len; + buflen -= len; + } + else { /* the rest (i odd) are patterns */ + pat = &glob->pattern[j/2]; + switch(pat->type) { + case UPTSet: + if(pat->content.Set.elements) { + len = strlen(pat->content.Set.elements[pat->content.Set.ptr_s]); + snprintf(buf, buflen, "%s", + pat->content.Set.elements[pat->content.Set.ptr_s]); + buf += len; + buflen -= len; + } + break; + case UPTCharRange: + *buf++ = pat->content.CharRange.ptr_c; + break; + case UPTNumRange: + len = snprintf(buf, buflen, "%0*d", + pat->content.NumRange.padlength, + pat->content.NumRange.ptr_n); + buf += len; + buflen -= len; + break; + default: + printf("internal error: invalid pattern type (%d)\n", (int)pat->type); + return CURLE_FAILED_INIT; + } + } + } + *buf = '\0'; + + *globbed = strdup(glob->glob_buffer); + if(!*globbed) + return CURLE_OUT_OF_MEMORY; + + return CURLE_OK; +} diff --git a/scripts/tool_urlglob.h b/scripts/tool_urlglob.h new file mode 100644 index 0000000..562b08e --- /dev/null +++ b/scripts/tool_urlglob.h @@ -0,0 +1,69 @@ +#ifndef HEADER_CURL_TOOL_URLGLOB_H +#define HEADER_CURL_TOOL_URLGLOB_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ | | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * ___|___/|_| ______| + * + * Copyright (C) 1998 - 2011, Daniel Stenberg, daniel@haxx.se, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +typedef enum { + UPTSet = 1, + UPTCharRange, + UPTNumRange +} URLPatternType; + +typedef struct { + URLPatternType type; + union { + struct { + char **elements; + short size; + short ptr_s; + } Set; + struct { + char min_c; + char max_c; + char ptr_c; + int step; + } CharRange; + struct { + int min_n; + int max_n; + short padlength; + int ptr_n; + int step; + } NumRange ; + } content; +} URLPattern; + +typedef struct { + char *literal[10]; + URLPattern pattern[9]; + size_t size; + size_t urllen; + char *glob_buffer; + char beenhere; + char errormsg[80]; /* error message buffer */ +} URLGlob; + +int glob_url(URLGlob**, char*, int *, FILE *); +int glob_next_url(char **, URLGlob *); +void glob_cleanup(URLGlob* glob); + +#endif /* HEADER_CURL_TOOL_URLGLOB_H */