From 02e166d85d58681e007ce34a6b222e09a53c1ed8 Mon Sep 17 00:00:00 2001 From: Tom Nicholls Date: Fri, 10 Oct 2014 16:58:33 +0100 Subject: [PATCH] Better documentation for warcexclude.py --- warcexclude.py | 115 +++++++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 46 deletions(-) diff --git a/warcexclude.py b/warcexclude.py index 2e297a4..aff3960 100755 --- a/warcexclude.py +++ b/warcexclude.py @@ -33,53 +33,12 @@ import argparse import time -parser = argparse.ArgumentParser(description='Recreate a WARC record, ' - 'optionally excluding records which match an arbitrary number of ' - 'given header/regex pairs. If multiple patterns are given, ' - 'by default exclude only if all patterns match.') -parser.add_argument("-e", "--do-not-expose-http-headers", - help="Don't expose additional headers if the record " - "payload is an HTTP response. Normally, " - "XHTTP-Response-Code contains the HTTP status " - "code from the record, XHTTP-Content-Type " - "contains the value of the HTTP Content-Type " - "header, and XHTTP-Body contains the full " - "content body.", - action="store_true") -parser.add_argument('-i', '--in-filename', metavar='inwf', - help='Input WARC filename. Default: stdin.') -parser.add_argument('-o', '--out-filename', metavar='outwf', - help='Output WARC filename. Default: stdout.') - - -gzinput = parser.add_mutually_exclusive_group() -gzinput.add_argument('-gz', '--gzipped-input', action="store_true", - help='Treat input stream as gzipped. Default: guess, ' - 'which fails on stdin.') -gzinput.add_argument('-gp', '--plain-input', action="store_true", - help='Treat input stream as plain text.') - -parser.add_argument('-G', '--gzipped-output', action="store_true", - help='Gzip the output stream (record-wise).') - -parser.add_argument('-a', '--match-any', action="store_true", - help='Exclude if any one pattern is matched. 
' - 'Default: all') - -parser.add_argument('pattern', metavar='patt', nargs='+', - help="field/regexp, where field is a " - "WARC header and regexp is a pattern to match against. " - "Example pattern: WARC-Target-URI/^https?://www.example.com/.*$" - "If the field is of the format XFile/filepath, then the given " - "file will be loaded and each line interpreted as a pattern.") - - - -args = parser.parse_args() - -uuidsexcluded = set() - def parse_exc_args(argl, exclist=list()): + """Given a list of patterns and an optional list of 2-tuples. If given + a pattern of the form "XFile/filename", fetches the file and recurses + through it, treating each line in the file as a pattern. + + Returns: a list containing ("Header",) for each item""" print argl for arg in argl: if '/' not in arg: @@ -95,9 +54,18 @@ def parse_exc_args(argl, exclist=list()): return exclist def check_headers(exclist, record, just_one=False): + """Tests the given record against the list of exclusion patterns given in + exclist. If just_one is True, testing is optimised by returning after + any match has been made. + + Returns: The number of matches that have been made""" matches = 0 for tup in exclist: heads = [h for h in record.headers if h[0] == tup[0]] + # Try to avoid processing the HTTP Response content unless we have + # a pattern which requires it, as it's expensive. + # This could be further optimised by caching the body etc. if + # calculated once. if (record.type == WarcRecord.RESPONSE and record.url.startswith('http') and not args.do_not_expose_http_headers): @@ -125,6 +93,56 @@ def check_headers(exclist, record, just_one=False): return matches return matches +##### +#ARGUMENT PARSER +##### + +parser = argparse.ArgumentParser(description='Recreate a WARC record, ' + 'optionally excluding records which match an arbitrary number of ' + 'given header/regex pairs. 
If multiple patterns are given, ' + 'by default exclude only if all patterns match.') +parser.add_argument("-e", "--do-not-expose-http-headers", + help="Don't expose additional headers if the record " + "payload is an HTTP response. Normally, " + "XHTTP-Response-Code contains the HTTP status " + "code from the record, XHTTP-Content-Type " + "contains the value of the HTTP Content-Type " + "header, and XHTTP-Body contains the full " + "content body.", + action="store_true") +parser.add_argument('-i', '--in-filename', metavar='inwf', + help='Input WARC filename. Default: stdin.') +parser.add_argument('-o', '--out-filename', metavar='outwf', + help='Output WARC filename. Default: stdout.') + + +gzinput = parser.add_mutually_exclusive_group() +gzinput.add_argument('-gz', '--gzipped-input', action="store_true", + help='Treat input stream as gzipped. Default: guess, ' + 'which fails on stdin.') +gzinput.add_argument('-gp', '--plain-input', action="store_true", + help='Treat input stream as plain text.') + +parser.add_argument('-G', '--gzipped-output', action="store_true", + help='Gzip the output stream (record-wise).') + +parser.add_argument('-a', '--match-any', action="store_true", + help='Exclude if any one pattern is matched. ' + 'Default: all') + +parser.add_argument('pattern', metavar='patt', nargs='+', + help="field/regexp, where field is a " + "WARC header and regexp is a pattern to match against. 
" + "Example pattern: WARC-Target-URI/^https?://www.example.com/.*$" + "If the field is of the format XFile/filepath, then the given " + "file will be loaded and each line interpreted as a pattern.") + + + +args = parser.parse_args() + +uuidsexcluded = set() + exclist = parse_exc_args(args.pattern) # In theory this could be agnostic as to whether the stream is compressed or @@ -144,6 +162,10 @@ def check_headers(exclist, record, just_one=False): inwf = WarcRecord.open_archive(filename=args.in_filename, mode='rb', gzip=gzi) +##### +#MAIN +##### + outf = sys.stdout if args.out_filename is not None: outf = open(args.out_filename, 'wb') @@ -175,3 +197,4 @@ def check_headers(exclist, record, just_one=False): sys.stderr.write('-') uuidsexcluded.add(record.id) sys.stderr.write("Done.\n") +