Skip to content
This repository has been archived by the owner on Jan 12, 2025. It is now read-only.

Commit

Permalink
Better documentation for warcexclude.py
Browse files Browse the repository at this point in the history
  • Loading branch information
pmyteh committed Oct 10, 2014
1 parent 2657463 commit 02e166d
Showing 1 changed file with 69 additions and 46 deletions.
115 changes: 69 additions & 46 deletions warcexclude.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,53 +33,12 @@
import argparse
import time

# Command-line interface. Patterns are positional "field/regexp" arguments;
# by default a record is excluded only when every pattern matches, and
# --match-any relaxes that to any single match.
parser = argparse.ArgumentParser(
    description='Recreate a WARC record, '
                'optionally excluding records which match an arbitrary '
                'number of '
                'given header/regex pairs. If multiple patterns are given, '
                'by default exclude only if all patterns match.')
parser.add_argument("-e", "--do-not-expose-http-headers",
                    help="Don't expose additional headers if the record "
                         "payload is an HTTP response. Normally, "
                         "XHTTP-Response-Code contains the HTTP status "
                         "code from the record, XHTTP-Content-Type "
                         "contains the value of the HTTP Content-Type "
                         "header, and XHTTP-Body contains the full "
                         "content body.",
                    action="store_true")
parser.add_argument('-i', '--in-filename', metavar='inwf',
                    help='Input WARC filename. Default: stdin.')
parser.add_argument('-o', '--out-filename', metavar='outwf',
                    help='Output WARC filename. Default: stdout.')

# The two input-compression flags contradict each other, so argparse is asked
# to reject a command line that supplies both.
gzinput = parser.add_mutually_exclusive_group()
gzinput.add_argument('-gz', '--gzipped-input', action="store_true",
                     help='Treat input stream as gzipped. Default: guess, '
                          'which fails on stdin.')
gzinput.add_argument('-gp', '--plain-input', action="store_true",
                     help='Treat input stream as plain text.')

parser.add_argument('-G', '--gzipped-output', action="store_true",
                    help='Gzip the output stream (record-wise).')

parser.add_argument('-a', '--match-any', action="store_true",
                    help='Exclude if any one pattern is matched. '
                         'Default: all')

# NOTE: the example regex must be followed by a space; without it the help
# output fuses the pattern with the next sentence (".../.*$If the field...").
parser.add_argument('pattern', metavar='patt', nargs='+',
                    help="field/regexp, where field is a "
                         "WARC header and regexp is a pattern to match "
                         "against. "
                         "Example pattern: "
                         "WARC-Target-URI/^https?://www.example.com/.*$ "
                         "If the field is of the format XFile/filepath, "
                         "then the given "
                         "file will be loaded and each line interpreted as "
                         "a pattern.")

# Parsed once at module level; the rest of the script reads `args` directly.
args = parser.parse_args()

# IDs of the records that have been excluded so far.
uuidsexcluded = set()

def parse_exc_args(argl, exclist=list()):
"""Given a list of patterns and an optional list of 2-tuples. If given
a pattern of the form "XFile/filename", fetches the file and recurses
through it, treating each line in the file as a pattern.
Returns: a list containing ("Header",<compiled regex>) for each item"""
print argl
for arg in argl:
if '/' not in arg:
Expand All @@ -95,9 +54,18 @@ def parse_exc_args(argl, exclist=list()):
return exclist

def check_headers(exclist, record, just_one=False):
"""Tests the given record against the list of exclusion patterns given in
exclist. If just_one is True, testing is optimised by returning after
any match has been made.
Returns: The number of matches that have been made"""
matches = 0
for tup in exclist:
heads = [h for h in record.headers if h[0] == tup[0]]
# Try to avoid processing the HTTP Response content unless we have
# a pattern which requires it, as it's expensive.
# This could be further optimised by caching the body etc. if
# calculated once.
if (record.type == WarcRecord.RESPONSE
and record.url.startswith('http')
and not args.do_not_expose_http_headers):
Expand Down Expand Up @@ -125,6 +93,56 @@ def check_headers(exclist, record, just_one=False):
return matches
return matches

#####
#ARGUMENT PARSER
#####

# Command-line interface. Patterns are positional "field/regexp" arguments;
# by default a record is excluded only when every pattern matches, and
# --match-any relaxes that to any single match.
parser = argparse.ArgumentParser(
    description='Recreate a WARC record, '
                'optionally excluding records which match an arbitrary '
                'number of '
                'given header/regex pairs. If multiple patterns are given, '
                'by default exclude only if all patterns match.')
parser.add_argument("-e", "--do-not-expose-http-headers",
                    help="Don't expose additional headers if the record "
                         "payload is an HTTP response. Normally, "
                         "XHTTP-Response-Code contains the HTTP status "
                         "code from the record, XHTTP-Content-Type "
                         "contains the value of the HTTP Content-Type "
                         "header, and XHTTP-Body contains the full "
                         "content body.",
                    action="store_true")
parser.add_argument('-i', '--in-filename', metavar='inwf',
                    help='Input WARC filename. Default: stdin.')
parser.add_argument('-o', '--out-filename', metavar='outwf',
                    help='Output WARC filename. Default: stdout.')

# The two input-compression flags contradict each other, so argparse is asked
# to reject a command line that supplies both.
gzinput = parser.add_mutually_exclusive_group()
gzinput.add_argument('-gz', '--gzipped-input', action="store_true",
                     help='Treat input stream as gzipped. Default: guess, '
                          'which fails on stdin.')
gzinput.add_argument('-gp', '--plain-input', action="store_true",
                     help='Treat input stream as plain text.')

parser.add_argument('-G', '--gzipped-output', action="store_true",
                    help='Gzip the output stream (record-wise).')

parser.add_argument('-a', '--match-any', action="store_true",
                    help='Exclude if any one pattern is matched. '
                         'Default: all')

# NOTE: the example regex must be followed by a space; without it the help
# output fuses the pattern with the next sentence (".../.*$If the field...").
parser.add_argument('pattern', metavar='patt', nargs='+',
                    help="field/regexp, where field is a "
                         "WARC header and regexp is a pattern to match "
                         "against. "
                         "Example pattern: "
                         "WARC-Target-URI/^https?://www.example.com/.*$ "
                         "If the field is of the format XFile/filepath, "
                         "then the given "
                         "file will be loaded and each line interpreted as "
                         "a pattern.")

# Parsed once at module level; the rest of the script reads `args` directly.
args = parser.parse_args()

# IDs of the records that have been excluded so far.
uuidsexcluded = set()

# Expand the command-line patterns into the exclusion list used for matching.
exclist = parse_exc_args(args.pattern)

# In theory this could be agnostic as to whether the stream is compressed or
Expand All @@ -144,6 +162,10 @@ def check_headers(exclist, record, just_one=False):
inwf = WarcRecord.open_archive(filename=args.in_filename,
mode='rb', gzip=gzi)

#####
#MAIN
#####

outf = sys.stdout
if args.out_filename is not None:
outf = open(args.out_filename, 'wb')
Expand Down Expand Up @@ -175,3 +197,4 @@ def check_headers(exclist, record, just_one=False):
sys.stderr.write('-')
uuidsexcluded.add(record.id)
sys.stderr.write("Done.\n")

0 comments on commit 02e166d

Please sign in to comment.