Skip to content

Commit

Permalink
annotate dumppdf, and comment likely bugs
Browse files: browse the repository at this point in the history
  • Loading branch information
0xabu committed Sep 3, 2021
1 parent 7278d83 commit 45d2ce9
Showing 1 changed file with 42 additions and 27 deletions.
69 changes: 42 additions & 27 deletions tools/dumppdf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,6 +4,8 @@
import os.path
import re
import sys
from typing import Any, Container, Dict, Iterable, List, Optional, TextIO, \
Union, cast
import warnings
from argparse import ArgumentParser

Expand All @@ -22,13 +24,15 @@
ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')


def escape(s):
def escape(s: Union[str, bytes]) -> str:
if isinstance(s, bytes):
s = str(s, 'latin-1')
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), s)
us = str(s, 'latin-1')
else:
us = s
return ESC_PAT.sub(lambda m: '&#%d;' % ord(m.group(0)), us)


def dumpxml(out, obj, codec=None):
def dumpxml(out: TextIO, obj: Any, codec: Optional[str] = None) -> None:
if obj is None:
out.write('<null />')
return
Expand All @@ -51,15 +55,17 @@ def dumpxml(out, obj, codec=None):
out.write('</list>')
return

if isinstance(obj, ((str,), bytes)):
if isinstance(obj, (str, bytes)):
out.write('<string size="%d">%s</string>' % (len(obj), escape(obj)))
return

if isinstance(obj, PDFStream):
if codec == 'raw':
out.write(obj.get_rawdata())
# Likely bug: writing bytes to text I/O.
out.write(obj.get_rawdata()) # type: ignore [arg-type]
elif codec == 'binary':
out.write(obj.get_data())
# Likely bug: writing bytes to text I/O.
out.write(obj.get_data()) # type: ignore [arg-type]
else:
out.write('<stream>\n<props>\n')
dumpxml(out, obj.attrs)
Expand All @@ -76,11 +82,15 @@ def dumpxml(out, obj, codec=None):
return

if isinstance(obj, PSKeyword):
out.write('<keyword>%s</keyword>' % obj.name)
# Likely bug: obj.name is bytes, not str
out.write('<keyword>%s</keyword>'
% obj.name) # type: ignore [str-bytes-safe]
return

if isinstance(obj, PSLiteral):
out.write('<literal>%s</literal>' % obj.name)
# Likely bug: obj.name may be bytes, not str
out.write('<literal>%s</literal>'
% obj.name) # type: ignore [str-bytes-safe]
return

if isnumber(obj):
Expand All @@ -90,11 +100,12 @@ def dumpxml(out, obj, codec=None):
raise TypeError(obj)


def dumptrailers(out, doc, show_fallback_xref=False):
def dumptrailers(out: TextIO, doc: PDFDocument,
show_fallback_xref: bool = False) -> None:
for xref in doc.xrefs:
if not isinstance(xref, PDFXRefFallback) or show_fallback_xref:
out.write('<trailer>\n')
dumpxml(out, xref.trailer)
dumpxml(out, xref.get_trailer())
out.write('\n</trailer>\n\n')
no_xrefs = all(isinstance(xref, PDFXRefFallback) for xref in doc.xrefs)
if no_xrefs and not show_fallback_xref:
Expand All @@ -105,7 +116,8 @@ def dumptrailers(out, doc, show_fallback_xref=False):
return


def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
def dumpallobjs(out: TextIO, doc: PDFDocument, codec: Optional[str] = None,
show_fallback_xref: bool = False) -> None:
visited = set()
out.write('<pdf>')
for xref in doc.xrefs:
Expand All @@ -127,15 +139,17 @@ def dumpallobjs(out, doc, codec=None, show_fallback_xref=False):
return


def dumpoutline(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None):
def dumpoutline(outfp: TextIO, fname: str, objids: Any,
pagenos: Container[int], password: str = '',
dumpall: bool = False, codec: Optional[str] = None,
extractdir: Optional[str] = None) -> None:
fp = open(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
pages = {page.pageid: pageno for (pageno, page)
in enumerate(PDFPage.create_pages(doc), 1)}

def resolve_dest(dest):
def resolve_dest(dest: Any) -> Any:
if isinstance(dest, (str, bytes)):
dest = resolve1(doc.get_dest(dest))
elif isinstance(dest, PSLiteral):
Expand Down Expand Up @@ -183,10 +197,10 @@ def resolve_dest(dest):
LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')


def extractembedded(outfp, fname, objids, pagenos, password='',
dumpall=False, codec=None, extractdir=None):
def extract1(objid, obj):
filename = os.path.basename(obj.get('UF') or obj.get('F').decode())
def extractembedded(fname: str, password: str, extractdir: str) -> None:
def extract1(objid: int, obj: Dict[str, Any]) -> None:
filename = os.path.basename(obj.get('UF') or
cast(bytes, obj.get('F')).decode())
fileref = obj['EF'].get('UF') or obj['EF'].get('F')
fileobj = doc.getobj(fileref.objid)
if not isinstance(fileobj, PDFStream):
Expand Down Expand Up @@ -221,8 +235,10 @@ def extract1(objid, obj):
return


def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
codec=None, extractdir=None, show_fallback_xref=False):
def dumppdf(outfp: TextIO, fname: str, objids: Iterable[int],
pagenos: Container[int], password: str = '', dumpall: bool = False,
codec: Optional[str] = None, extractdir: Optional[str] = None,
show_fallback_xref: bool = False) -> None:
fp = open(fname, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser, password)
Expand All @@ -249,7 +265,7 @@ def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
return


def create_parser():
def create_parser() -> ArgumentParser:
parser = ArgumentParser(description=__doc__, add_help=True)
parser.add_argument('files', type=str, default=None, nargs='+',
help='One or more paths to PDF files.')
Expand Down Expand Up @@ -313,7 +329,7 @@ def create_parser():
return parser


def main(argv=None):
def main(argv: Optional[List[str]] = None) -> None:
parser = create_parser()
args = parser.parse_args(args=argv)

Expand All @@ -340,7 +356,7 @@ def main(argv=None):
password = args.password

if args.raw_stream:
codec = 'raw'
codec: Optional[str] = 'raw'
elif args.binary_stream:
codec = 'binary'
elif args.text_stream:
Expand All @@ -356,8 +372,7 @@ def main(argv=None):
)
elif args.extract_embedded:
extractembedded(
outfp, fname, objids, pagenos, password=password,
dumpall=args.all, codec=codec, extractdir=args.extract_embedded
fname, password=password, extractdir=args.extract_embedded
)
else:
dumppdf(
Expand All @@ -370,4 +385,4 @@ def main(argv=None):


if __name__ == '__main__':
sys.exit(main()) # type: ignore[no-untyped-call]
main()

0 comments on commit 45d2ce9

Please sign in to comment.