pdfminer · pietermarsman · Jun 28, 2024 · Mar 5, 2024 · Mar 10, 2024 · Mar 10, 2024
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
@@ -0,0 +1,39 @@
+name: CIFuzz
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+permissions: {}  # no default token permissions; the job below opts in explicitly
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    permissions:
+      security-events: write  # needed by the SARIF upload step
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfminersix'
+        language: python
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'pdfminersix'
+        language: python
+        fuzz-seconds: 800
+        output-sarif: true
+    - name: Upload Crash
+      uses: actions/upload-artifact@v4  # v3 is retired and no longer runs
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
+    - name: Upload Sarif
+      if: always() && steps.build.outcome == 'success'
+      uses: github/codeql-action/upload-sarif@v3  # v2 is deprecated
+      with:
+        # Path to SARIF file relative to the root of the repository
+        sarif_file: cifuzz-sarif/results.sarif
+        checkout_path: cifuzz-sarif
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 
 - Support for zipped jpeg's ([#938](https://github.com/pdfminer/pdfminer.six/pull/938))
+
+- Fuzzing harnesses for integration into Google's OSS-Fuzz ([#949](https://github.com/pdfminer/pdfminer.six/pull/949))
 - Support for setuptools-git-versioning version 2.0.0 ([#957](https://github.com/pdfminer/pdfminer.six/pull/957))
 
 ### Fixed

diff --git a/fuzzing/__init__.py b/fuzzing/__init__.py
diff --git a/fuzzing/build.sh b/fuzzing/build.sh
@@ -0,0 +1,10 @@
+cd "$SRC"/pdfminer.six || exit 1
+pip3 install '.[dev]'
+
+# Build fuzzers in $OUT; OSS-Fuzz only picks up a seed corpus named <target>_seed_corpus.zip
+for fuzzer in $(find fuzzing -name '*_fuzzer.py');do
+  compile_python_fuzzer "$fuzzer" --collect-all charset_normalizer --hidden-import=_cffi_backend
+  base_name=$(basename "$fuzzer")
+  base_name_no_ext=${base_name%.*}
+  zip -q "$OUT/${base_name_no_ext}_seed_corpus.zip" "$SRC"/corpus/*
+done
diff --git a/fuzzing/extract_text_fuzzer.py b/fuzzing/extract_text_fuzzer.py
@@ -0,0 +1,39 @@
+import sys
+
+import atheris
+
+from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider
+
+with atheris.instrument_imports():
+    from fuzzing.utils import (
+        prepare_pdfminer_fuzzing,
+        is_valid_byte_stream,
+        generate_layout_parameters,
+    )
+    from pdfminer.high_level import extract_text
+
+from pdfminer.psexceptions import PSException
+
+
+def fuzz_one_input(data: bytes) -> None:
+    if not is_valid_byte_stream(data):
+        # Not worth continuing with this test case
+        return
+
+    fdp = PdfminerFuzzedDataProvider(data)
+
+    try:
+        extract_text(
+            fdp.ConsumeMemoryFile(),
+            maxpages=fdp.ConsumeIntInRange(0, 10),
+            page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
+            laparams=generate_layout_parameters(fdp),
+        )
+    except (AssertionError, PSException):
+        return
+
+
+if __name__ == "__main__":
+    prepare_pdfminer_fuzzing()  # raise the "pdfminer" logger level to CRITICAL
+    atheris.Setup(sys.argv, fuzz_one_input)  # register the per-input callback
+    atheris.Fuzz()
diff --git a/fuzzing/extract_text_to_fp_fuzzer.py b/fuzzing/extract_text_to_fp_fuzzer.py
@@ -0,0 +1,49 @@
+import io
+import sys
+
+import atheris
+
+from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider
+
+with atheris.instrument_imports():
+    from fuzzing.utils import (
+        prepare_pdfminer_fuzzing,
+        is_valid_byte_stream,
+        generate_layout_parameters,
+    )
+    from pdfminer.high_level import extract_text_to_fp
+    from pdfminer.psexceptions import PSException
+
+available_output_formats = ["text", "html", "xml", "tag"]  # output_type choices
+available_layout_modes = ["exact", "normal", "loose"]  # layoutmode choices
+
+
+def fuzz_one_input(data: bytes) -> None:
+    """Run extract_text_to_fp with input bytes and every option drawn from data."""
+    if not is_valid_byte_stream(data):
+        return  # Not worth continuing with this test case
+
+    fdp = PdfminerFuzzedDataProvider(data)
+
+    try:
+        with fdp.ConsumeMemoryFile(all_data=False) as f_in, io.BytesIO() as f_out:
+            extract_text_to_fp(
+                f_in,
+                f_out,
+                output_type=fdp.PickValueInList(available_output_formats),
+                laparams=generate_layout_parameters(fdp),
+                maxpages=fdp.ConsumeIntInRange(0, 10),
+                page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
+                scale=fdp.ConsumeFloatInRange(0.0, 2.0),
+                rotation=fdp.ConsumeIntInRange(0, 360),
+                layoutmode=fdp.PickValueInList(available_layout_modes),
+                strip_control=fdp.ConsumeBool(),
+            )
+    except (AssertionError, PSException):
+        return
+
+
+if __name__ == "__main__":
+    prepare_pdfminer_fuzzing()  # raise the "pdfminer" logger level to CRITICAL
+    atheris.Setup(sys.argv, fuzz_one_input)  # register the per-input callback
+    atheris.Fuzz()
diff --git a/fuzzing/fuzzed_data_provider.py b/fuzzing/fuzzed_data_provider.py
@@ -0,0 +1,34 @@
+import io
+from typing import List, Optional
+
+from atheris import FuzzedDataProvider
+
+
+class PdfminerFuzzedDataProvider(FuzzedDataProvider):  # type: ignore[misc]
+    def ConsumeRandomBytes(self) -> bytes:
+        int_range = self.ConsumeIntInRange(0, self.remaining_bytes())
+        return bytes(self.ConsumeBytes(int_range))
+
+    def ConsumeRandomString(self) -> str:
+        int_range = self.ConsumeIntInRange(0, self.remaining_bytes())
+        return str(self.ConsumeUnicodeNoSurrogates(int_range))
+
+    def ConsumeRemainingString(self) -> str:
+        return str(self.ConsumeUnicodeNoSurrogates(self.remaining_bytes()))
+
+    def ConsumeRemainingBytes(self) -> bytes:
+        return bytes(self.ConsumeBytes(self.remaining_bytes()))
+
+    def ConsumeMemoryFile(self, all_data: bool = False) -> io.BytesIO:
+        if all_data:
+            return io.BytesIO(self.ConsumeRemainingBytes())
+        else:
+            return io.BytesIO(self.ConsumeRandomBytes())
+
+    def ConsumeOptionalIntList(
+        self, max_count: int, min: int, max: int
+    ) -> Optional[List[int]]:
+        if self.ConsumeBool():
+            count = self.ConsumeIntInRange(0, max_count)
+            return [int(i) for i in self.ConsumeIntListInRange(count, min, max)]
+        return None
diff --git a/fuzzing/page_extraction_fuzzer.py b/fuzzing/page_extraction_fuzzer.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+import atheris
+import sys
+
+from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider
+
+with atheris.instrument_imports():
+    from fuzzing.utils import (
+        prepare_pdfminer_fuzzing,
+        is_valid_byte_stream,
+        generate_layout_parameters,
+    )
+    from pdfminer.high_level import extract_pages
+    from pdfminer.psexceptions import PSException
+
+
+def fuzz_one_input(data: bytes) -> None:
+    """Run extract_pages on an in-memory file and options drawn from data."""
+    if not is_valid_byte_stream(data):
+        return  # Not worth continuing with this test case
+
+    fdp = PdfminerFuzzedDataProvider(data)
+
+    try:
+        with fdp.ConsumeMemoryFile() as f:
+            list(  # consume everything extract_pages yields
+                extract_pages(
+                    f,
+                    maxpages=fdp.ConsumeIntInRange(0, 10),
+                    page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
+                    laparams=generate_layout_parameters(fdp),
+                )
+            )
+    except (AssertionError, PSException):
+        return
+
+
+if __name__ == "__main__":
+    prepare_pdfminer_fuzzing()  # raise the "pdfminer" logger level to CRITICAL
+    atheris.Setup(sys.argv, fuzz_one_input)  # register the per-input callback
+    atheris.Fuzz()
diff --git a/fuzzing/utils.py b/fuzzing/utils.py
@@ -0,0 +1,53 @@
+"""
+Utilities shared across the various PDF fuzzing harnesses
+"""
+import logging
+from typing import Optional
+
+import atheris
+
+from pdfminer.layout import LAParams
+
+PDF_MAGIC_BYTES = b"%PDF-"  # prefix is_valid_byte_stream requires of every input
+
+
+def prepare_pdfminer_fuzzing() -> None:
+    """
+    Quiet pdfminer logging by raising the "pdfminer" logger level to CRITICAL
+    """
+    logging.getLogger("pdfminer").setLevel(logging.CRITICAL)
+
+
+@atheris.instrument_func  # type: ignore[misc]
+def generate_layout_parameters(
+    fdp: atheris.FuzzedDataProvider,
+) -> Optional[LAParams]:
+    if fdp.ConsumeBool():
+        return None
+
+    boxes_flow: Optional[float] = None
+    if fdp.ConsumeBool():
+        boxes_flow = fdp.ConsumeFloatInRange(-1.0, 1.0)
+
+    return LAParams(
+        line_overlap=fdp.ConsumeFloat(),
+        char_margin=fdp.ConsumeFloat(),
+        line_margin=fdp.ConsumeFloat(),
+        word_margin=fdp.ConsumeFloat(),
+        boxes_flow=boxes_flow,
+        detect_vertical=fdp.ConsumeBool(),
+        all_texts=fdp.ConsumeBool(),
+    )
+
+
+@atheris.instrument_func  # type: ignore[misc]
+def is_valid_byte_stream(data: bytes) -> bool:
+    """Quick check to see if this is worth of passing to atheris
+    :return: Whether the byte-stream passes the basic checks
+    """
+    if not data.startswith(PDF_MAGIC_BYTES):
+        return False
+    if b"/Root" not in data:
+        return False
+
+    return True
diff --git a/mypy.ini b/mypy.ini
@@ -30,4 +30,7 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 
 [mypy-charset_normalizer.*]
-ignore_missing_imports = True
+ignore_missing_imports = True
+
+[mypy-atheris.*]
+ignore_missing_imports = True
diff --git a/noxfile.py b/noxfile.py
@@ -4,7 +4,7 @@
 
 
 PYTHON_ALL_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
-PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"]
+PYTHON_MODULES = ["fuzzing", "pdfminer", "tools", "tests", "noxfile.py", "setup.py"]
 
 
 @nox.session

diff --git a/pdfminer/_saslprep.py b/pdfminer/_saslprep.py
@@ -24,6 +24,8 @@
 from typing import Callable, Tuple
 import unicodedata
 
+from .pdfexceptions import PDFValueError
+
 # RFC4013 section 2.3 prohibited output.
 _PROHIBITED: Tuple[Callable[[str], bool], ...] = (
     # A strict reading of RFC 4013 requires table c12 here, but
@@ -77,7 +79,7 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
             # RFC3454, Section 6, #3. If a string contains any
             # RandALCat character, the first and last characters
             # MUST be RandALCat characters.
-            raise ValueError("SASLprep: failed bidirectional check")
+            raise PDFValueError("SASLprep: failed bidirectional check")
         # RFC3454, Section 6, #2. If a string contains any RandALCat
         # character, it MUST NOT contain any LCat character.
         prohibited = prohibited + (stringprep.in_table_d2,)
@@ -90,6 +92,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
     # RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
     for char in data:
         if any(in_table(char) for in_table in prohibited):
-            raise ValueError("SASLprep: failed prohibited character check")
+            raise PDFValueError("SASLprep: failed prohibited character check")
 
     return data
diff --git a/pdfminer/ccitt.py b/pdfminer/ccitt.py
@@ -25,6 +25,8 @@
     cast,
 )
 
+from .pdfexceptions import PDFException, PDFValueError
+
 
 def get_bytes(data: bytes) -> Iterator[int]:
     yield from data
@@ -331,13 +333,16 @@ class CCITTG4Parser(BitParser):
     BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
     BitParser.add(UNCOMPRESSED, "T10000", "00000000010")
 
-    class EOFB(Exception):
+    class CCITTException(PDFException):
+        pass
+
+    class EOFB(CCITTException):
         pass
 
-    class InvalidData(Exception):
+    class InvalidData(CCITTException):
         pass
 
-    class ByteSkip(Exception):
+    class ByteSkip(CCITTException):
         pass
 
     _color: int
@@ -584,7 +589,7 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
         reversed = cast(bool, params.get("BlackIs1"))
         parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
     else:
-        raise ValueError(K)
+        raise PDFValueError(K)
     parser.feedbytes(data)
     return parser.close()
 

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -32,21 +32,21 @@
     Set,
 )
 
+from pdfminer.pdfexceptions import PDFException, PDFTypeError
 from .encodingdb import name2unicode
 from .psparser import KWD
-from .psparser import PSEOF
+from pdfminer.psexceptions import PSEOF, PSSyntaxError
 from .psparser import PSKeyword
 from .psparser import PSLiteral
 from .psparser import PSStackParser
-from .psparser import PSSyntaxError
 from .psparser import literal_name
 from .utils import choplist
 from .utils import nunpack
 
 log = logging.getLogger(__name__)
 
 
-class CMapError(Exception):
+class CMapError(PDFException):
     pass
 
 
@@ -202,7 +202,7 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
         elif isinstance(code, int):
             unichr = chr(code)
         else:
-            raise TypeError(code)
+            raise PDFTypeError(code)
 
         # A0 = non-breaking space, some weird fonts can have a collision on a cid here.
         if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ":