pdfminer · pietermarsman · Jul 11, 2020 · Jul 3, 2020 · Jul 3, 2020 · Jul 3, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,7 +7,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## Changed
 - Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
-
+- Changed the `Text extraction is not allowed` error to a warning by default; this fixes [#350](https://github.com/pdfminer/pdfminer.six/issues/350)
+
 ## [20200517]
 
 ### Added

diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py
@@ -80,8 +80,7 @@ def extract_text_to_fp(inf, outfp, output_type='text', codec='utf-8',
                                   page_numbers,
                                   maxpages=maxpages,
                                   password=password,
-                                  caching=not disable_caching,
-                                  check_extractable=True):
+                                  caching=not disable_caching):
         page.rotate = (page.rotate + rotation) % 360
         interpreter.process_page(page)
 
@@ -118,7 +117,6 @@ def extract_text(pdf_file, password='', page_numbers=None, maxpages=0,
                 maxpages=maxpages,
                 password=password,
                 caching=caching,
-                check_extractable=True,
         ):
             interpreter.process_page(page)
 

diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
@@ -44,7 +44,11 @@ class PDFPasswordIncorrect(PDFEncryptionError):
     pass
 
 
-class PDFTextExtractionNotAllowed(PDFEncryptionError):
+class PDFTextExtractionNotAllowedWarning(UserWarning):
+    pass
+
+
+class PDFTextExtractionNotAllowedError(PDFEncryptionError):
     pass
 
 

diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py
@@ -1,4 +1,5 @@
 import logging
+import warnings
 from . import settings
 from .psparser import LIT
 from .pdftypes import PDFObjectNotFound
@@ -8,7 +9,8 @@
 from .pdftypes import dict_value
 from .pdfparser import PDFParser
 from .pdfdocument import PDFDocument
-from .pdfdocument import PDFTextExtractionNotAllowed
+from .pdfdocument import PDFTextExtractionNotAllowedWarning
+from .pdfdocument import PDFTextExtractionNotAllowedError
 
 
 log = logging.getLogger(__name__)
@@ -120,15 +122,23 @@ def search(obj, parent):
     @classmethod
     def get_pages(cls, fp,
                   pagenos=None, maxpages=0, password='',
-                  caching=True, check_extractable=True):
+                  caching=True, check_extractable=False):
         # Create a PDF parser object associated with the file object.
         parser = PDFParser(fp)
         # Create a PDF document object that stores the document structure.
         doc = PDFDocument(parser, password=password, caching=caching)
-        # Check if the document allows text extraction. If not, abort.
-        if check_extractable and not doc.is_extractable:
-            error_msg = 'Text extraction is not allowed: %r' % fp
-            raise PDFTextExtractionNotAllowed(error_msg)
+        # Check if the document allows text extraction. If not, raise an
+        # error when check_extractable is set; otherwise warn and proceed.
+        if not doc.is_extractable:
+            if check_extractable:
+                error_msg = 'Text extraction is not allowed: %r' % fp
+                raise PDFTextExtractionNotAllowedError(error_msg)
+            else:
+                warning_msg = 'The PDF %r contains a metadata field '\
+                            'indicating that it should not allow '   \
+                            'text extraction. Ignoring this field '  \
+                            'and proceeding.' % fp
+                warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
         # Process each page contained in the document.
         for (pageno, page) in enumerate(cls.create_pages(doc)):
             if pagenos and (pageno not in pagenos):