smalot · k00ni · Feb 26, 2024 · Feb 14, 2024 · Feb 14, 2024 · Feb 14, 2024
diff --git a/samples/bugs/Issue668.pdf b/samples/bugs/Issue668.pdf
diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php
@@ -214,6 +214,16 @@ private function formatContent(?string $content): string
             return '';
         }
 
+        // Outside of (String) content in PDF document streams, all
+        // text should conform to UTF-8. Test for binary content by
+        // deleting the first open parenthesis "(", which indicates the
+        // beginning of a string, and everything after it. Then test
+        // what remains for valid UTF-8. If it's not UTF-8, return an
+        // empty string as this $content is most likely binary.
+        if (!mb_check_encoding(preg_replace('/\(.*$/s', '', $content), 'UTF-8')) {
+            return '';
+        }
+
         // Find all strings () and replace them so they aren't affected
         // by the next steps
         $pdfstrings = [];
@@ -261,17 +271,6 @@ private function formatContent(?string $content): string
             );
         }
 
-        // Now that all strings and dictionaries are hidden, the only
-        // PDF commands left should all be plain text.
-        // Detect text encoding of the current string to prevent reading
-        // content streams that are images, etc. This prevents PHP
-        // error messages when JPEG content is sent to this function
-        // by the sample file '12249.pdf' from:
-        // https://github.com/smalot/pdfparser/issues/458
-        if (false === mb_detect_encoding($content, null, true)) {
-            return '';
-        }
-
         // Normalize white-space in the document stream
         $content = preg_replace('/\s{2,}/', ' ', $content);
 

diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php
@@ -274,6 +274,16 @@ public function testFormatContent(): void
         $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content);
 
         $this->assertEquals('', $cleaned);
+
+        // See: https://github.com/smalot/pdfparser/issues/668
+        $filename = $this->rootDir.'/samples/bugs/Issue668.pdf';
+
+        $parser = $this->getParserInstance();
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+
+        // Binary check is done before a regexp that causes an error
+        $this->assertStringContainsString('Marko Nestorović PR', $pages[0]->getText());
     }
 
     public function testGetSectionsText(): void