diff --git a/samples/bugs/Issue585.pdf b/samples/bugs/Issue585.pdf new file mode 100644 index 00000000..b282c108 Binary files /dev/null and b/samples/bugs/Issue585.pdf differ diff --git a/samples/bugs/Issue608.pdf b/samples/bugs/Issue608.pdf new file mode 100644 index 00000000..6e1e9f58 Binary files /dev/null and b/samples/bugs/Issue608.pdf differ diff --git a/samples/Issue609.pdf b/samples/bugs/Issue609.pdf similarity index 100% rename from samples/Issue609.pdf rename to samples/bugs/Issue609.pdf diff --git a/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf b/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf new file mode 100644 index 00000000..55a681d1 Binary files /dev/null and b/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf differ diff --git a/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf b/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf new file mode 100644 index 00000000..ad2a0e2d Binary files /dev/null and b/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf differ diff --git a/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf b/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf new file mode 100644 index 00000000..8557fe5b Binary files /dev/null and b/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf differ diff --git "a/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_i\303\247in_Acrobat_PDFMaker_17.pdf" "b/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_i\303\247in_Acrobat_PDFMaker_17.pdf" new file mode 100644 index 00000000..d2c40e1d Binary files /dev/null and "b/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_i\303\247in_Acrobat_PDFMaker_17.pdf" differ diff --git a/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf 
b/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf new file mode 100644 index 00000000..1220fe85 Binary files /dev/null and b/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf differ diff --git a/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf b/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf new file mode 100644 index 00000000..1ac0eb3c Binary files /dev/null and b/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf differ diff --git a/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf new file mode 100644 index 00000000..f71ceeac Binary files /dev/null and b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf differ diff --git a/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf new file mode 100644 index 00000000..287b9476 Binary files /dev/null and b/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf differ diff --git a/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf b/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf new file mode 100644 index 00000000..d0de53c4 Binary files /dev/null and b/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf differ diff --git a/src/Smalot/PdfParser/Font.php b/src/Smalot/PdfParser/Font.php index 2e05cff5..378f8109 100644 --- a/src/Smalot/PdfParser/Font.php +++ b/src/Smalot/PdfParser/Font.php @@ -386,7 +386,7 @@ public static function decodeEntities(string $text): string */ public static function decodeUnicode(string $text): string { - if (preg_match('/^\xFE\xFF/i', $text)) { + if ("\xFE\xFF" === 
substr($text, 0, 2)) { // Strip U+FEFF byte order marker. $decode = substr($text, 2); $text = ''; @@ -411,16 +411,17 @@ protected function getFontSpaceLimit(): int /** * Decode text by commands array. */ - public function decodeText(array $commands): string + public function decodeText(array $commands, float $fontFactor = 4): string { $word_position = 0; $words = []; - $font_space = $this->getFontSpaceLimit(); + $font_space = $this->getFontSpaceLimit() * abs($fontFactor) / 4; foreach ($commands as $command) { switch ($command[PDFObject::TYPE]) { case 'n': - if ((float) trim($command[PDFObject::COMMAND]) < $font_space) { + $offset = (float) trim($command[PDFObject::COMMAND]); + if ($offset - (float) $font_space < 0) { $word_position = \count($words); } continue 2; @@ -451,9 +452,32 @@ public function decodeText(array $commands): string foreach ($words as &$word) { $word = $this->decodeContent($word); + $word = str_replace("\t", ' ', $word); } - return implode(' ', $words); + // Remove internal "words" that are just spaces, but leave them + // if they are at either end of the array of words. This fixes, + // for example, lines that are justified to fill + // a whole row. + for ($x = \count($words) - 2; $x >= 1; --$x) { + if ('' === trim($words[$x], ' ')) { + unset($words[$x]); + } + } + $words = array_values($words); + + // Cut down on the number of unnecessary internal spaces by + // imploding the string on the null byte, and checking if the + // text includes extra spaces on either side. If so, merge + // where appropriate. 
+ $words = implode("\x00\x00", $words); + $words = str_replace( + [" \x00\x00 ", "\x00\x00 ", " \x00\x00", "\x00\x00"], + [' ', ' ', ' ', ' '], + $words + ); + + return $words; } /** @@ -463,6 +487,12 @@ public function decodeText(array $commands): string */ public function decodeContent(string $text, bool &$unicode = null): string { + // If this string begins with a UTF-16BE BOM, then decode it + // directly as Unicode + if ("\xFE\xFF" === substr($text, 0, 2)) { + return $this->decodeUnicode($text); + } + if ($this->has('ToUnicode')) { return $this->decodeContentByToUnicodeCMapOrDescendantFonts($text); } diff --git a/src/Smalot/PdfParser/PDFObject.php b/src/Smalot/PdfParser/PDFObject.php index 798b84d0..30901ecb 100644 --- a/src/Smalot/PdfParser/PDFObject.php +++ b/src/Smalot/PdfParser/PDFObject.php @@ -73,6 +73,11 @@ class PDFObject */ protected $config; + /** + * @var bool + */ + protected $addPositionWhitespace = false; + public function __construct( Document $document, Header $header = null, @@ -127,6 +132,16 @@ public function getContent(): ?string return $this->content; } + /** + * Creates a duplicate of the document stream with + * strings and other items replaced by $char. Formerly + * getSectionsText() used this output to more easily gather offset + * values to extract text from the *actual* document stream. + * + * @deprecated function is no longer used and will be removed in a future release + * + * @internal + */ public function cleanContent(string $content, string $char = 'X') { $char = $char[0]; @@ -186,41 +201,204 @@ public function cleanContent(string $content, string $char = 'X') return $content; } + /** + * Takes a string of PDF document stream text and formats + * it into a multi-line string with one PDF command on each line, + * separated by \r\n. If the given string is null, or binary data + * is detected instead of a document stream then return an empty + * string. 
+ */ + private function formatContent(?string $content): string + { + if (null === $content) { + return ''; + } + + // Find all strings () and replace them so they aren't affected + // by the next steps + $pdfstrings = []; + $attempt = '('; + while (preg_match('/'.preg_quote($attempt, '/').'.*?(?> commands and replace them so they + // aren't affected by the next steps + $dictstore = []; + while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/', $content, $dicttext)) { + $dictid = uniqid('DICT_', true); + $dictstore[$dictid] = $dicttext[1]; + $content = preg_replace( + '/'.preg_quote($dicttext[0], '/').'/', + ' ###'.$dictid.'###'.$dicttext[2], + $content, + 1 + ); + } + + // Now that all strings and dictionaries are hidden, the only + // PDF commands left should all be plain text. + // Detect text encoding of the current string to prevent reading + // content streams that are images, etc. This prevents PHP + // error messages when JPEG content is sent to this function + // by the sample file '12249.pdf' from: + // https://github.com/smalot/pdfparser/issues/458 + if (false === mb_detect_encoding($content, null, true)) { + return ''; + } + + // Normalize white-space in the document stream + $content = preg_replace('/\s{2,}/', ' ', $content); + + // Find all valid PDF operators and add \r\n after each; this + // ensures there is just one command on every line + // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A + // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A + // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while + // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions + // appear here in the list for completeness. 
+ $operators = [ + 'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS', + 'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs', + 'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n', + 'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC', + 'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw', + 'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"', + ]; + foreach ($operators as $operator) { + $content = preg_replace( + '/(?> commands + $dictstore = array_reverse($dictstore, true); + foreach ($dictstore as $id => $dict) { + $content = str_replace('###'.$id.'###', $dict, $content); + } + + // Restore the original string content + $pdfstrings = array_reverse($pdfstrings, true); + foreach ($pdfstrings as $id => $text) { + // Strings may contain escaped newlines, or literal newlines + // and we should clean these up before replacing the string + // back into the content stream; this ensures no strings are + // split between two lines (every command must be on one line) + $text = str_replace( + ["\\\r\n", "\\\r", "\\\n", "\r", "\n"], + ['', '', '', '\r', '\n'], + $text + ); + + $content = str_replace('@@@'.$id.'@@@', $text, $content); + } + + $content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content)); + + return $content; + } + + /** + * getSectionsText() now takes an entire, unformatted + * document stream as a string, cleans it, then filters out + * commands that aren't needed for text positioning/extraction. It + * returns an array of unprocessed PDF commands, one command per + * element. + * + * @internal + */ public function getSectionsText(?string $content): array { $sections = []; - $content = ' '.$content.' '; - $textCleaned = $this->cleanContent($content, '_'); - - // Extract text blocks. 
- if (preg_match_all('/(\sQ)?\s+BT[\s|\(|\[]+(.*?)\s*ET(\sq)?/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) { - foreach ($matches[2] as $pos => $part) { - $text = $part[0]; - if ('' === $text) { - continue; - } - $offset = $part[1]; - $section = substr($content, $offset, \strlen($text)); - // Removes BDC and EMC markup. - $section = preg_replace('/(\/[A-Za-z0-9]+\s*<<.*?)(>>\s*BDC)(.*?)(EMC\s+)/s', '${3}', $section.' '); + // A cleaned stream has one command on every line, so split the + // cleaned stream content on \r\n into an array + $textCleaned = preg_split( + '/(\r\n|\n|\r)/', + $this->formatContent($content), + -1, + \PREG_SPLIT_NO_EMPTY + ); - // Add Q and q flags if detected around BT/ET. - // @see: https://github.com/smalot/pdfparser/issues/387 - $section = trim((!empty($matches[1][$pos][0]) ? "Q\n" : '').$section).(!empty($matches[3][$pos][0]) ? "\nq" : ''); + $inTextBlock = false; + foreach ($textCleaned as $line) { + $line = trim($line); - $sections[] = $section; + // Skip empty lines + if ('' === $line) { + continue; } - } - - // Extract 'do' commands. - if (preg_match_all('/(\/[A-Za-z0-9\.\-_]+\s+Do)\s/s', $textCleaned, $matches, \PREG_OFFSET_CAPTURE)) { - foreach ($matches[1] as $part) { - $text = $part[0]; - $offset = $part[1]; - $section = substr($content, $offset, \strlen($text)); - $sections[] = $section; + // If a 'BT' is encountered, set the $inTextBlock flag + if (preg_match('/BT$/', $line)) { + $inTextBlock = true; + $sections[] = $line; + + // If an 'ET' is encountered, unset the $inTextBlock flag + } elseif ('ET' == $line) { + $inTextBlock = false; + $sections[] = $line; + } elseif ($inTextBlock) { + // If we are inside a BT ... ET text block, save all lines + $sections[] = trim($line); + } else { + // Otherwise, if we are outside of a text block, only + // save specific, necessary lines. Care should be taken + // to ensure a command being checked for *only* matches + // that command. 
For instance, a simple search for 'c' + // may also match the 'sc' command. See the command + // list in the formatContent() method above. + // Add more commands to save here as you find them in + // weird PDFs! + if ('q' == $line[-1] || 'Q' == $line[-1]) { + // Save and restore graphics state commands + $sections[] = $line; + } elseif (preg_match('/(?> $command */ - private function getTJUsingFontFallback(Font $font, array $command, Page $page = null): string + private function getTJUsingFontFallback(Font $font, array $command, Page $page = null, float $fontFactor = 4): string { - $orig_text = $font->decodeText($command); + $orig_text = $font->decodeText($command, $fontFactor); $text = $orig_text; // If we make this a Config option, we can add a check if it's @@ -262,8 +446,8 @@ private function getTJUsingFontFallback(Font $font, array $command, Page $page = // If the decoded text contains UTF-8 control characters // then the font page being used is probably the wrong one. // Loop through the rest of the fonts to see if we can get - // a good decode. - while (preg_match('/[\x00-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) { + // a good decode. Allow x09 to x0d which are whitespace. 
+ while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) { // If we're out of font IDs, then give up and use the // original string if (0 == \count($font_ids)) { @@ -272,7 +456,7 @@ private function getTJUsingFontFallback(Font $font, array $command, Page $page = // Try the next font ID $font = $page->getFont(array_shift($font_ids)); - $text = $font->decodeText($command); + $text = $font->decodeText($command, $fontFactor); } } @@ -280,148 +464,223 @@ private function getTJUsingFontFallback(Font $font, array $command, Page $page = } /** + * Expects a string that is a full PDF dictionary object, + * including the outer enclosing << >> angle brackets + * + * @internal + * * @throws \Exception */ - public function getText(Page $page = null): string + public function parseDictionary(string $dictionary): array { - $result = ''; - $sections = $this->getSectionsText($this->content); - $current_font = $this->getDefaultFont($page); - $clipped_font = $current_font; + // Normalize whitespace + $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary)); - $current_position_td = ['x' => false, 'y' => false]; - $current_position_tm = ['x' => false, 'y' => false]; + if ('<<' != substr($dictionary, 0, 2)) { + throw new \Exception('Not a valid dictionary object.'); + } - self::$recursionStack[] = $this->getUniqueId(); + $parsed = []; + $stack = []; + $currentName = ''; + $arrayTypeNumeric = false; + + // Remove outer layer of dictionary, and split on tokens + $split = preg_split( + '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/', + trim(preg_replace('/^<<|>>$/', '', $dictionary)), + -1, + \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE + ); - foreach ($sections as $section) { - $commands = $this->getCommandsText($section); - $reverse_text = false; - $text = ''; + foreach ($split as $token) { + $token = trim($token); + switch ($token) { + case '': + break; - foreach ($commands as $command) { - switch ($command[self::OPERATOR]) 
{ - case 'BMC': - if ('ReversedChars' == $command[self::COMMAND]) { - $reverse_text = true; - } - break; + // Open numeric array + case '[': + $parsed[$currentName] = []; + $arrayTypeNumeric = true; - // set character spacing - case 'Tc': - break; + // Move up one level in the stack + $stack[\count($stack)] = &$parsed; + $parsed = &$parsed[$currentName]; + $currentName = ''; + break; - // move text current point - case 'Td': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if (((float) $x <= 0) - || (false !== $current_position_td['y'] && (float) $y < (float) $current_position_td['y']) - ) { - // vertical offset - $text .= "\n"; - } elseif (false !== $current_position_td['x'] && (float) $x > (float) - $current_position_td['x'] - ) { - $text .= $this->config->getHorizontalOffset(); - } - $current_position_td = ['x' => $x, 'y' => $y]; - break; + // Open hashed array + case '<<': + $parsed[$currentName] = []; + $arrayTypeNumeric = false; - // move text current point and set leading - case 'TD': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if ((float) $y < 0) { - $text .= "\n"; - } elseif ((float) $x <= 0) { - $text .= ' '; - } - break; + // Move up one level in the stack + $stack[\count($stack)] = &$parsed; + $parsed = &$parsed[$currentName]; + $currentName = ''; + break; - case 'Tf': - list($id) = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim($id, '/'); - if (null !== $page) { - $new_font = $page->getFont($id); - // If an invalid font ID is given, do not update the font. - // This should theoretically never happen, as the PDF spec states for the Tf operator: - // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" - // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) - // But we want to make sure that malformed PDFs do not simply crash. 
- if (null !== $new_font) { - $current_font = $new_font; - } + // Close numeric array + case ']': + // Revert string type arrays back to a single element + if (\is_array($parsed) && 1 == \count($parsed) + && isset($parsed[0]) && \is_string($parsed[0]) + && '' !== $parsed[0] && '/' != $parsed[0][0]) { + $parsed = '['.$parsed[0].']'; + } + // Close hashed array + // no break + case '>>': + $arrayTypeNumeric = false; + + // Move down one level in the stack + $parsed = &$stack[\count($stack) - 1]; + unset($stack[\count($stack) - 1]); + break; + + default: + // If value begins with a slash, then this is a name + // Add it to the appropriate array + if ('/' == substr($token, 0, 1)) { + $currentName = substr($token, 1); + if (true == $arrayTypeNumeric) { + $parsed[] = $currentName; + $currentName = ''; } - break; + } elseif ('' != $currentName) { + if (false == $arrayTypeNumeric) { + $parsed[$currentName] = $token; + } + $currentName = ''; + } elseif ('' == $currentName) { + $parsed[] = $token; + } + } + } - case 'Q': - // Use clip: restore font. - $current_font = $clipped_font; - break; + return $parsed; + } - case 'q': - // Use clip: save font. - $clipped_font = $current_font; - break; + /** + * Returns the text content of a PDF as a string. Attempts to add + * whitespace for spacing and line-breaks where appropriate. + * + * getText() leverages getTextArray() to get the content + * of the document, setting the addPositionWhitespace flag to true + * so whitespace is inserted in a logical way for reading by + * humans. + */ + public function getText(Page $page = null): string + { + $this->addPositionWhitespace = true; + $result = $this->getTextArray($page); + $this->addPositionWhitespace = false; - case "'": - case 'Tj': - $command[self::COMMAND] = [$command]; - // no break - case 'TJ': - $text .= $this->getTJUsingFontFallback( - $current_font, - $command[self::COMMAND], - $page - ); - break; + return implode('', $result).' 
'; + } - // set leading - case 'TL': - $text .= ' '; + /** + * Returns the text content of a PDF as an array of strings. No + * extra whitespace is inserted besides what is actually encoded in + * the PDF text. + * + * @throws \Exception + */ + public function getTextArray(Page $page = null): array + { + $result = []; + $text = []; + + $marked_stack = []; + $last_written_position = false; + + $sections = $this->getSectionsText($this->content); + $current_font = $this->getDefaultFont($page); + $current_font_size = 1; + $current_text_leading = 0; + + $current_position = ['x' => false, 'y' => false]; + $current_position_tm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + $current_position_td = ['x' => 0, 'y' => 0]; + $current_position_cm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + + $clipped_font = []; + $clipped_position_cm = []; + + self::$recursionStack[] = $this->getUniqueId(); + + foreach ($sections as $section) { + $commands = $this->getCommandsText($section); + foreach ($commands as $command) { + switch ($command[self::OPERATOR]) { + // Begin text object + case 'BT': + // Reset text positioning matrices + $current_position_tm = [ + 'a' => 1, 'b' => 0, 'c' => 0, + 'i' => 0, 'j' => 1, 'k' => 0, + 'x' => 0, 'y' => 0, 'z' => 1, + ]; + $current_position_td = ['x' => 0, 'y' => 0]; + $current_text_leading = 0; break; - case 'Tm': - $args = preg_split('/\s/s', $command[self::COMMAND]); - $y = array_pop($args); - $x = array_pop($args); - if (false !== $current_position_tm['x']) { - $delta = abs((float) $x - (float) $current_position_tm['x']); - if ($delta > 10) { - $text .= "\t"; + // Begin marked content sequence with property list + case 'BDC': + if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) { + $dict = $this->parseDictionary($match[1]); + + // Check for ActualText block + if (isset($dict['ActualText']) && 
\is_string($dict['ActualText']) && '' !== $dict['ActualText']) { + if ('[' == $dict['ActualText'][0]) { + // Simulate a 'TJ' command on the stack + $marked_stack[] = [ + 'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0], + ]; + } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) { + // Simulate a 'Tj' command on the stack + $marked_stack[] = [ + 'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0], + ]; + } } } - if (false !== $current_position_tm['y']) { - $delta = abs((float) $y - (float) $current_position_tm['y']); - if ($delta > 10) { - $text .= "\n"; - } - } - $current_position_tm = ['x' => $x, 'y' => $y]; break; - // set super/subscripting text rise - case 'Ts': - break; + // Begin marked content sequence + case 'BMC': + if ('ReversedChars' == $command[self::COMMAND]) { + // Upon encountering a ReversedChars command, + // add the characters we've built up so far to + // the result array + $result = array_merge($result, $text); - // set word spacing - case 'Tw': - break; + // Start a fresh $text array that will contain + // reversed characters + $text = []; - // set horizontal scaling - case 'Tz': - $text .= "\n"; - break; - - // move to start of next line - case 'T*': - $text .= "\n"; + // Add the reversed text flag to the stack + $marked_stack[] = ['ReversedChars' => true]; + } break; - case 'Da': + // set graphics position matrix + case 'cm': + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $current_position_cm = [ + 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, + 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, + 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, + ]; break; case 'Do': @@ -433,178 +692,257 @@ public function getText(Page $page = null): string // @todo $xobject could be a ElementXRef object, which would then throw an error if (\is_object($xobject) && $xobject instanceof self && !\in_array($xobject->getUniqueId(), self::$recursionStack)) { // 
Not a circular reference. - $text .= $xobject->getText($page); + $text[] = $xobject->getText($page); } } break; - case 'rg': - case 'RG': - break; - - case 're': + // Marked content point with (DP) & without (MP) property list + case 'DP': + case 'MP': break; - case 'co': + // End text object + case 'ET': break; - case 'cs': - break; - - case 'gs': - break; - - case 'en': - break; - - case 'sc': - case 'SC': - break; - - case 'g': - case 'G': - break; - - case 'V': - break; - - case 'vo': - case 'Vo': + // Store current selected font and graphics matrix + case 'q': + $clipped_font[] = [$current_font, $current_font_size]; + $clipped_position_cm[] = $current_position_cm; break; - default: - } - } - - // Fix Hebrew and other reverse text oriented languages. - // @see: https://github.com/smalot/pdfparser/issues/398 - if ($reverse_text) { - $chars = mb_str_split($text, 1, mb_internal_encoding()); - $text = implode('', array_reverse($chars)); - } - - $result .= $text; - } - - return $result.' '; - } - - /** - * @throws \Exception - */ - public function getTextArray(Page $page = null): array - { - $text = []; - $sections = $this->getSectionsText($this->content); - $current_font = new Font($this->document, null, null, $this->config); - - foreach ($sections as $section) { - $commands = $this->getCommandsText($section); - - foreach ($commands as $command) { - switch ($command[self::OPERATOR]) { - // set character spacing - case 'Tc': + // Restore previous selected font and graphics matrix; a 'Q' with no matching 'q' (empty stack) keeps the current state + case 'Q': + list($current_font, $current_font_size) = array_pop($clipped_font) ?? [$current_font, $current_font_size]; + $current_position_cm = array_pop($clipped_position_cm) ?? $current_position_cm; break; - // move text current point - case 'Td': - break; + // End marked content sequence + case 'EMC': + $data = false; + if (\count($marked_stack)) { + $marked = array_pop($marked_stack); + $action = key($marked); + $data = $marked[$action]; + + switch ($action) { + // If we are in ReversedChars mode... 
+ case 'ReversedChars': + // Reverse the characters we've built up so far + foreach ($text as $key => $t) { + $text[$key] = implode('', array_reverse( + mb_str_split($t, 1, mb_internal_encoding()) + )); + } + + // Add these characters to the result array + $result = array_merge($result, $text); + + // Start a fresh $text array that will contain + // non-reversed characters + $text = []; + break; - // move text current point and set leading - case 'TD': - break; + case 'ActualText': + // Use the content of the ActualText as a command + $command = $data; + break; + } + } - case 'Tf': - if (null !== $page) { - list($id) = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim($id, '/'); - $current_font = $page->getFont($id); + // If this EMC command has been transformed into a 'Tj' + // or 'TJ' command because of being ActualText, then bypass + // the break to proceed to the writing section below. + if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) { + break; } - break; + // no break case "'": + case '"': + if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) { + // Move to next line and write text + $current_position['x'] = 0; + $current_position_td['x'] = 0; + $current_position_td['y'] += $current_text_leading; + } + // no break case 'Tj': $command[self::COMMAND] = [$command]; // no break case 'TJ': - $text[] = $this->getTJUsingFontFallback( + // Check the marked content stack for flags + $actual_text = false; + $reverse_text = false; + foreach ($marked_stack as $marked) { + if (isset($marked['ActualText'])) { + $actual_text = true; + } + if (isset($marked['ReversedChars'])) { + $reverse_text = true; + } + } + + // Account for text position ONLY just before we write text + if (false === $actual_text && \is_array($last_written_position)) { + // If $last_written_position is an array, that + // means we have stored text position coordinates + // for placing an ActualText + $currentX = $last_written_position[0]; + $currentY 
= $last_written_position[1]; + $last_written_position = false; + } else { + $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x']; + $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y']; + } + $whiteSpace = ''; + + $factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i']; + $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j']; + + if (true === $this->addPositionWhitespace && false !== $current_position['x']) { + $curY = $currentY - $current_position['y']; + if (abs($curY) >= abs($factorY) / 4) { + $whiteSpace = "\n"; + } else { + if (true === $reverse_text) { + $curX = $current_position['x'] - $currentX; + } else { + $curX = $currentX - $current_position['x']; + } + + // In abs($factorX * 7) below, the 7 is chosen arbitrarily + // as the number of apparent "spaces" in a document we + // would need before considering them a "tab". In the + // future, we might offer this value to users as a config + // option. 
+ if ($curX >= abs($factorX * 7)) { + $whiteSpace = "\t"; + } elseif ($curX >= abs($factorX * 2)) { + $whiteSpace = ' '; + } + } + } + + $newtext = $this->getTJUsingFontFallback( $current_font, $command[self::COMMAND], - $page + $page, + $factorX ); - break; - // set leading - case 'TL': - break; + // If there is no ActualText pending then write + if (false === $actual_text) { + $newtext = str_replace(["\r", "\n"], '', $newtext); + if (false !== $reverse_text) { + // If we are in ReversedChars mode, add the whitespace last + $text[] = preg_replace('/ $/', ' ', $newtext.$whiteSpace); + } else { + // Otherwise add the whitespace first + if (' ' === $whiteSpace && isset($text[\count($text) - 1])) { + $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]); + } + $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext); + } - case 'Tm': + // Record the position of this inserted text for comparison + // with the next text block. + // Provide a 'fudge' factor guess on how wide this text block + // is based on the number of characters. This helps limit the + // number of tabs inserted, but isn't perfect. 
+ $factor = $factorX / 2; + $current_position = [ + 'x' => $currentX - mb_strlen($newtext) * $factor, + 'y' => $currentY, + ]; + } elseif (false === $last_written_position) { + // If there is an ActualText in the pipeline + // store the position this undisplayed text + // *would* have been written to, so the + // ActualText is displayed in the right spot + $last_written_position = [$currentX, $currentY]; + $current_position['x'] = $currentX; + } break; - // set super/subscripting text rise - case 'Ts': + // move to start of next line + case 'T*': + $current_position['x'] = 0; + $current_position_td['x'] = 0; + $current_position_td['y'] += $current_text_leading; break; - // set word spacing - case 'Tw': + // set character spacing + case 'Tc': break; - // set horizontal scaling - case 'Tz': - // $text .= "\n"; - break; + // move text current point and set leading + case 'Td': + case 'TD': + // move text current point + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $y = (float) array_pop($args); + $x = (float) array_pop($args); - // move to start of next line - case 'T*': - // $text .= "\n"; - break; + if ('TD' == $command[self::OPERATOR]) { + $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j']; + } - case 'Da': + $current_position_td = [ + 'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'], + 'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'], + ]; break; - case 'Do': + case 'Tf': + $args = preg_split('/\s/s', $command[self::COMMAND]); + $size = (float) array_pop($args); + $id = trim(array_pop($args), '/'); if (null !== $page) { - $args = preg_split('/\s/s', $command[self::COMMAND]); - $id = trim(array_pop($args), '/ '); - if ($xobject = $page->getXObject($id)) { - $text[] = $xobject->getText($page); + $new_font = $page->getFont($id); + // If an invalid font ID is given, do not update the font. 
+ // This should theoretically never happen, as the PDF spec states for the Tf operator: + // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" + // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) + // But we want to make sure that malformed PDFs do not simply crash. + if (null !== $new_font) { + $current_font = $new_font; + $current_font_size = $size; } } break; - case 'rg': - case 'RG': - break; - - case 're': - break; - - case 'co': - break; - - case 'cs': - break; - - case 'gs': + // set leading + case 'TL': + $y = (float) $command[self::COMMAND]; + $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j']; break; - case 'en': + // set text position matrix + case 'Tm': + $args = preg_split('/\s+/s', $command[self::COMMAND]); + $current_position_tm = [ + 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, + 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, + 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, + ]; break; - case 'sc': - case 'SC': + // set text rendering mode + case 'Ts': break; - case 'g': - case 'G': + // set super/subscripting text rise + case 'Ts': break; - case 'V': + // set word spacing + case 'Tw': break; - case 'vo': - case 'Vo': + // set horizontal scaling + case 'Tz': break; default: @@ -612,198 +950,103 @@ public function getTextArray(Page $page = null): array } } - return $text; + $result = array_merge($result, $text); + + return $result; } + /** + * getCommandsText() expects the content of $text_part to be an + * already formatted, single-line command from a document stream. + * The companion function getSectionsText() returns a document + * stream as an array of single commands for just this purpose. + * Because of this, the argument $offset is no longer used, and + * may be removed in a future PdfParser release. 
+ * + * A better name for this function would be getCommandText() + * since it now always works on just one command. + */ public function getCommandsText(string $text_part, int &$offset = 0): array { $commands = $matches = []; - while ($offset < \strlen($text_part)) { - $offset += strspn($text_part, "\x00\x09\x0a\x0c\x0d\x20", $offset); - $char = $text_part[$offset]; - - $operator = ''; - $type = ''; - $command = false; - - switch ($char) { - case '/': - $type = $char; - if (preg_match( - '/\G\/([A-Z0-9\._,\+-]+\s+[0-9.\-]+)\s+([A-Z]+)\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = $matches[2]; - $command = $matches[1]; - $offset += \strlen($matches[0]); - } elseif (preg_match( - '/\G\/([A-Z0-9\._,\+-]+)\s+([A-Z]+)\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = $matches[2]; - $command = $matches[1]; - $offset += \strlen($matches[0]); - } - break; - - case '[': - case ']': - // array object - $type = $char; - if ('[' == $char) { - ++$offset; - // get elements - $command = $this->getCommandsText($text_part, $offset); - - if (preg_match( - '/\G\s*[A-Z]{1,2}\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = trim($matches[0]); - $offset += \strlen($matches[0]); - } - } else { - ++$offset; - break; - } - break; - - case '<': - case '>': - // array object - $type = $char; - ++$offset; - if ('<' == $char) { - $strpos = strpos($text_part, '>', $offset); - $command = substr($text_part, $offset, $strpos - $offset); - $offset = $strpos + 1; - } + preg_match('/^(([\/\[\(<])?.*)(? 
0) { - if (!isset($text_part[$strpos])) { - break; - } - $ch = $text_part[$strpos]; - switch ($ch) { - case '\\': - // REVERSE SOLIDUS (5Ch) (Backslash) - // skip next character - ++$strpos; - break; - - case '(': - // LEFT PARENHESIS (28h) - ++$open_bracket; - break; + // If no valid command is detected, return an empty array + if (!isset($matches[1]) || !isset($matches[2]) || !isset($matches[3])) { + return []; + } - case ')': - // RIGHT PARENTHESIS (29h) - --$open_bracket; - break; - } - ++$strpos; - } - $command = substr($text_part, $offset, $strpos - $offset - 1); - $offset = $strpos; - - if (preg_match( - '/\G\s*([A-Z\']{1,2})\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = $matches[1]; - $offset += \strlen($matches[0]); - } + $type = $matches[2]; + $operator = $matches[3]; + $command = trim($matches[1]); + + if ('TJ' == $operator) { + $subcommand = []; + $command = trim($command, '[]'); + do { + $oldCommand = $command; + + // Search for parentheses string () format + if (preg_match('/^ *\((.*?)(? 
'(', + self::OPERATOR => 'TJ', + self::COMMAND => $tjmatch[1], + ]; + if (isset($tjmatch[2]) && trim($tjmatch[2])) { + $subcommand[] = [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => $tjmatch[2], + ]; } - break; + $command = substr($command, \strlen($tjmatch[0])); + } - default: - if ('ET' == substr($text_part, $offset, 2)) { - break; - } elseif (preg_match( - '/\G\s*(?P([0-9\.\-]+\s*?)+)\s+(?P[A-Z]{1,3})\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $operator = trim($matches['id']); - $command = trim($matches['data']); - $offset += \strlen($matches[0]); - } elseif (preg_match( - '/\G\s*([0-9\.\-]+\s*?)+\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $type = 'n'; - $command = trim($matches[0]); - $offset += \strlen($matches[0]); - } elseif (preg_match( - '/\G\s*([A-Z\*]+)\s*/si', - $text_part, - $matches, - 0, - $offset - ) - ) { - $type = ''; - $operator = $matches[1]; - $command = ''; - $offset += \strlen($matches[0]); + // Search for hexadecimal <> format + if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) { + $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]); + $subcommand[] = [ + self::TYPE => '<', + self::OPERATOR => 'TJ', + self::COMMAND => $tjmatch[1], + ]; + if (isset($tjmatch[2]) && trim($tjmatch[2])) { + $subcommand[] = [ + self::TYPE => 'n', + self::OPERATOR => '', + self::COMMAND => $tjmatch[2], + ]; } + $command = substr($command, \strlen($tjmatch[0])); + } + } while ($command != $oldCommand); + + $command = $subcommand; + } elseif ('Tj' == $operator || "'" == $operator || '"' == $operator) { + // Depending on the string type, trim the data of the + // appropriate delimiters + if ('(' == $type) { + // Don't use trim() here since a () string may end with + // a balanced or escaped right parentheses, and trim() + // will delete both. Both strings below are valid: + // eg. (String()) + // eg. 
(String\)) + $command = preg_replace('/^\(|\)$/', '', $command); + } elseif ('<' == $type) { + $command = trim($command, '<>'); } - - if (false !== $command) { - $commands[] = [ - self::TYPE => $type, - self::OPERATOR => $operator, - self::COMMAND => $command, - ]; - } else { - break; - } + } elseif ('/' == $type) { + $command = substr($command, 1); } + $commands[] = [ + self::TYPE => $type, + self::OPERATOR => $operator, + self::COMMAND => $command, + ]; + return $commands; } diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php index 10afb523..b8002bd3 100644 --- a/src/Smalot/PdfParser/Page.php +++ b/src/Smalot/PdfParser/Page.php @@ -400,8 +400,6 @@ public function extractRawData(): array } $sectionsText = $content->getSectionsText($content->getContent()); foreach ($sectionsText as $sectionText) { - $extractedData[] = ['t' => '', 'o' => 'BT', 'c' => '']; - $commandsText = $content->getCommandsText($sectionText); foreach ($commandsText as $command) { $extractedData[] = $command; @@ -701,6 +699,12 @@ public function getDataTm(array $dataCommands = null): array $extractedTexts = $this->getTextArray(); $extractedData = []; foreach ($dataCommands as $command) { + // If we've used up all the texts from getTextArray(), exit + // so we aren't accessing non-existent array indices + // Fixes 'undefined array key' errors in Issues #575, #576 + if (\count($extractedTexts) <= \count($extractedData)) { + break; + } $currentText = $extractedTexts[\count($extractedData)]; switch ($command['o']) { /* @@ -716,15 +720,9 @@ public function getDataTm(array $dataCommands = null): array /* * ET - * End a text object, discarding the text matrix + * End a text object */ case 'ET': - $Tm = $defaultTm; - $Tl = $defaultTl; - $Tx = 0; - $Ty = 0; - $fontId = $defaultFontId; - $fontSize = $defaultFontSize; break; /* @@ -739,7 +737,7 @@ public function getDataTm(array $dataCommands = null): array /* * tx ty Td - * Move to the start of the next line, offset form the start of 
the + * Move to the start of the next line, offset from the start of the * current line by tx, ty. */ case 'Td': diff --git a/tests/PHPUnit/Integration/DocumentGeneratorFocusTest.php b/tests/PHPUnit/Integration/DocumentGeneratorFocusTest.php new file mode 100644 index 00000000..f49f6f2d --- /dev/null +++ b/tests/PHPUnit/Integration/DocumentGeneratorFocusTest.php @@ -0,0 +1,225 @@ + + * + * @date 2020-06-01 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Parser; + +/** + * Document related tests which focus on certain PDF generators. + */ +class DocumentGeneratorFocusTest extends TestCase +{ + /** + * Test getText result. + * + * PDF generated with Chromium 116 via SaveAs-dialog. + */ + public function testGetTextPull634Chromium(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/R2RML-Spec_Generated_by_Chromium-SaveAs-PDF.pdf'); + + self::assertStringContainsString('R2RML: RDB to RDF Mapping Language', $document->getText()); + } + + /** + * Test getText result. + * + * PDF (v 1.4) generated with Inkscape 0.92. 
+ */ + public function testGetTextPull634InkscapePDF14(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.4.pdf'); + + self::assertEquals('TEST', $document->getText()); + } + + /** + * Test getText result. + * + * PDF (v 1.5) generated with Inkscape 0.92. + */ + public function testGetTextPull634InkscapePDF15(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/SimpleImage_Generated_by_Inkscape-0.92_PDF-v1.5.pdf'); + + self::assertEquals('TEST', $document->getText()); + } + + /** + * Test getText result. + * + * PDF (1.4) generated with LibreOffice Writer (6.4). + * + * @see https://help.libreoffice.org/6.4/en-US/text/shared/01/ref_pdf_export.html + */ + public function testGetTextPull634LibreOffice(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/RichDocument_Generated_by_Libreoffice-6.4_PDF-v1.4.pdf'); + + self::assertStringContainsString( + 'Some currency symbols: £, €, ¥'."\n".'German characters: ÄÖÜß', + $document->getText() + ); + } + + /** + * Test getText result. + * + * PDF (v 1.7) generated with Microsoft Print-to-PDF via Firefox. + */ + public function testGetTextPull634MicrosoftPDF17(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/Wikipedia-PDF_Generated_by_Microsoft_Print-to-PDF.pdf'); + + $outputText = $document->getText(); + + self::assertStringContainsString( + 'Adobe PDF icon'."\n".'Filename'."\n".'extension', + $outputText + ); + + self::assertStringContainsString( + 'are necessary to make, use, sell, and distribute PDF-compliant', + $outputText + ); + } + + /** + * Test Document functions. + * + * PDF (v 1.5) generated by Microsoft Word 2016. 
+ */ + public function testGetTextPull634MicrosoftWord2016(): void + { + $path = $this->rootDir.'/samples/grouped-by-generator/Mostly_German_Text_Generated_By_Microsoft_Word_2016.pdf'; + $document = (new Parser())->parseFile($path); + + $outputText = $document->getText(); + + self::assertStringContainsString('(einschließlich Marktpflegequote) von 4 Mrd € angestrebt.', $outputText); + + // check whitespaces and tab usage + self::assertStringContainsString( + // ,--- here is a tab + 'Fälligkeit: 19. Oktober 2028 '."\n". + 'Zinszahlung: 19. Oktober gzj., Zinslaufbeginn 15. Juni 2023', + $outputText + ); + } + + /** + * Test getText result. + * + * PDF (v 1.5) generated with Power PDF Create. + */ + public function testGetTextPull634PowerPDFCreate(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/Mostly_German_Text_Generated_by_Power_PDF_Create.pdf'); + + $outputText = $document->getText(); + + // located on page 1 + self::assertStringContainsString( + 'Index-Verhältniszahl: 1,17812 (am Valutierungstag 7. September 2023)', + $outputText + ); + + // located on page 2 + self::assertStringContainsString( + 'Einbeziehung in den '."\n". + 'Börsenhandel: Dienstag, 5. September 2023 '."\n". + 'Valutierungstag: Donnerstag, 7. September 2023', + $outputText + ); + } + + /** + * Test getText result. 
+ * + * PDF generated from .docx with SmallPDF (https://smallpdf.com) + */ + public function testGetTextPull634SmallPDF(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/grouped-by-generator/Document_Generated_by_SmallPDF.pdf'); + + $outputText = $document->getText(); + + // Actual encoded spaces in the document are preserved + self::assertStringContainsString( + 'SmallPDF SMALLPDF SmallPDF', + $outputText + ); + + // Hebrew text + self::assertStringContainsString( + 'Hebrew Keyboard - תדלקמ תירבעב - Type Hebrew Online', + $outputText + ); + + // Russian text + self::assertStringContainsString( + 'Russian Keyboard - русская клавиатура - Type Russian', + $outputText + ); + } + + /** + * Test getText result. + * + * PDF (1.6) generated by Word için Acrobat PDFMaker 17. + */ + public function testGetTextPull634WordIcinAcrobatPDFMaker17(): void + { + $path = $this->rootDir.'/samples/grouped-by-generator/Mostly_Turkish_Text_Generated_by_Word_için_Acrobat_PDFMaker_17.pdf'; + $document = (new Parser())->parseFile($path); + + $outputText = $document->getText(); + + self::assertStringContainsString( + 'İhracat ve döviz kazandırıcı hizmetler reeskont kredisi günlük', + $outputText + ); + + // Unnecessary tabs are not inserted due to font-size being 1, + // but the text-matrix scale is 9 or 10 + self::assertStringContainsString( + 'dikkate alınmasına devam edilecektir.', + $outputText + ); + + // This encoded segment contains an escaped backslash right before + // an octal code: \\\000. 
Account for this in Font::decodeOctal() + // See: https://github.com/smalot/pdfparser/pull/640 + self::assertStringContainsString('Sayı: 2023-34', $outputText); + } +} diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php new file mode 100644 index 00000000..7c7fe7e6 --- /dev/null +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -0,0 +1,114 @@ + + * + * @date 2020-06-01 + * + * @author Sébastien MALOT + * + * @date 2017-01-03 + * + * @license LGPLv3 + * + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace PHPUnitTests\Integration; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; +use Smalot\PdfParser\Parser; + +/** + * Document related tests which are related to certain issues. + */ +class DocumentIssueFocusTest extends TestCase +{ + /** + * Tests getText method without a given page limit. + * + * @see https://github.com/smalot/pdfparser/pull/562 + */ + public function testGetTextNoPageLimit(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf'); + + self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText()); + } + + /** + * Tests getText method with a given page limit. 
+ * + * @see https://github.com/smalot/pdfparser/pull/562 + */ + public function testGetTextWithPageLimit(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf'); + + // given text is on page 2, it has to be ignored because of that + self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1)); + } + + /** + * Tests extraction of XMP Metadata vs. getHeader() data. + * + * @see https://github.com/smalot/pdfparser/pull/606 + */ + public function testExtractXMPMetadata(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf'); + + $details = $document->getDetails(); + + // Test that the dc:title data was extracted from the XMP + // Metadata. + self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']); + } + + /** + * Tests PDFDocEncoding decode of Document Properties + * + * @see https://github.com/smalot/pdfparser/issues/609 + */ + public function testPDFDocEncodingDecode(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue609.pdf'); + + $details = $document->getDetails(); + + // These test that Adobe-inserted \r are removed from a UTF-8 + // escaped metadata string, and the surrounding characters are + // repaired + $testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™fiflŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'; + self::assertStringContainsString($testKeywords, $details['Keywords']); + + $testKeywords = 'added line-feeds often destroy multibyte characters'; + self::assertStringContainsString($testKeywords, $details['Keywords']); + + // This tests that the PDFDocEncoding characters that differ + // from CP-1252 are decoded to their correct UTF-8 code points + // as well as removing \r line-feeds + $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; + self::assertStringContainsString($testSubject, $details['Subject']); + 
} +} diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php index 88137888..26553fca 100644 --- a/tests/PHPUnit/Integration/DocumentTest.php +++ b/tests/PHPUnit/Integration/DocumentTest.php @@ -40,9 +40,11 @@ use Smalot\PdfParser\Header; use Smalot\PdfParser\Page; use Smalot\PdfParser\Pages; -use Smalot\PdfParser\Parser; use Smalot\PdfParser\PDFObject; +/** + * General Document related tests. + */ class DocumentTest extends TestCase { protected function getDocumentInstance(): Document @@ -230,72 +232,4 @@ public function testGetPagesMissingCatalog(): void $document = $this->getDocumentInstance(); $document->getPages(); } - - /** - * Tests getText method without a given page limit. - * - * @see https://github.com/smalot/pdfparser/pull/562 - */ - public function testGetTextNoPageLimit(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf'); - - self::assertStringContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText()); - } - - /** - * Tests getText method with a given page limit. - * - * @see https://github.com/smalot/pdfparser/pull/562 - */ - public function testGetTextWithPageLimit(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/Issue331.pdf'); - - // given text is on page 2, it has to be ignored because of that - self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1)); - } - - /** - * Tests extraction of XMP Metadata vs. getHeader() data. - * - * @see https://github.com/smalot/pdfparser/pull/606 - */ - public function testExtractXMPMetadata(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf'); - - $details = $document->getDetails(); - - // Test that the dc:title data was extracted from the XMP - // Metadata. 
- self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']); - } - - /** - * Tests PDFDocEncoding decode of Document Properties - * - * @see https://github.com/smalot/pdfparser/issues/609 - */ - public function testPDFDocEncodingDecode(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/Issue609.pdf'); - - $details = $document->getDetails(); - - // These test that Adobe-inserted \r are removed from a UTF-8 - // escaped metadata string, and the surrounding characters are - // repaired - $testKeywords = '˘ˇˆ˙˝˛˞˜•†‡…—–ƒ⁄‹›−‰„“”‘’‚™fiflŁŒŠŸŽıłœšž€¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'; - self::assertStringContainsString($testKeywords, $details['Keywords']); - - $testKeywords = 'added line-feeds often destroy multibyte characters'; - self::assertStringContainsString($testKeywords, $details['Keywords']); - - // This tests that the PDFDocEncoding characters that differ - // from CP-1252 are decoded to their correct UTF-8 code points - // as well as removing \r line-feeds - $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; - self::assertStringContainsString($testSubject, $details['Subject']); - } } diff --git a/tests/PHPUnit/Integration/FontTest.php b/tests/PHPUnit/Integration/FontTest.php index e76a051d..35d03284 100644 --- a/tests/PHPUnit/Integration/FontTest.php +++ b/tests/PHPUnit/Integration/FontTest.php @@ -367,8 +367,6 @@ public function testDecodeText(): void * which would be instance of PDFObject class (but not Encoding or ElementString). 
* * @see https://github.com/smalot/pdfparser/pull/500 - * - * @group linux-only */ public function testDecodeTextForFontWithIndirectEncodingWithoutTypeEncoding(): void { @@ -378,14 +376,12 @@ public function testDecodeTextForFontWithIndirectEncodingWithoutTypeEncoding(): $pages = $document->getPages(); $page1 = reset($pages); $page1Text = $page1->getText(); - $expectedText = <<assertEquals($expectedText, trim($page1Text)); } @@ -454,6 +450,21 @@ public function testCalculateTextWidth(): void $this->assertEquals([], $missing); } + public function testDecodeContent(): void + { + /* + * we do this to get into the branch with private method "decodeContentByEncodingElement" in Font.php + */ + $encoding = $this->createMock(Element::class); + $encoding->method('getContent')->willReturn('WinAnsiEncoding'); + $header = new Header(['Encoding' => $encoding]); + + $font = new Font($this->createMock(Document::class), $header); + + // Check that a string with UTF-16BE BOM is decoded directly + $this->assertEquals('ABC', $font->decodeContent("\xFE\xFF\x00\x41\x00\x42\x00\x43")); + } + /** * Check behavior if getDetails() does return an array without a Widths-key. 
* diff --git a/tests/PHPUnit/Integration/PDFObjectTest.php b/tests/PHPUnit/Integration/PDFObjectTest.php index ab7b229b..9ec68043 100644 --- a/tests/PHPUnit/Integration/PDFObjectTest.php +++ b/tests/PHPUnit/Integration/PDFObjectTest.php @@ -52,12 +52,9 @@ protected function getPdfObjectInstance($document): PDFObject return new PDFObject($document); } - /** - * @group linux-only - */ public function testGetCommandsText(): void { - $content = "/R14 30 Tf 0.999016 0 0 1 137.4 + $content = "BT /R14 30 Tf 0.999016 0 0 1 137.4 342.561 Tm [(A)-168.854( BC D)-220.905(\\(E\\))20.905<20>] TJ /R14 17.16 Tf <20> Tj @@ -67,9 +64,20 @@ public function testGetCommandsText(): void q -124.774 124.127 5.64213 5.67154 930.307 4436.95 cm BI"; + $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content); + $offset = 0; - $parts = $this->getPdfObjectInstance(new Document())->getCommandsText($content, $offset); + $parts = []; + foreach ($sections as $section) { + $parts[] = $this->getPdfObjectInstance(new Document())->getCommandsText($section)[0]; + } + $reference = [ + [ + self::TYPE => '', + self::OPERATOR => 'BT', + self::COMMAND => '', + ], [ self::TYPE => '/', self::OPERATOR => 'Tf', @@ -78,7 +86,7 @@ public function testGetCommandsText(): void [ self::TYPE => '', self::OPERATOR => 'Tm', - self::COMMAND => "0.999016 0 0 1 137.4\n342.561", + self::COMMAND => '0.999016 0 0 1 137.4 342.561', ], [ self::TYPE => '[', @@ -86,7 +94,7 @@ public function testGetCommandsText(): void self::COMMAND => [ [ self::TYPE => '(', - self::OPERATOR => '', + self::OPERATOR => 'TJ', self::COMMAND => 'A', ], [ @@ -96,7 +104,7 @@ public function testGetCommandsText(): void ], [ self::TYPE => '(', - self::OPERATOR => '', + self::OPERATOR => 'TJ', self::COMMAND => ' BC D', ], [ @@ -106,7 +114,7 @@ public function testGetCommandsText(): void ], [ self::TYPE => '(', - self::OPERATOR => '', + self::OPERATOR => 'TJ', self::COMMAND => '\\(E\\)', ], [ @@ -116,7 +124,7 @@ public function 
testGetCommandsText(): void ], [ self::TYPE => '<', - self::OPERATOR => '', + self::OPERATOR => 'TJ', self::COMMAND => '20', ], ], @@ -151,10 +159,29 @@ public function testGetCommandsText(): void self::OPERATOR => 'Tf', self::COMMAND => 'R14 20.04', ], + [ + self::TYPE => '', + self::OPERATOR => 'ET', + self::COMMAND => '', + ], + [ + self::TYPE => '', + self::OPERATOR => 'Q', + self::COMMAND => '', + ], + [ + self::TYPE => '', + self::OPERATOR => 'q', + self::COMMAND => '', + ], + [ + self::TYPE => '', + self::OPERATOR => 'cm', + self::COMMAND => '-124.774 124.127 5.64213 5.67154 930.307 4436.95', + ], ]; $this->assertEquals($parts, $reference); - $this->assertEquals(172, $offset); } public function testCleanContent(): void @@ -202,10 +229,54 @@ public function testCleanContent(): void $this->assertEquals($cleaned, $expected); } - /** - * @group linux-only - */ - public function testGetSectionText(): void + public function testFormatContent(): void + { + $content = '/Shape <>> BT >>BDC Q /CS0 cs 1 1 0 scn 1 i +/GS0 gs BT /TT0 1 Tf 0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm +(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj EMC (ABC) Tj +[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ ET /Shape <>BDC q 0.03 841'; + + $expected = '/Shape <>> BT >>BDC +Q +/CS0 cs +1 1 0 scn +1 i +/GS0 gs +BT +/TT0 1 Tf +0.0007 Tc +0.0018 Tw +0 Ts +100 Tz +0 Tr +24 0 0 24 51.3 639.26025 Tm +(Modificatio[ns] au \\(14\\) septembre 2009 ET 2010)Tj +EMC +(ABC) Tj +[ (a)-4.5(b)6(c)8.8 ( fsdfsdfsdf[]sd) ] TJ +ET +/Shape <>BDC +q +0.03 841'; + + // Normalize line-endings + $expected = str_replace(["\r\n", "\n"], ["\n", "\r\n"], $expected); + + $formatContent = new \ReflectionMethod('Smalot\PdfParser\PDFObject', 'formatContent'); + $formatContent->setAccessible(true); + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertEquals($expected, $cleaned); + + // Check that binary data is rejected + $content = 
hex2bin('a670c89d4a324e47'); + + $cleaned = $formatContent->invoke($this->getPdfObjectInstance(new Document()), $content); + + $this->assertEquals('', $cleaned); + } + + public function testGetSectionsText(): void { $content = '/Shape <>BDC Q @@ -229,16 +300,117 @@ public function testGetSectionText(): void $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content); $this->assertEquals( - ['/TT0 1 Tf -0.0007 Tc 0.0018 Tw 0 Ts 100 Tz 0 Tr 24 0 0 24 51.3 639.26025 Tm -(Mod BT atio[ns] au \(14\) septembre 2009 ET 2010)Tj -EMC -(ABC) Tj - -[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD', '/TT1 1.5 Tf (BT )Tj -q'], + [ + '/Shape <>BDC', + 'Q', + 'BT', + '/TT0 1 Tf', + '0.0007 Tc', + '0.0018 Tw', + '0 Ts', + '100 Tz', + '0 Tr', + '24 0 0 24 51.3 639.26025 Tm', + '(Mod BT atio[ns] au \\(14\\) septembre 2009 ET 2010)Tj', + 'EMC', + '(ABC) Tj', + '[ (a)-4.5(b) 6(c)8.8 ( fsdfsdfsdf[ sd) ] TD', + 'ET', + '/Shape <>BDC', + 'BT', + '/TT1 1.5 Tf', + '(BT )Tj', + 'ET', + 'q', + ], $sections ); + + // Test that a Name containing 'ET' doesn't close a 'BT' block + // See: https://github.com/smalot/pdfparser/issues/474 + $content = 'BT +/FTxkPETkkj 8 Tf +1 0 0 1 535.55 627.4 Tm +(Hello World)TJ +ET'; + + $sections = $this->getPdfObjectInstance(new Document())->getSectionsText($content); + + $this->assertNotEquals('/FTxkP', $sections[0]); + $this->assertNotEquals('/FTxkP', $sections[1]); + } + + public function testParseDictionary(): void + { + $data = '<> /Array[/Parsed /Data/Actual]/Silent<>>>'; + + $dictionary = $this->getPdfObjectInstance(new Document())->parseDictionary($data); + + $this->assertArrayHasKey('ActualText', $dictionary); + $this->assertArrayHasKey('XObject', $dictionary); + $this->assertArrayHasKey('Array', $dictionary); + $this->assertArrayHasKey('Silent', $dictionary); + + $this->assertCount(3, $dictionary['Array']); + + $this->assertEquals('<>', $dictionary['Silent']); + } + + /** + * Tests that graphics position (cm) is taken into account 
when + * positioning text + * + * @see: https://github.com/smalot/pdfparser/issues/608 + */ + public function testGraphicsPositioning(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue608.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + // The \n is not added if 'cm' commands are ignored + $this->assertStringContainsString("Heading 1 \nLorem ipsum", $pages[0]->getText()); + } + + /** + * Tests that ActualText text is printed for a block instead of the + * contents of the Tj or TJ commands in the block. + * + * @see: https://github.com/smalot/pdfparser/issues/464 + */ + public function testActualText(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue608.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + // An ActualText command subs in the three literal characters + // 'ffi' for the single character ligature here + // In addition, if $last_written_position isn't used to store + // the position to insert, \n's would be erroniously inserted + // on either side of the 'ffi' + $this->assertStringContainsString('efficitur', $pages[0]->getText()); + } + + /** + * Tests for the correct decoding of an Em-dash character in + * certain font contexts + * + * See: https://github.com/smalot/pdfparser/issues/585 + */ + public function testDecodeEmDash(): void + { + $filename = $this->rootDir.'/samples/bugs/Issue585.pdf'; + + $parser = $this->getParserInstance(); + $document = $parser->parseFile($filename); + $pages = $document->getPages(); + + $this->assertStringContainsString('слева по ходу — веревка', $pages[0]->getText()); } /** @@ -254,7 +426,10 @@ public function testReversedChars(): void $document = $parser->parseFile($filename); $pages = $document->getPages(); - $this->assertStringContainsString('שלומי טסט', $pages[0]->getText()); + $pageText = $pages[0]->getText(); + + 
$this->assertStringContainsString('שלומי טסט', $pageText); + $this->assertStringContainsString('בנמל מספנות ישראל.', $pageText); } /** @@ -290,4 +465,23 @@ public function testFontIDWithHyphen(): void $this->assertEquals('Tf', $fontCommandHyphen[0]['o']); $this->assertEquals('FID-01 15.00', $fontCommandHyphen[0]['c']); } + + /** + * Tests that an invalid command does not cause an error, but just + * returns an empty array + */ + public function testInvalidCommand(): void + { + $pdfObject = $this->getPdfObjectInstance(new Document()); + + $validCommand = $pdfObject->getCommandsText('75 rg'); + + $this->assertEquals('', $validCommand[0]['t']); + $this->assertEquals('rg', $validCommand[0]['o']); + $this->assertEquals('75', $validCommand[0]['c']); + + $invalidCommand = $pdfObject->getCommandsText('75'); + + $this->assertEquals([], $invalidCommand); + } } diff --git a/tests/PHPUnit/Integration/PageTest.php b/tests/PHPUnit/Integration/PageTest.php index 7807101e..5e40ee90 100644 --- a/tests/PHPUnit/Integration/PageTest.php +++ b/tests/PHPUnit/Integration/PageTest.php @@ -166,7 +166,7 @@ public function testGetTextPullRequest457(): void $this->assertStringContainsString('KRANT', $text); $this->assertStringContainsString('DINSDAG', $text); $this->assertStringContainsString('Snelfilterkoffie', $text); - $this->assertStringContainsString('AardappelenZak', $text); + $this->assertStringContainsString('Aardappelen'."\n".'Zak', $text); $this->assertStringContainsString('ALL', $text); } @@ -180,7 +180,7 @@ public function testExtractRawData(): void $page = $pages[0]; $extractedRawData = $page->extractRawData(); - $btItem = $extractedRawData[0]; + $btItem = $extractedRawData[4]; $this->assertCount(3, $btItem); $this->assertArrayHasKey('t', $btItem); $this->assertArrayHasKey('o', $btItem); @@ -188,9 +188,9 @@ public function testExtractRawData(): void $this->assertEquals('BT', $btItem['o']); - $tmItem = $extractedRawData[2]; + $tmItem = $extractedRawData[6]; - 
$this->assertcount(174, $extractedRawData); + $this->assertcount(185, $extractedRawData); $this->assertCount(3, $tmItem); $this->assertArrayHasKey('t', $tmItem); @@ -210,8 +210,8 @@ public function testExtractDecodedRawData(): void $pages = $document->getPages(); $page = $pages[0]; $extractedDecodedRawData = $page->extractDecodedRawData(); - $tmItem = $extractedDecodedRawData[2]; - $this->assertCount(174, $extractedDecodedRawData); + $tmItem = $extractedDecodedRawData[6]; + $this->assertCount(185, $extractedDecodedRawData); $this->assertCount(3, $tmItem); $this->assertArrayHasKey('t', $tmItem); @@ -226,7 +226,7 @@ public function testExtractDecodedRawData(): void $this->assertArrayHasKey('o', $tmItem); $this->assertArrayHasKey('c', $tmItem); - $tjItem = $extractedDecodedRawData[3]; + $tjItem = $extractedDecodedRawData[7]; $this->assertStringContainsString('TJ', $tjItem['o']); $this->assertStringContainsString('(', $tjItem['c'][0]['t']); $this->assertStringContainsString('D', $tjItem['c'][0]['c']); @@ -256,7 +256,7 @@ public function testGetDataCommands(): void $pages = $document->getPages(); $page = $pages[0]; $dataCommands = $page->getDataCommands(); - $this->assertCount(174, $dataCommands); + $this->assertCount(176, $dataCommands); $tmItem = $dataCommands[2]; $this->assertCount(3, $tmItem); diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 2531ba8e..29091914 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -321,7 +321,7 @@ public function testChangedFontSpaceLimit(): void $this->fixture = new Parser([], $config); $document = $this->fixture->parseFile($filename); - $this->assertStringContainsString('dni a 10 maj a 2018', $document->getText()); + $this->assertStringContainsString('dni a 10 maj a 2018', $document->getText()); } /** @@ -376,7 +376,7 @@ public function testRetainImageContentImpact(): void } $usedMemory = memory_get_usage(true); - 
$this->assertTrue($usedMemory > ($baselineMemory * 1.5), 'Memory is only '.$usedMemory); + $this->assertGreaterThan($baselineMemory + 180000000, $usedMemory, 'Memory is only '.$usedMemory); $this->assertTrue(null != $document && '' !== $document->getText()); // force garbage collection @@ -400,7 +400,7 @@ public function testRetainImageContentImpact(): void * note: the following memory value is set manually and may differ from system to system. * it must be high enough to not produce a false negative though. */ - $this->assertTrue($usedMemory < ($baselineMemory * 1.05), 'Memory is '.$usedMemory); + $this->assertLessThan($baselineMemory * 1.05, $usedMemory, 'Memory is '.$usedMemory); $this->assertTrue('' !== $document->getText()); } }