diff --git a/samples/Document-Word-Landscape-printedaspdf.pdf b/samples/Document-Word-Landscape-printedaspdf.pdf
new file mode 100644
index 00000000..a1dd549a
Binary files /dev/null and b/samples/Document-Word-Landscape-printedaspdf.pdf differ
diff --git a/src/Smalot/PdfParser/Page.php b/src/Smalot/PdfParser/Page.php
index 0dafbfcc..1bd29e1e 100644
--- a/src/Smalot/PdfParser/Page.php
+++ b/src/Smalot/PdfParser/Page.php
@@ -525,7 +525,13 @@ public function getDataCommands(?array $extractedDecodedRawData = null): array
                 case 'BT':
                     $extractedData[] = $command;
                     break;
-
+                /*
+                 * cm
+                 * Concatenation matrix; kept so getDataTm() can apply it to every following Tm
+                 */
+                case 'cm':
+                    $extractedData[] = $command;
+                    break;
                 /*
                  * ET
                  * End a text object, discarding the text matrix
@@ -640,6 +646,18 @@ public function getDataCommands(?array $extractedDecodedRawData = null): array
                 case 'TJ':
                     $extractedData[] = $command;
                     break;
+                /*
+                 * q
+                 * Save current graphics state to stack (falls through to the shared handler below)
+                 */
+                case 'q':
+                /*
+                 * Q
+                 * Load last saved graphics state from stack
+                 */
+                case 'Q':
+                    $extractedData[] = $command;
+                    break;
                 default:
             }
         }
@@ -671,7 +689,8 @@ public function getDataTm(?array $dataCommands = null): array
          * At the beginning of a text object Tm is the identity matrix
          */
         $defaultTm = ['1', '0', '0', '1', '0', '0'];
-
+        $concatTm = ['1', '0', '0', '1', '0', '0'];
+        $graphicsStatesStack = [];
         /*
          * Set the text leading used by T*, ' and " operators
          */
@@ -730,6 +749,18 @@ public function getDataTm(?array $dataCommands = null): array
                     $Ty = 0;
                     break;

+                case 'cm':
+                    $newConcatTm = (array) explode(' ', $command['c']);
+                    $TempMatrix = [];
+                    // concatTm = concatTm x cm. NOTE(review): the PDF spec describes CTM' = cm x CTM - confirm operand order
+                    $TempMatrix[0] = (float) $concatTm[0] * (float) $newConcatTm[0] + (float) $concatTm[1] * (float) $newConcatTm[2];
+                    $TempMatrix[1] = (float) $concatTm[0] * (float) $newConcatTm[1] + (float) $concatTm[1] * (float) $newConcatTm[3];
+                    $TempMatrix[2] = (float) $concatTm[2] * (float) $newConcatTm[0] + (float) $concatTm[3] * (float) $newConcatTm[2];
+                    $TempMatrix[3] = (float) $concatTm[2] * (float) $newConcatTm[1] + (float) $concatTm[3] * (float) $newConcatTm[3];
+                    $TempMatrix[4] = (float) $concatTm[4] * (float) $newConcatTm[0] + (float) $concatTm[5] * (float) $newConcatTm[2] + (float) $newConcatTm[4];
+                    $TempMatrix[5] = (float) $concatTm[4] * (float) $newConcatTm[1] + (float) $concatTm[5] * (float) $newConcatTm[3] + (float) $newConcatTm[5];
+                    $concatTm = $TempMatrix;
+                    break;
                 /*
                  * ET
                  * End a text object
@@ -786,6 +817,14 @@ public function getDataTm(?array $dataCommands = null): array
                  */
                 case 'Tm':
                     $Tm = explode(' ', $command['c']);
+                    $TempMatrix = [];
+                    $TempMatrix[0] = (float) $Tm[0] * (float) $concatTm[0] + (float) $Tm[1] * (float) $concatTm[2];
+                    $TempMatrix[1] = (float) $Tm[0] * (float) $concatTm[1] + (float) $Tm[1] * (float) $concatTm[3];
+                    $TempMatrix[2] = (float) $Tm[2] * (float) $concatTm[0] + (float) $Tm[3] * (float) $concatTm[2];
+                    $TempMatrix[3] = (float) $Tm[2] * (float) $concatTm[1] + (float) $Tm[3] * (float) $concatTm[3];
+                    $TempMatrix[4] = (float) $Tm[4] * (float) $concatTm[0] + (float) $Tm[5] * (float) $concatTm[2] + (float) $concatTm[4];
+                    $TempMatrix[5] = (float) $Tm[4] * (float) $concatTm[1] + (float) $Tm[5] * (float) $concatTm[3] + (float) $concatTm[5];
+                    $Tm = $TempMatrix;
                     $Tx = (float) $Tm[$x];
                     $Ty = (float) $Tm[$y];
                     break;
@@ -880,6 +919,20 @@ public function getDataTm(?array $dataCommands = null): array
                     }
                     $extractedData[] = $data;
                     break;
+                /*
+                 * q
+                 * Save current graphics state to stack (only the concatenation matrix is tracked)
+                 */
+                case 'q':
+                    $graphicsStatesStack[] = $concatTm;
+                    break;
+                /*
+                 * Q
+                 * Load last saved graphics state from stack; an unbalanced Q keeps the current matrix
+                 */
+                case 'Q':
+                    $concatTm = array_pop($graphicsStatesStack) ?? $concatTm;
+                    break;
                 default:
             }
         }
diff --git a/tests/PHPUnit/Integration/PageTest.php b/tests/PHPUnit/Integration/PageTest.php
index 5e40ee90..33751e59 100644
--- a/tests/PHPUnit/Integration/PageTest.php
+++ b/tests/PHPUnit/Integration/PageTest.php
@@ -256,9 +256,9 @@ public function testGetDataCommands(): void
         $pages = $document->getPages();
         $page = $pages[0];
         $dataCommands = $page->getDataCommands();
-        $this->assertCount(176, $dataCommands);
+        $this->assertCount(185, $dataCommands);

-        $tmItem = $dataCommands[2];
+        $tmItem = $dataCommands[6];
         $this->assertCount(3, $tmItem);
         $this->assertArrayHasKey('t', $tmItem);
         $this->assertArrayHasKey('o', $tmItem);
@@ -267,7 +267,7 @@ public function testGetDataCommands(): void
         $this->assertStringContainsString('Tm', $tmItem['o']);
         $this->assertStringContainsString('0.999429 0 0 1 201.96 720.68', $tmItem['c']);

-        $tjItem = $dataCommands[3];
+        $tjItem = $dataCommands[7];
         $this->assertCount(3, $tjItem);
         $this->assertArrayHasKey('t', $tjItem);
         $this->assertArrayHasKey('o', $tjItem);
@@ -307,7 +307,14 @@ public function testGetDataTm(): void
                 '201.96',
                 '720.68',
             ],
-            $item[0]
+            [
+                round($item[0][0], 6),
+                round($item[0][1], 6),
+                round($item[0][2], 6),
+                round($item[0][3], 6),
+                round($item[0][4], 2),
+                round($item[0][5], 2),
+            ]
         );
         $this->assertStringContainsString('Document title', $item[1]);

@@ -321,7 +328,14 @@ public function testGetDataTm(): void
                 '70.8',
                 '673.64',
             ],
-            $item[0]
+            [
+                round($item[0][0], 6),
+                round($item[0][1], 6),
+                round($item[0][2], 6),
+                round($item[0][3], 6),
+                round($item[0][4], 2),
+                round($item[0][5], 2),
+            ]
         );
         $this->assertStringContainsString('Calibri : Lorem ipsum dolor sit amet, consectetur a', $item[1]);

@@ -332,10 +346,17 @@ public function testGetDataTm(): void
                 '0',
                 '0',
                 '1',
-                '342.840222606',
+                '342.84',
                 '81.44',
             ],
-            $item[0]
+            [
+                round($item[0][0], 6),
+                round($item[0][1], 6),
+                round($item[0][2], 6),
+                round($item[0][3], 6),
+                round($item[0][4], 2),
+                round($item[0][5], 2),
+            ]
         );
         $this->assertStringContainsString('nenatis.', $item[1]);

@@ -626,7 +647,7 @@ public function testGetTextXY(): void
         $document = $parser->parseFile($filename);
         $pages = $document->getPages();
         $page = $pages[0];
-        $result = $page->getTextXY(201.96, 720.68);
+        $result = $page->getTextXY(201.96, 720.68, 0.01, 0.01);
         $this->assertCount(1, $result);
         $this->assertCount(2, $result[0]);
         $this->assertEquals(
@@ -638,7 +659,14 @@ public function testGetTextXY(): void
                 '201.96',
                 '720.68',
             ],
-            $result[0][0]
+            [
+                round($result[0][0][0], 6),
+                round($result[0][0][1], 6),
+                round($result[0][0][2], 6),
+                round($result[0][0][3], 6),
+                round($result[0][0][4], 2),
+                round($result[0][0][5], 2),
+            ]
         );
         $this->assertStringContainsString('Document title', $result[0][1]);

@@ -657,7 +685,14 @@ public function testGetTextXY(): void
                 '201.96',
                 '720.68',
             ],
-            $result[0][0]
+            [
+                round($result[0][0][0], 6),
+                round($result[0][0][1], 6),
+                round($result[0][0][2], 6),
+                round($result[0][0][3], 6),
+                round($result[0][0][4], 2),
+                round($result[0][0][5], 2),
+            ]
         );
         $this->assertStringContainsString('Document title', $result[0][1]);

@@ -827,10 +862,10 @@ public function testIssue454(): void
         $this->assertEquals(2, \count($dataTm[0]));
         $this->assertIsArray($dataTm[0][0]);
         $this->assertEquals(6, \count($dataTm[0][0]));
-        $this->assertEquals(201.96, $dataTm[0][0][4]);
-        $this->assertEquals(720.68, $dataTm[0][0][5]);
+        $this->assertEquals(201.96, round($dataTm[0][0][4], 2));
+        $this->assertEquals(720.68, round($dataTm[0][0][5], 2));
         $this->assertStringContainsString('Document title', $dataTm[0][1]);
-        $textData = $page->getTextXY(201.96, 720.68);
+        $textData = $page->getTextXY(201.96, 720.68, 0.01, 0.01);
         $this->assertStringContainsString('Document title', $textData[0][1]);
         $page = $pages[2];
         $dataTm = $page->getDataTm();
@@ -889,4 +924,38 @@ public function testIssue629WithoutDataTmFontInfo(): void
         $this->assertCount(2, $dataTm[0]);
         $this->assertFalse(isset($dataTm[0][2]));
     }
+
+    public function testCmCommandInPdfs(): void
+    {
+        $config = new Config();
+        $parser = $this->getParserInstance($config);
+        $filename = $this->rootDir.'/samples/Document-Word-Landscape-printedaspdf.pdf';
+        $document = $parser->parseFile($filename);
+        $pages = $document->getPages();
+        $page = $pages[0];
+        $dataTm = $page->getDataTm();
+        $item = $dataTm[2];
+        $this->assertCount(6, $dataTm);
+        $this->assertCount(2, $item);
+        $this->assertCount(6, $item[0]);
+        $this->assertEquals('This is just a test', trim($item[1]));
+        $this->assertEquals(
+            [
+                '0.75',
+                '0.0',
+                '0.0',
+                '0.75',
+                '59.16',
+                '500.4',
+            ],
+            [
+                round($item[0][0], 6),
+                round($item[0][1], 6),
+                round($item[0][2], 6),
+                round($item[0][3], 6),
+                round($item[0][4], 2),
+                round($item[0][5], 2),
+            ]
+        );
+    }
 }