Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for two bugs related to Unicode translation support by Font objects #698

Merged
merged 20 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Smalot/PdfParser/Font.php
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ public function loadTranslateTable(): array
// Support for multiple bfchar sections
if (preg_match_all('/beginbfchar(?P<sections>.*?)endbfchar/s', $content, $matches)) {
foreach ($matches['sections'] as $section) {
$regexp = '/<(?P<from>[0-9A-F]+)> +<(?P<to>[0-9A-F]+)>[ \r\n]+/is';
$regexp = '/<(?P<from>[0-9A-F]+)> *<(?P<to>[0-9A-F]+)>[ \r\n]+/is';

preg_match_all($regexp, $section, $matches);

Expand Down
12 changes: 12 additions & 0 deletions src/Smalot/PdfParser/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,18 @@ class Page extends PDFObject
*/
protected $dataTm;

/**
 * Hands a font list to this page, unless the page already has one.
 *
 * A font list that is already present and non-empty is left untouched;
 * only an unset or empty list gets replaced by the given fonts.
 *
 * @param array<\Smalot\PdfParser\Font> $fonts
 *
 * @internal
 */
public function setFonts($fonts)
{
    // Fonts already known for this page take precedence.
    if (!empty($this->fonts)) {
        return;
    }

    $this->fonts = $fonts;
}

/**
* @return Font[]
*/
Expand Down
60 changes: 59 additions & 1 deletion src/Smalot/PdfParser/Pages.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@
class Pages extends PDFObject
{
/**
* @todo Objects other than Pages or Page might need to be treated specifically in order to get Page objects out of them,
* @var array<\Smalot\PdfParser\Font>|null
*/
protected $fonts;

/**
* @todo Objects other than Pages or Page might need to be treated specifically
* in order to get Page objects out of them.
*
* @see https://github.com/smalot/pdfparser/issues/331
*/
Expand All @@ -57,17 +63,69 @@ public function getPages(bool $deep = false): array
return $kidsElement->getContent();
}

// Prepare to apply the Pages' object's fonts to each page
if (false === \is_array($this->fonts)) {
$this->setupFonts();
}
$fontsAvailable = 0 < \count($this->fonts);

$kids = $kidsElement->getContent();
$pages = [];

foreach ($kids as $kid) {
if ($kid instanceof self) {
$pages = array_merge($pages, $kid->getPages(true));
} elseif ($kid instanceof Page) {
if ($fontsAvailable) {
$kid->setFonts($this->fonts);
}
$pages[] = $kid;
}
}

return $pages;
}

/**
 * Gathers the fonts listed under this object's "Resources" entry and
 * stores them in $this->fonts.
 *
 * Every Font is stored under its original id. If the id is not empty after
 * removing all characters except digits, ".", "-" and "_", the font is
 * stored under that reduced id as well.
 *
 * After this method ran, $this->fonts is always an array (possibly empty).
 * getPages() relies on that: it only calls this method while $this->fonts
 * is not an array and passes $this->fonts to count() right afterwards.
 *
 * @return void
 *
 * @internal
 */
protected function setupFonts()
{
    $resources = $this->get('Resources');

    if (!method_exists($resources, 'has') || !$resources->has('Font')) {
        $this->fonts = [];

        return;
    }

    // Look the entry up once instead of once per check.
    $fontElement = $resources->get('Font');

    if ($fontElement instanceof Element\ElementMissing) {
        // No fonts available. Store an empty list instead of leaving
        // $this->fonts unset, otherwise count() in getPages() receives
        // null and the lookup is repeated on every call.
        $this->fonts = [];

        return;
    }

    if ($fontElement instanceof Header) {
        $fonts = $fontElement->getElements();
    } else {
        $fonts = $fontElement->getHeader()->getElements();
    }

    $table = [];

    foreach ($fonts as $id => $font) {
        if (!$font instanceof Font) {
            continue;
        }

        $table[$id] = $font;

        // Store too on cleaned id value (only numeric).
        // Array keys may be ints and preg_replace() may return null,
        // hence the explicit string casts before the strict comparison.
        $cleanedId = (string) preg_replace('/[^0-9\.\-_]/', '', (string) $id);
        if ('' !== $cleanedId) {
            $table[$cleanedId] = $font;
        }
    }

    $this->fonts = $table;
}
}
149 changes: 149 additions & 0 deletions tests/PHPUnit/Integration/PagesTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Konrad Abicht <k.abicht@gmail.com>
*
* @date 2024-04-19
*
* @license LGPLv3
*
* @url <https://github.com/smalot/pdfparser>
*
* PdfParser is a pdf library written in PHP, extraction oriented.
* Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program.
* If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>.
*/

namespace PHPUnitTests\Integration;

use PHPUnitTests\TestCase;
use Smalot\PdfParser\Document;
use Smalot\PdfParser\Element\ElementArray;
use Smalot\PdfParser\Font;
use Smalot\PdfParser\Header;
use Smalot\PdfParser\Page;
use Smalot\PdfParser\Pages;

/**
 * Pages subclass that lets a test inject the font list directly.
 *
 * @internal only for test purposes
 */
class PagesDummy extends Pages
{
    /**
     * Presets the protected $fonts list.
     *
     * The purpose of this function is to bypass the tedious
     * work to set up instances which lead to a valid $fonts variable.
     *
     * @param array<\Smalot\PdfParser\Font> $fonts
     */
    public function setFonts(array $fonts): void
    {
        $this->fonts = $fonts;
    }
}

class PagesTest extends TestCase
{
    /**
     * If fonts are not stored in Page instances but in the Pages instance.
     *
     *      Pages
     *        |   `--- fonts = Font[]           <=== will be used to override fonts in Page1 ...
     *        |
     *        |
     *        `--+ Page1
     *        |         `--- fonts = null       <=== Will be overwritten with the content of Pages.fonts
     *        `--+ ...
     *
     * @see https://github.com/smalot/pdfparser/pull/698
     */
    public function testPullRequest698NoFontsSet(): void
    {
        $document = $this->createMock(Document::class);

        // Create a Page mock and tell PHPUnit that its setFonts has to be called once,
        // otherwise an error is raised
        $page1 = $this->createMock(Page::class);
        $page1->expects($this->once())->method('setFonts');

        // setup header
        $header = new Header([
            'Kids' => new ElementArray([
                $page1,
            ]),
        ], $document);

        $font1 = $this->createMock(Font::class);

        // Preset fonts variable so we don't have to prepare all the
        // prerequisites manually (like creating a Resources instance
        // with Font instances, see Pages::setupFonts())
        $pages = new PagesDummy($document, $header);
        $pages->setFonts([$font1]);

        // We expect setFonts is called on $page1, therefore no assertion here
        $pages->getPages(true);
    }

    /**
     * Don't override the fonts list in a Page instance, if available.
     *
     *      Pages
     *        |   `--- fonts = Font[]           <=== Has to be ignored because fonts in Page1 is set
     *        |
     *        |
     *        `--+ Page1
     *        |         `--- fonts = Font[]     <=== must not be overwritten
     *        `--+ ...
     *
     * @see https://github.com/smalot/pdfparser/pull/698
     */
    public function testPullRequest698DontOverride(): void
    {
        $document = $this->createMock(Document::class);

        // Create a real Page instance which already has its own font list,
        // so we can check afterwards that this list was left untouched
        $font2 = new Font($document);
        $page1 = new Page($document);
        $page1->setFonts([$font2]);

        // setup header
        $header = new Header([
            'Kids' => new ElementArray([
                $page1,
            ]),
        ], $document);

        $font1 = $this->createMock(Font::class);

        // Preset the fonts of the Pages instance with a different font ($font1)
        $pages = new PagesDummy($document, $header);
        $pages->setFonts([$font1]);

        // Let $pages try to hand its fonts over to $page1
        $pages->getPages(true);

        // Note:
        // $font1 and $font2 are intentionally not both of the same type.
        // One is a mock and the other one a real instance of Font.
        // This way we can simply check the return value of getFonts here.
        // If both were of the same type, we would have to use a different assertion approach.
        $this->assertEquals([$font2], $page1->getFonts());
    }
}
Loading