Skip to content

Commit

Permalink
Support metadata element names containing spaces (#612)
Browse files Browse the repository at this point in the history
* Update Element.php

Add the ability for PdfParser to parse metadata names containing hexadecimal-encoded characters, such as "Document#20Type", where #20 is a space.
Resolves Issue #529

* Update ElementTest.php

Add test for spaces in metadata property names.

* Make sure we fully support hex

Too quick on the commit! Make sure our two-digit regexp also matches the hex digits A-F. Add a test for #2d, which is a hyphen.

* fixed coding style issue in Element.php

---------

Co-authored-by: Konrad Abicht <hi@inspirito.de>
  • Loading branch information
GreyWyvern and k00ni authored Jul 11, 2023
1 parent d03ef96 commit c42fc11
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
10 changes: 8 additions & 2 deletions src/Smalot/PdfParser/Element.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,16 @@ public static function parse(string $content, Document $document = null, int &$p
$old_position = $position;

if (!$only_values) {
if (!preg_match('/\G\s*(?P<name>\/[A-Z0-9\._]+)(?P<value>.*)/si', $content, $match, 0, $position)) {
if (!preg_match('/\G\s*(?P<name>\/[A-Z#0-9\._]+)(?P<value>.*)/si', $content, $match, 0, $position)) {
break;
} else {
$name = ltrim($match['name'], '/');
$name = preg_replace_callback(
'/#([0-9a-f]{2})/i',
function ($m) {
return \chr(base_convert($m[1], 16, 10));
},
ltrim($match['name'], '/')
);
$value = $match['value'];
$position = strpos($content, $value, $position + \strlen($match['name']));
}
Expand Down
7 changes: 6 additions & 1 deletion tests/PHPUnit/Integration/ElementTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ public function testParse(): void
$content = '/NameType /FlateDecode
/Contents[4 0 R 42]/Fonts<</F1 41/F2 43>>/NullType
null/StringType(hello)/DateType(D:20130901235555+02\'00\')/XRefType 2 0 R
/NumericType 8/HexaType<0020>/BooleanType false';
/NumericType 8/HexaType<0020>/BooleanType false
/Space#20Test(Templates)/Hyphen#2DTest(Templates)';
$offset = 0;

$elements = Element::parse($content, $document, $offset, false);
Expand Down Expand Up @@ -100,6 +101,10 @@ public function testParse(): void
$this->assertTrue($elements['BooleanType'] instanceof ElementBoolean);
$this->assertFalse($elements['BooleanType']->getContent());

$this->assertTrue(\array_key_exists('Space Test', $elements));

$this->assertTrue(\array_key_exists('Hyphen-Test', $elements));

// Only_values = true.
$content = '/NameType /FlateDecode';
$offset = 0;
Expand Down

0 comments on commit c42fc11

Please sign in to comment.