smalot · k00ni · Jul 4, 2023 · Jun 22, 2023 · Jun 22, 2023 · Jun 23, 2023
diff --git a/doc/Usage.md b/doc/Usage.md
@@ -140,9 +140,49 @@ Array
     [Producer] => Adobe Acrobat
     [CreatedOn] => 2022-01-28T16:36:11+00:00
     [Pages] => 35
+    ...
 )
 ```
 
+If the PDF contains Extensible Metadata Platform (XMP) XML metadata, its values will be appended to the data returned by `getDetails()`, keyed by their lowercased, namespace-prefixed XMP names (e.g. `dc:title`). You can read more about which values and namespaces are commonly used in the [XMP Specifications](https://github.com/adobe/XMP-Toolkit-SDK/tree/main/docs).
+
+```php
+Array
+(
+    ...
+    [Pages] => 35
+    [dc:creator] => My Name
+    [pdf:producer] => Adobe Acrobat
+    [dc:title] => My Document Title
+    ...
+)
+```
+
+Some XMP properties hold multiple values, or even named children with their own values. In these cases, the value will be an array. The returned data follows the structure of the XML, so multiple levels of nested values are possible.
+
+```php
+Array
+(
+    ...
+    [dc:title] => My Document Title
+    [xmptpg:maxpagesize] => Array
+    (
+        [stdim:w] => 21.500000
+        [stdim:h] => 6.222222
+        [stdim:unit] => Inches
+    )
+    [xmptpg:platenames] => Array
+    (
+        [0] => Cyan
+        [1] => Magenta
+        [2] => Yellow
+        [3] => Black
+    )
+    ...
+)
+```
+
+
 ## Read Base64 encoded PDFs
 
 If working with [Base64](https://en.wikipedia.org/wiki/Base64) encoded PDFs, you might want to parse the PDF without saving the file to disk.

diff --git a/samples/XMP_Metadata.pdf b/samples/XMP_Metadata.pdf
diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php
@@ -61,6 +61,11 @@ class Document
      */
     protected $trailer;
 
+    /**
+     * XMP properties collected by extractXMPMetadata(); buildDetails() merges
+     * them into the document details.
+     *
+     * @var array<mixed>
+     */
+    protected $metadata = [];
+
     /**
      * @var array
      */
@@ -144,9 +149,107 @@ protected function buildDetails()
             $details['Pages'] = 0;
         }
 
+        $details = array_merge($details, $this->metadata);
+
         $this->details = $details;
     }
 
+    /**
+     * Parse an XMP packet and merge its properties into $this->metadata.
+     *
+     * The XML is flattened by xml_parse_into_struct() and rebuilt as a nested
+     * array keyed by lowercased tag name (e.g. "dc:title"); rdf:li items
+     * become numerically indexed entries. Tags starting with "x:" or "rdf:"
+     * (other than rdf:li) are skipped. Nothing is stored unless parsing
+     * reports success and the packet's dc:format equals "application/pdf".
+     *
+     * NOTE(review): only the tag, type and value of each entry are read;
+     * $val['attributes'] is never consulted, so properties written as XML
+     * attributes are not captured.
+     *
+     * @param string $content XML of the XMP packet to parse
+     */
+    public function extractXMPMetadata(string $content): void
+    {
+        $xml = xml_parser_create();
+        // Skip values that consist only of whitespace
+        xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);
+
+        // $values receives the flat list of entries; $index is filled by the
+        // call but not used afterwards
+        if (1 === xml_parse_into_struct($xml, $content, $values, $index)) {
+            /*
+             * Short overview of the following code:
+             *
+             * The output of xml_parse_into_struct() is a one-dimensional array (= $values). While iterating
+             * over it, $metadata is a reference to the array currently being filled and $stack is a last-in,
+             * first-out list of references to its ancestors, which together turn the flat list into a more
+             * intuitive multi-dimensional array. When an "open" XML tag is encountered, we save the current
+             * $metadata context in the $stack, then create a child array of $metadata and make that the
+             * current $metadata context. When a "close" XML tag is encountered, the operations are
+             * reversed: the most recently added $metadata context from $stack (i.e. the parent of the
+             * current element) is set as the current $metadata context.
+             */
+            $metadata = [];
+            $stack = [];
+            foreach ($values as $val) {
+                // Standardize tag names to lowercase so that the resulting
+                // array keys are predictable
+                $val['tag'] = strtolower($val['tag']);
+
+                // Ignore structural x: and rdf: XML elements; rdf:li is kept
+                // because it carries the list items. The same test is applied
+                // to every entry type, so a skipped "open" never leaves an
+                // unmatched "close" behind.
+                if (0 === strpos($val['tag'], 'x:')) {
+                    continue;
+                } elseif (0 === strpos($val['tag'], 'rdf:') && 'rdf:li' != $val['tag']) {
+                    continue;
+                }
+
+                switch ($val['type']) {
+                    case 'open':
+                        // A list item gets the next numeric slot
+                        if ('rdf:li' == $val['tag']) {
+                            $metadata[] = [];
+
+                            // Push the current context. Writing at index
+                            // count($stack) keeps the top of the stack at
+                            // count($stack) - 1, which "close" relies on.
+                            $stack[\count($stack)] = &$metadata;
+                            // Rebind $metadata to the child just appended so
+                            // that later writes land inside it
+                            $metadata = &$metadata[\count($metadata) - 1];
+                        } else {
+                            // Any other element becomes an array of named
+                            // values under its tag name
+                            $metadata[$val['tag']] = [];
+
+                            // Push the current context, then descend into the
+                            // new child (same mechanics as the branch above)
+                            $stack[\count($stack)] = &$metadata;
+                            $metadata = &$metadata[$val['tag']];
+                        }
+                        break;
+
+                    case 'complete':
+                        // Entries without a value are dropped entirely
+                        if (isset($val['value'])) {
+                            // Append the value as a list item
+                            if ('rdf:li' == $val['tag']) {
+                                $metadata[] = $val['value'];
+                            } else {
+                                // Else assign the value to this property
+                                $metadata[$val['tag']] = $val['value'];
+                            }
+                        }
+                        break;
+
+                    case 'close':
+                        // If the value of this property is a single-
+                        // element array where the element is of type
+                        // string, use the value of the first list item
+                        // as the value for this property. The assignment
+                        // goes through the reference, so it replaces the
+                        // array stored in the parent. Any other array,
+                        // including an empty one, is left as it is.
+                        if (\is_array($metadata) && isset($metadata[0]) && 1 == \count($metadata) && \is_string($metadata[0])) {
+                            $metadata = $metadata[0];
+                        }
+
+                        // Pop: make the most recently saved parent the
+                        // current context again and remove it from the stack
+                        $metadata = &$stack[\count($stack) - 1];
+                        unset($stack[\count($stack) - 1]);
+                        break;
+                }
+            }
+
+            // Only use this metadata if it's referring to a PDF; a packet
+            // without a dc:format entry is discarded as well
+            if (isset($metadata['dc:format']) && 'application/pdf' == $metadata['dc:format']) {
+                // According to the XMP specifications: 'Conflict resolution
+                // for separate packets that describe the same resource is
+                // beyond the scope of this document.' - Section 6.1
+                // Source: https://www.adobe.com/devnet/xmp.html
+                // Source: https://github.com/adobe/XMP-Toolkit-SDK/blob/main/docs/XMPSpecificationPart1.pdf
+                // So if there are multiple XMP blocks, just merge the values
+                // of each found block over top of the existing values
+                $this->metadata = array_merge($this->metadata, $metadata);
+            }
+        }
+        // Release the parser whether or not parsing succeeded
+        xml_parser_free($xml);
+    }
+
     public function getDictionary(): array
     {
         return $this->dictionary;

diff --git a/src/Smalot/PdfParser/Encoding.php b/src/Smalot/PdfParser/Encoding.php
@@ -32,7 +32,6 @@
 
 namespace Smalot\PdfParser;
 
-use Exception;
 use Smalot\PdfParser\Element\ElementNumeric;
 use Smalot\PdfParser\Encoding\EncodingLocator;
 use Smalot\PdfParser\Encoding\PostScriptGlyphs;

diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php
@@ -214,6 +214,9 @@ protected function parseObject(string $id, array $structure, ?Document $document
                         // It is not necessary to store this content.
 
                         return;
+                    } elseif ($header->get('Type')->equals('Metadata')) {
+                        // Attempt to parse XMP XML Metadata
+                        $document->extractXMPMetadata($content);
                     }
                     break;
 

diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php
@@ -255,4 +255,20 @@ public function testGetTextWithPageLimit(): void
         // given text is on page 2, it has to be ignored because of that
         self::assertStringNotContainsString('Medeni Usul ve İcra İflas Hukuku', $document->getText(1));
     }
+
+    /**
+     * Checks that values from a PDF's XMP packet show up in getDetails().
+     *
+     * @see https://github.com/smalot/pdfparser/pull/606
+     */
+    public function testExtractXMPMetadata(): void
+    {
+        $samplePath = $this->rootDir.'/samples/XMP_Metadata.pdf';
+        $parser = new Parser();
+        $details = $parser->parseFile($samplePath)->getDetails();
+
+        // dc:title is one of the namespaced keys contributed by the XMP
+        // packet of the sample file.
+        $expectedTitle = "Enhance PdfParser\u{2019}s Metadata Capabilities";
+        self::assertStringContainsString($expectedTitle, $details['dc:title']);
+    }
 }
diff --git a/tests/PHPUnit/Integration/EncodingTest.php b/tests/PHPUnit/Integration/EncodingTest.php
@@ -35,7 +35,6 @@
 
 namespace PHPUnitTests\Integration;
 
-use Exception;
 use PHPUnitTests\TestCase;
 use Smalot\PdfParser\Document;
 use Smalot\PdfParser\Element;