smalot · k00ni · Jul 4, 2023 · Jun 22, 2023 · Jun 22, 2023 · Jun 23, 2023
diff --git a/doc/Usage.md b/doc/Usage.md
@@ -132,17 +132,51 @@ $width = $font->calculateTextWidth($text, $missing);
 
 You can also extract metadata. The available data varies from PDF to PDF.
 
+Metadata taken from an Extensible Metadata Platform (XMP) XML block will have its keys lowercased and prefixed with the XML namespace of the originating tag (for example, `dc:title` or `pdf:producer`). You can read more about what values are available in the [XMP Specifications](https://github.com/adobe/XMP-Toolkit-SDK/tree/main/docs).
+
 ```php
 $metaData = $pdf->getDetails();
 
 Array
 (
+    [Creator] => Adobe Acrobat
     [Producer] => Adobe Acrobat
     [CreatedOn] => 2022-01-28T16:36:11+00:00
     [Pages] => 35
+    [dc:creator] => My Name
+    [pdf:producer] => Adobe Acrobat
+    [dc:title] => My Document Title
+    ...
 )
 ```
 
+Some XMP metadata properties may have multiple values, or even named children with their own values. In these cases, the value will be an array. The XMP metadata follows the structure of the XML, so multiple levels of nested values are possible.
+
+```php
+$metaData = $pdf->getDetails();
+
+Array
+(
+    ...
+    [dc:title] => My Document Title
+    [xmptpg:maxpagesize] => Array
+    (
+        [stdim:w] => 21.500000
+        [stdim:h] => 6.222222
+        [stdim:unit] => Inches
+    )
+    [xmptpg:platenames] => Array
+    (
+        [0] => Cyan
+        [1] => Magenta
+        [2] => Yellow
+        [3] => Black
+    )
+    ...
+)
+```
+
+
 ## Read Base64 encoded PDFs
 
 If working with [Base64](https://en.wikipedia.org/wiki/Base64) encoded PDFs, you might want to parse the PDF without saving the file to disk.

diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php
@@ -163,100 +163,77 @@ public function extractXMPMetadata(string $content): void
         xml_parser_set_option($xml, \XML_OPTION_SKIP_WHITE, 1);
 
         if (xml_parse_into_struct($xml, $content, $values, $index)) {
-            $detail = '';
 
+            $metadata = [];
+            $stack = [];
             foreach ($values as $val) {
-                switch ($val['tag']) {
-                    case 'DC:CREATOR':
-                        $detail = ('open' == $val['type']) ? 'Author' : '';
-                        break;
-
-                    case 'DC:DESCRIPTION':
-                        $detail = ('open' == $val['type']) ? 'Description' : '';
-                        break;
-
-                    case 'DC:TITLE':
-                        $detail = ('open' == $val['type']) ? 'Title' : '';
-                        break;
-
-                    case 'DC:SUBJECT':
-                        $detail = ('open' == $val['type']) ? 'Subject' : '';
-                        break;
-
-                    case 'RDF:LI':
-                        if ('' !== $detail && 'complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata[$detail] = $val['value'];
-                        }
-                        break;
 
-                    case 'DC:FORMAT':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Format'] = $val['value'];
+                // Standardize to lowercase
+                $val['tag'] = strtolower($val['tag']);
+
+                // Ignore structural x: and rdf: XML elements
+                if (strpos($val['tag'], 'x:') === 0) continue;
+                if (strpos($val['tag'], 'rdf:') === 0 && 'rdf:li' != $val['tag']) continue;
+
+                switch ($val['type']) {
+                    case 'open':
+                        // Create an array of list items
+                        if ('rdf:li' == $val['tag']) {
+                            $metadata[] = [];
+
+                            // Move up one level in the stack
+                            $stack[count($stack)] = &$metadata;
+                            $metadata = &$metadata[count($metadata) - 1];
+
+                        // Else create an array of named values
+                        } else {
+                            $metadata[$val['tag']] = [];
+
+                            // Move up one level in the stack
+                            $stack[count($stack)] = &$metadata;
+                            $metadata = &$metadata[$val['tag']];
                         }
                         break;
 
-                    case 'PDF:KEYWORDS':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Keywords'] = $val['value'];
-                        }
-                        break;
+                    case 'complete':
+                        if (isset($val['value'])) {
 
-                    case 'PDF:PRODUCER':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Producer'] = $val['value'];
-                        }
-                        break;
+                            // Assign a value to this list item
+                            if ('rdf:li' == $val['tag']) {
+                                $metadata[] = $val['value'];
 
-                    case 'PDFX:SOURCEMODIFIED':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['SourceModified'] = $val['value'];
+                            // Else assign a value to this property
+                            } else {
+                                $metadata[$val['tag']] = $val['value'];
+                            }
                         }
                         break;
 
-                    case 'PDFX:COMPANY':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Company'] = $val['value'];
+                    case 'close':
+                        // If the value of this item is a single element
+                        // array of just one list item, use the value of
+                        // the first list item as the value for this
+                        // property
+                        if (is_array($metadata) && isset($metadata[0]) && count($metadata) == 1) {
+                            $metadata = $metadata[0];
                         }
-                        break;
 
-                    case 'XMP:CREATEDATE':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['CreationDate'] = $val['value'];
-                        }
+                        // Move down one level in the stack
+                        $metadata = &$stack[count($stack) - 1];
+                        unset($stack[count($stack) - 1]);
                         break;
 
-                    case 'XMP:CREATORTOOL':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['Creator'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMP:MODIFYDATE':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['ModDate'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMP:METADATADATE':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['MetadataDate'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMPMM:DOCUMENTID':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['DocumentUUID'] = $val['value'];
-                        }
-                        break;
-
-                    case 'XMPMM:INSTANCEID':
-                        if ('complete' == $val['type'] && isset($val['value'])) {
-                            $this->metadata['InstanceUUID'] = $val['value'];
-                        }
-                        break;
                 }
             }
+
+            // According to the XMP specifications: 'Conflict resolution
+            // for separate packets that describe the same resource is
+            // beyond the scope of this document.' - Section 6.1
+            // So if there are multiple XMP blocks, just merge the values
+            // of each found block over top of the existing values
+            $this->metadata = array_merge($this->metadata, $metadata);
         }
+        xml_parser_free($xml);
     }
 
     public function getDictionary(): array

diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php
@@ -265,34 +265,10 @@ public function testExtractXMPMetadata(): void
     {
         $document = (new Parser())->parseFile($this->rootDir.'/samples/XMP_Metadata.pdf');
 
-        // Get the original parsed details from getHeader().
-        $ref = new \ReflectionClass('\Smalot\PdfParser\Document');
-        $prop = $ref->getProperty('trailer');
-        $prop->setAccessible(true);
-        $trailer = $prop->getValue($document);
-        $details = [];
-
-        if ($trailer->has('Info')) {
-            $info = $trailer->get('Info');
-            if (null !== $info && method_exists($info, 'getHeader')) {
-                $details = $info->getHeader()->getDetails();
-            }
-        }
-
-        // Check that the Title does not contain a Right Single
-        // Quotation Mark, a high UTF-8 glyph that cannot be encoded in
-        // ISO-8859-1 and is replaced by an uninterpretable UTF-8
-        // control character.
-        self::assertStringNotContainsString("\u{2019}", $details['Title']);
-
-        $detailsXMP = $document->getDetails();
-
-        // Test that the correct Right Single Quotation Mark glyph was
-        // extracted from the XMP Metadata.
-        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $detailsXMP['Title']);
-
-        // Test that getDetails() data NOT contained in the XMP Metadata
-        // is still accessible and not discarded/overwritten.
-        self::assertEquals(1, $detailsXMP['Pages']);
+        $details = $document->getDetails();
+
+        // Test that the dc:title data was extracted from the XMP
+        // Metadata.
+        self::assertStringContainsString("Enhance PdfParser\u{2019}s Metadata Capabilities", $details['dc:title']);
     }
 }