Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance by not extracting compressed image data if retainImageContent was set to false #590

Merged
merged 12 commits into from
Apr 13, 2023
54 changes: 47 additions & 7 deletions src/Smalot/PdfParser/RawData/RawDataParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -553,16 +553,18 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe
$offset += $objHeaderLen;
$objContentArr = [];
$i = 0; // object main index
$header = null;
do {
$oldOffset = $offset;
// get element
$element = $this->getRawObject($pdfData, $offset);
$element = $this->getRawObject($pdfData, $offset, null != $header ? $header[1] : null);
$offset = $element[2];
// decode stream using stream's dictionary information
if ($decoding && ('stream' === $element[0]) && (isset($objContentArr[$i - 1][0])) && ('<<' === $objContentArr[$i - 1][0])) {
$element[3] = $this->decodeStream($pdfData, $xref, $objContentArr[$i - 1][1], $element[1]);
if ($decoding && ('stream' === $element[0]) && null != $header) {
$element[3] = $this->decodeStream($pdfData, $xref, $header[1], $element[1]);
}
$objContentArr[$i] = $element;
$header = isset($element[0]) && '<<' === $element[0] ? $element : null;
++$i;
} while (('endobj' !== $element[0]) && ($offset !== $oldOffset));
// remove closing delimiter
Expand Down Expand Up @@ -605,11 +607,12 @@ protected function getObjectVal(string $pdfData, $xref, array $obj): array
/**
* Get object type, raw value and offset to next object
*
* @param int $offset Object offset
* @param int $offset Object offset
* @param array|null $headerDic obj header's dictionary, parsed by getRawObject. Used for stream parsing optimization
*
* @return array containing object type, raw value and offset to next object
*/
protected function getRawObject(string $pdfData, int $offset = 0): array
protected function getRawObject(string $pdfData, int $offset = 0, ?array $headerDic = null): array
{
$objtype = ''; // object type to be returned
$objval = ''; // object value to be returned
Expand Down Expand Up @@ -758,15 +761,20 @@ protected function getRawObject(string $pdfData, int $offset = 0): array
$offset += 6;
if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset, 4), $matches)) {
$offset += \strlen($matches[0]);

$streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0);
k00ni marked this conversation as resolved.
Show resolved Hide resolved
$skip = !$this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/');
k00ni marked this conversation as resolved.
Show resolved Hide resolved

$pregResult = preg_match(
'/(endstream)[\x09\x0a\x0c\x0d\x20]/isU',
$pdfData,
$matches,
\PREG_OFFSET_CAPTURE,
$offset
$offset + $streamLen
);

if (1 == $pregResult) {
$objval = substr($pdfData, $offset, $matches[0][1] - $offset);
$objval = $skip ? '' : substr($pdfData, $offset, $matches[0][1] - $offset);
$offset = $matches[1][1];
}
}
Expand Down Expand Up @@ -796,6 +804,38 @@ protected function getRawObject(string $pdfData, int $offset = 0): array
return [$objtype, $objval, $offset];
}

/**
 * Get value of an object header's section
 *
 * Scans the flat list of parsed header elements for a name element
 * (type '/') whose value equals $key and inspects the element that
 * immediately follows it. Only the first matching key is considered.
 *
 * @param array|null $headerDic obj header's dictionary as a list of [type, value, offset] elements, or null if there is none
 * @param string     $key       header's section name
 * @param string     $type      type of the section (i.e. 'numeric', '/', '<<', etc.)
 * @param mixed      $default   default value for header's section
 *
 * @return mixed value of obj header's section, or default value if none found, or its type doesn't match $type param
 */
private function getHeaderValue(?array $headerDic, string $key, string $type, $default = '')
{
    if (!\is_array($headerDic)) {
        return $default;
    }

    foreach ($headerDic as $i => $val) {
        // A key is a 3-element entry tagged '/' whose value is the section name.
        $isRequestedKey = \is_array($val)
            && 3 === \count($val)
            && '/' === $val[0]
            && $key === $val[1];

        // A key without a following element has no value; keep scanning.
        if (!$isRequestedKey || !isset($headerDic[$i + 1])) {
            continue;
        }

        $candidate = $headerDic[$i + 1];

        // The first matching key decides the result: its value is returned
        // only if it is well-formed and carries the requested type tag.
        if (\is_array($candidate) && 1 < \count($candidate) && $type === $candidate[0]) {
            return $candidate[1];
        }

        return $default;
    }

    return $default;
}

/**
* Get Cross-Reference (xref) table and trailer data from PDF document data.
*
Expand Down