Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EZP-29289: Migrating ezxmltext with invalid name or id attributes #47

Merged
merged 2 commits into from
Jun 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions bundle/Command/ConvertXmlTextToRichTextCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,13 @@ protected function configure()
'disable-duplicate-id-check',
null,
InputOption::VALUE_NONE,
'Disable the check for duplicate html ids in every attribute. This might increase execution time on large databases'
'Disable the check for duplicate html ids in every attribute. This might decrease execution time on large databases'
)
->addOption(
'disable-id-value-check',
null,
InputOption::VALUE_NONE,
'Disable the check for non-validating id/name values. This might decrease execution time on large databases'
)
->addOption(
'test-content-object',
Expand Down Expand Up @@ -123,7 +129,7 @@ protected function execute(InputInterface $input, OutputInterface $output)
$dryRun = true;
}

$this->convertFields($dryRun, $testContentId, !$input->getOption('disable-duplicate-id-check'), $output);
$this->convertFields($dryRun, $testContentId, !$input->getOption('disable-duplicate-id-check'), !$input->getOption('disable-id-value-check'), $output);
}

protected function getContentTypeIds($contentTypeIdentifiers)
Expand Down Expand Up @@ -330,7 +336,7 @@ protected function updateFieldRow($dryRun, $id, $version, $datatext)
}
}

protected function convertFields($dryRun, $contentId, $checkDuplicateIds, OutputInterface $output)
protected function convertFields($dryRun, $contentId, $checkDuplicateIds, $checkIdValues, OutputInterface $output)
{
$count = $this->getRowCountOfContentObjectAttributes('ezxmltext', $contentId);

Expand All @@ -345,7 +351,7 @@ protected function convertFields($dryRun, $contentId, $checkDuplicateIds, Output
$inputValue = $row['data_text'];
}

$converted = $this->converter->convert($this->createDocument($inputValue), $checkDuplicateIds, $row['id']);
$converted = $this->converter->convert($this->createDocument($inputValue), $checkDuplicateIds, $checkIdValues, $row['id']);

$this->updateFieldRow($dryRun, $row['id'], $row['version'], $converted);

Expand Down
49 changes: 44 additions & 5 deletions lib/FieldType/XmlText/Converter/RichText.php
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,44 @@ protected function reportNonUniqueIds(DOMDocument $document, $contentFieldId)
$id = $node->attributes->getNamedItem('id')->nodeValue;
// id has format "duplicated_id_foo_bar_idm45226413447104" where "foo_bar" is the duplicated id
$duplicatedId = substr($id, strlen('duplicated_id_'), strrpos($id, '_') - strlen('duplicated_id_'));
if ($this->logger !== null) {
$this->logger->warning("Duplicated id in original ezxmltext for contentobject_attribute.id=$contentFieldId, automatically generated new id : $duplicatedId --> $id");
}
$this->logger->warning("Duplicated id in original ezxmltext for contentobject_attribute.id=$contentFieldId, automatically generated new id : $duplicatedId --> $id");
}
}

protected function validateAttributeValues(DOMDocument $document, $contentFieldId)
{
$xpath = new DOMXPath($document);
$whitelist1st = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_';
$replaceStr1st = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa';

$whitelist = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-';
$replaceStr = '';
/*
* We want to pick elements which has id value
* #1 not starting with a..z or '_'
* #2 not a..z, '0..9', '_' or '-' after 1st character
* So, no xpath v2 to our disposal...
* 1st line : we check the 1st char(substring) in id, converts it to 'a' if it in whitelist(translate), then check if it string now starts with 'a'(starts-with), then we invert result(not)
Copy link
Member

@kmadejski kmadejski Jun 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great to see this comment! 🙂

* : So we replace first char with 'a' if it is whitelisted, then we select the element if id value does not start with 'a'
* 2nd line: now we check remaining(omit 1st char) part of string (substring), removes any character that *is* whitelisted(translate), then check if there are any non-whitelisted characters left(string-lenght)
* 3rd line: Due to the not() in 1st line, we pick all elements not matching that 1st line. That also includes elements not having a xml:id at all..
* : So, we want to make sure we only pick elements which has a xml:id attribute.
*/
$nodes = $xpath->query("//*[
(
not(starts-with(translate(substring(@xml:id, 1, 1), '$whitelist1st', '$replaceStr1st'), 'a'))
or string-length(translate(substring(@xml:id, 2), '$whitelist', '$replaceStr')) > 0
) and string-length(@xml:id) > 0]");

if ($contentFieldId === null) {
$contentFieldId = '[unknown]';
}
foreach ($nodes as $node) {
$orgValue = $node->attributes->getNamedItem('id')->nodeValue;
$newValue = 'rewrite_' . $node->attributes->getNamedItem('id')->nodeValue;
$newValue = preg_replace("/[^$whitelist]/", '_', $newValue);
$node->attributes->getNamedItem('id')->nodeValue = $newValue;
$this->logger->warning("Replaced non-validating id value in richtext for contentobject_attribute.id=$contentFieldId, changed from : $orgValue --> $newValue");
}
}

Expand Down Expand Up @@ -317,10 +352,11 @@ protected function checkEmptyEmbedTags(DOMDocument $inputDocument)
*
* @param DOMDocument $inputDocument
* @param bool $checkDuplicateIds
* @param bool $checkIdValues
* @param null|int $contentFieldId
* @return string
*/
public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $contentFieldId = null)
public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false, $checkIdValues = false, $contentFieldId = null)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: Missing checkIdValues parameter in docblock

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch!. Fixed

{
$this->removeComments($inputDocument);

Expand All @@ -329,6 +365,9 @@ public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false,
if ($checkDuplicateIds) {
$this->reportNonUniqueIds($convertedDocument, $contentFieldId);
}
if ($checkIdValues) {
$this->validateAttributeValues($convertedDocument, $contentFieldId);
}

// Needed by some disabled output escaping (eg. legacy ezxml paragraph <line/> elements)
$convertedDocumentNormalized = new DOMDocument();
Expand All @@ -339,7 +378,7 @@ public function convert(DOMDocument $inputDocument, $checkDuplicateIds = false,

$result = $convertedDocumentNormalized->saveXML();

if (!empty($errors) && $this->logger !== null) {
if (!empty($errors)) {
$this->logger->error(
"Validation errors when converting ezxmltext for contentobject_attribute.id=$contentFieldId",
['result' => $result, 'errors' => $errors, 'xmlString' => $inputDocument->saveXML()]
Expand Down
50 changes: 46 additions & 4 deletions tests/lib/FieldType/Converter/RichTextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
use eZ\Publish\API\Repository\LocationService;
use eZ\Publish\API\Repository\Values\Content\ContentInfo;
use eZ\Publish\API\Repository\Values\Content\Location;
use Psr\Log\NullLogger;

class RichTextTest extends TestCase
{
Expand Down Expand Up @@ -54,8 +55,12 @@ public function providerForTestConvert()
foreach (glob(__DIR__ . '/_fixtures/richtext/input/*.xml') as $inputFilePath) {
$basename = basename($inputFilePath, '.xml');
$outputFilePath = __DIR__ . "/_fixtures/richtext/output/{$basename}.xml";
$logFilePath = __DIR__ . "/_fixtures/richtext/log/{$basename}.log";
if (!file_exists($logFilePath)) {
$logFilePath = null;
}

$map[] = [$inputFilePath, $outputFilePath];
$map[] = [$inputFilePath, $outputFilePath, $logFilePath];
}

return $map;
Expand Down Expand Up @@ -123,21 +128,58 @@ private function createApiRepositoryStub()
return $apiRepositoryStub;
}

private function createLoggerStub($logFilePath)
{
$loggerStub = $this->createMock(NullLogger::class);

if ($logFilePath !== null) {
$log = file_get_contents($logFilePath);
$logLines = explode("\n", $log);
$logNo = 0;
foreach ($logLines as $expectedLogLine) {
if ($expectedLogLine === '') {
continue;
}
if (strpos($expectedLogLine, '*') !== false) {
$loggerStub->expects($this->at($logNo++))
->method('warning')
->with($this->callback(function ($logMessage) use ($expectedLogLine) {
$expectedLogMessage = substr($expectedLogLine, 0, strpos($expectedLogLine, '*'));

$this->assertEquals($expectedLogMessage, substr($logMessage, 0, strlen($expectedLogMessage)), 'Actual log message do not match the expected one');

return true;
}));
} else {
$loggerStub->expects($this->at($logNo++))
->method('warning')
->with($expectedLogLine);
}
}
} else {
$loggerStub->expects($this->never())
->method('warning');
}

return $loggerStub;
}

/**
* @param string $inputFilePath
* @param string $outputFilePath
*
* @dataProvider providerForTestConvert
*/
public function testConvert($inputFilePath, $outputFilePath)
public function testConvert($inputFilePath, $outputFilePath, $logFilePath)
{
$apiRepositoryStub = $this->createApiRepositoryStub();
$loggerStub = $this->createLoggerStub($logFilePath);

$inputDocument = $this->createDocument($inputFilePath);
$richText = new RichText($apiRepositoryStub);
$richText = new RichText($apiRepositoryStub, $loggerStub);
$richText->setImageContentTypes([27]);

$result = $richText->convert($inputDocument, true);
$result = $richText->convert($inputDocument, true, true);

$convertedDocument = $this->createDocument($result, false);
$expectedDocument = $this->createDocument($outputFilePath);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<section
xmlns:image="http://ez.no/namespaces/ezpublish3/image/"
xmlns:xhtml="http://ez.no/namespaces/ezpublish3/xhtml/"
xmlns:custom="http://ez.no/namespaces/ezpublish3/custom/">
<paragraph align="justify">Here is an anchor
<anchor name="1name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="n1ame"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="-1name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="_name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="aname"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="#aname"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="a@name"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="an£ame"/>
</paragraph>
<paragraph align="justify">Here is an anchor
<anchor name="aname["/>
</paragraph>
</section>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Duplicated id in original ezxmltext for contentobject_attribute.id=[unknown], automatically generated new id : inv5 --> duplicated_id_inv5_*
Duplicated id in original ezxmltext for contentobject_attribute.id=[unknown], automatically generated new id : inv5 --> duplicated_id_inv5_*
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Duplicated id in original ezxmltext for contentobject_attribute.id=[unknown], automatically generated new id : myembed_id --> duplicated_id_myembed_id_idm*
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Duplicated id in original ezxmltext for contentobject_attribute.id=[unknown], automatically generated new id : anchor --> duplicated_id_anchor_idm*
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Replaced non-validating id value in richtext for contentobject_attribute.id=[unknown], changed from : 1name --> rewrite_1name
Replaced non-validating id value in richtext for contentobject_attribute.id=[unknown], changed from : -1name --> rewrite_-1name
Replaced non-validating id value in richtext for contentobject_attribute.id=[unknown], changed from : #aname --> rewrite__aname
Replaced non-validating id value in richtext for contentobject_attribute.id=[unknown], changed from : a@name --> rewrite_a_name
Replaced non-validating id value in richtext for contentobject_attribute.id=[unknown], changed from : an£ame --> rewrite_an__ame
Replaced non-validating id value in richtext for contentobject_attribute.id=[unknown], changed from : aname[ --> rewrite_aname_
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Warning: ezxmltext for contentobject_attribute.id=contains embed or embed-inline tag(s) without node_id or object_id
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Warning: ezxmltext for contentobject_attribute.id=contains embed or embed-inline tag(s) without node_id or object_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<section
xmlns="http://docbook.org/ns/docbook"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:ezxhtml="http://ez.no/xmlns/ezpublish/docbook/xhtml"
xmlns:ezcustom="http://ez.no/xmlns/ezpublish/docbook/custom" version="5.0-variant ezpublish-1.0">
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_1name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="n1ame"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_-1name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="_name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="aname"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite__aname"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_a_name"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_an__ame"/>
</para>
<para ezxhtml:textalign="justify">Here is an anchor
<anchor xml:id="rewrite_aname_"/>
</para>
</section>