Skip to content

Commit

Permalink
Merge pull request #168 from Zegnat/plain-text-parsing
Browse files Browse the repository at this point in the history
New algorithm for plain text values. Merging to release as an `-alpha` release.
  • Loading branch information
aaronpk authored Mar 29, 2018
2 parents 5c056e8 + 7dbe03d commit e8da04f
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 97 deletions.
134 changes: 41 additions & 93 deletions Mf2/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -443,96 +443,44 @@ private function resolveChildUrls(DOMElement $el) {
}
}

public function textContent(DOMElement $el) {
$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');

if (isset($el->tagName) and in_array(strtolower($el->tagName), $excludeTags)) {
return '';
}

$this->resolveChildUrls($el);

$clonedEl = $el->cloneNode(true);

foreach ($this->xpath->query('.//img', $clonedEl) as $imgEl) {
$newNode = $this->doc->createTextNode($imgEl->getAttribute($imgEl->hasAttribute('alt') ? 'alt' : 'src'));
$imgEl->parentNode->replaceChild($newNode, $imgEl);
}

foreach ($excludeTags as $tagName) {
foreach ($this->xpath->query(".//{$tagName}", $clonedEl) as $elToRemove) {
$elToRemove->parentNode->removeChild($elToRemove);
}
}

return $this->innerText($clonedEl);
/**
 * Returns the plain text value of an element.
 *
 * The element is first serialised by elementToString(), and the resulting
 * string is then cleaned up in a single regular-expression pass that removes:
 *  - any run of tab, line feed, form feed, carriage return or space at the
 *    very start or very end of the string;
 *  - spaces directly before or directly after a line feed;
 *  - every space that is followed by another space, so that a run of spaces
 *    collapses to a single one.
 *
 * @see https://wiki.zegnat.net/media/textparsing.html
 *
 * @param DOMElement $element the element whose text value is wanted
 * @return string the whitespace-normalised text
 **/
public function textContent(DOMElement $element)
{
    $whitespaceCleanup = '/(^[\t\n\f\r ]+| +(?=\n)|(?<=\n) +| +(?= )|[\t\n\f\r ]+$)/';
    $serialised = $this->elementToString($element);

    return preg_replace($whitespaceCleanup, '', $serialised);
}

/**
* This method attempts to return a better 'innerText' representation than DOMNode::textContent
*
* @param DOMElement|DOMText $el
* @param bool $implied when parsing for implied name for h-*, rules may be slightly different
* @see: https://github.com/glennjones/microformat-shiv/blob/dev/lib/text.js
*/
public function innerText($el, $implied=false) {
$out = '';

$blockLevelTags = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table',
'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div',
'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup', 'hr',
'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'testarea',
'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details');

$excludeTags = array('noframe', 'noscript', 'script', 'style', 'frames', 'frameset');

// PHP DOMDocument doesn’t correctly handle whitespace around elements it doesn’t recognise.
$unsupportedTags = array('data');

if (isset($el->tagName)) {
if (in_array(strtolower($el->tagName), $excludeTags)) {
return $out;
} else if ($el->tagName == 'img') {
if ($el->hasAttribute('alt')) {
return $el->getAttribute('alt');
} else if (!$implied && $el->hasAttribute('src')) {
return $this->resolveUrl($el->getAttribute('src'));
}
} else if ($el->tagName == 'area' and $el->hasAttribute('alt')) {
return $el->getAttribute('alt');
} else if ($el->tagName == 'abbr' and $el->hasAttribute('title')) {
return $el->getAttribute('title');
}
}

// if node is a text node get its text
if (isset($el->nodeType) && $el->nodeType === 3) {
$out .= $el->textContent;
}

// get the text of the child nodes
if ($el->childNodes && $el->childNodes->length > 0) {
for ($j = 0; $j < $el->childNodes->length; $j++) {
$text = $this->innerText($el->childNodes->item($j), $implied);
if (!is_null($text)) {
$out .= $text;
}
}
}

if (isset($el->tagName)) {
// if its a block level tag add an additional space at the end
if (in_array(strtolower($el->tagName), $blockLevelTags)) {
$out .= ' ';
} elseif ($implied and in_array(strtolower($el->tagName), $unsupportedTags)) {
$out .= ' ';
} else if (strtolower($el->tagName) == 'br') {
// else if its a br, replace with newline
$out .= "\n";
}
}

return ($out === '') ? NULL : $out;
/**
 * Recursively serialises the children of an element into a raw string.
 *
 * Handling per child node:
 *  - text node: its text, with every tab, line feed and carriage return
 *    turned into a space;
 *  - SCRIPT / STYLE element: contributes nothing;
 *  - IMG element: the trimmed `alt` attribute if present, otherwise the
 *    trimmed `src` attribute passed through resolveUrl() (presumably to make
 *    it absolute; see that method), in both cases padded with one space on
 *    each side; an IMG with neither attribute contributes nothing;
 *  - BR element: a single line feed;
 *  - P element: a line feed followed by the serialisation of its children;
 *  - any other element: the serialisation of its children.
 * Nodes that are neither text nor elements are ignored.
 *
 * The result is not whitespace-normalised; textContent() does that.
 *
 * @param DOMElement $input the element whose children are serialised
 * @return string
 */
private function elementToString(DOMElement $input)
{
    $attributeWhitespace = "\t\n\f\r ";
    $pieces = array();

    foreach ($input->childNodes as $node) {
        if ($node->nodeType === XML_TEXT_NODE) {
            // Byte-for-byte mapping of "\t", "\n" and "\r" to a space.
            $pieces[] = strtr($node->textContent, "\t\n\r", '   ');
            continue;
        }

        if ($node->nodeType !== XML_ELEMENT_NODE) {
            continue;
        }

        switch (strtoupper($node->tagName)) {
            case 'SCRIPT':
            case 'STYLE':
                // Non-rendered content never becomes part of the text value.
                break;

            case 'IMG':
                if ($node->hasAttribute('alt')) {
                    $pieces[] = ' ' . trim($node->getAttribute('alt'), $attributeWhitespace) . ' ';
                } elseif ($node->hasAttribute('src')) {
                    $source = trim($node->getAttribute('src'), $attributeWhitespace);
                    $pieces[] = ' ' . $this->resolveUrl($source) . ' ';
                }
                break;

            case 'BR':
                $pieces[] = "\n";
                break;

            case 'P':
                // A paragraph starts on a new line.
                $pieces[] = "\n" . $this->elementToString($node);
                break;

            default:
                $pieces[] = $this->elementToString($node);
                break;
        }
    }

    return implode('', $pieces);
}

/**
Expand Down Expand Up @@ -648,7 +596,7 @@ public function parseP(\DOMElement $p) {
} elseif (in_array($p->tagName, array('data', 'input')) and $p->hasAttribute('value')) {
$pValue = $p->getAttribute('value');
} else {
$pValue = unicodeTrim($this->innerText($p));
$pValue = $this->textContent($p);
}

return $pValue;
Expand Down Expand Up @@ -685,7 +633,7 @@ public function parseU(\DOMElement $u) {
} elseif (in_array($u->tagName, array('data', 'input')) and $u->hasAttribute('value')) {
return $u->getAttribute('value');
} else {
return unicodeTrim($this->textContent($u));
return $this->textContent($u);
}
}

Expand Down Expand Up @@ -916,7 +864,7 @@ public function parseE(\DOMElement $e) {

$return = array(
'html' => unicodeTrim($html),
'value' => unicodeTrim($this->innerText($e)),
'value' => $this->textContent($e),
);

if($this->lang) {
Expand Down Expand Up @@ -1123,7 +1071,7 @@ public function parseH(\DOMElement $e, $is_backcompat = false, $has_nested_mf =
}
}

throw new Exception($this->innerText($e, true));
throw new Exception($this->textContent($e, true));
} catch (Exception $exc) {
$return['name'][] = unicodeTrim($exc->getMessage());
}
Expand Down
4 changes: 2 additions & 2 deletions tests/Mf2/ParseImpliedTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,8 @@ public function testParsesImpliedNameConsistentWithPName() {
$inner = "Name \nand more";
$test = '<span class="h-card"> ' . $inner .' </span><span class="h-card"><span class="p-name"> ' . $inner . ' </span></span>';
$result = Mf2\parse($test);
$this->assertEquals($inner, $result['items'][0]['properties']['name'][0]);
$this->assertEquals($inner, $result['items'][1]['properties']['name'][0]);
$this->assertEquals('Name and more', $result['items'][0]['properties']['name'][0]);
$this->assertEquals('Name and more', $result['items'][1]['properties']['name'][0]);
}


Expand Down
4 changes: 2 additions & 2 deletions tests/Mf2/ParserTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ public function testParseEResolvesRelativeLinks() {
$output = $parser->parse();

$this->assertEquals('Blah blah <a href="http://example.com/a-url">thing</a>. <object data="http://example.com/object"></object> <img src="http://example.com/img">', $output['items'][0]['properties']['content'][0]['html']);
$this->assertEquals('Blah blah thing. http://example.com/img', $output['items'][0]['properties']['content'][0]['value']);
$this->assertEquals('Blah blah thing. http://example.com/img', $output['items'][0]['properties']['content'][0]['value']);
}

public function testParseEWithBR() {
Expand Down Expand Up @@ -156,7 +156,7 @@ public function testHtmlEncodesNonEProperties() {


public function testHtmlEncodesImpliedProperties() {
$input = '<a class="h-card" href="&lt;url&gt;"><img src="&lt;img&gt;" />&lt;name&gt;</a>';
$input = '<a class="h-card" href="&lt;url&gt;"><img src="&lt;img&gt;" alt="" />&lt;name&gt;</a>';
$parser = new Parser($input);
$output = $parser->parse();

Expand Down
76 changes: 76 additions & 0 deletions tests/Mf2/PlainTextTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
<?php

namespace Mf2\Parser\Test;

/**
 * Checks the plain text values produced for an element that is both
 * `e-content` and `p-name` inside an `h-entry`.
 */
class PlainTextTest extends \PHPUnit_Framework_TestCase {
    /**
     * Parses one HTML snippet and compares the first item's `name`, and the
     * `value` and `html` of its first `content`, against the expectations.
     *
     * @param string $input  HTML document to parse
     * @param string $pName  expected first `name` property
     * @param string $eValue expected `value` of the first `content` property
     * @param string $eHtml  expected `html` of the first `content` property
     *
     * @dataProvider aaronpkExpectations
     */
    public function testAaronpkExpectations($input, $pName, $eValue, $eHtml) {
        $mf2 = new \Mf2\Parser($input);
        $parsed = $mf2->parse();
        $properties = $parsed['items'][0]['properties'];

        $this->assertEquals($pName, $properties['name'][0]);

        $content = $properties['content'][0];
        $this->assertEquals($eValue, $content['value']);
        $this->assertEquals($eHtml, $content['html']);
    }

    /**
     * Data sets for testAaronpkExpectations(), keyed 1 to 9.
     *
     * Each set is: input HTML, expected p-name, expected e-content value,
     * expected e-content html.
     *
     * @return array
     */
    public function aaronpkExpectations() {
        $cases = array();

        // A single paragraph.
        $cases[1] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\"><p>Hello World</p></div>\n</div>",
            "Hello World",
            "Hello World",
            "<p>Hello World</p>"
        );

        // A <br> inside a paragraph.
        $cases[2] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\"><p>Hello<br>World</p></div>\n</div>",
            "Hello\nWorld",
            "Hello\nWorld",
            "<p>Hello<br>World</p>"
        );

        // A <br> followed by a source line break.
        $cases[3] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\"><p>Hello<br>\nWorld</p></div>\n</div>",
            "Hello\nWorld",
            "Hello\nWorld",
            "<p>Hello<br>\nWorld</p>"
        );

        // A paragraph surrounded by source whitespace.
        $cases[4] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\">\n <p>Hello World</p>\n </div>\n</div>",
            "Hello World",
            "Hello World",
            "<p>Hello World</p>"
        );

        // A bare source line break between two words.
        $cases[5] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\">Hello\nWorld</div>\n</div>",
            "Hello World",
            "Hello World",
            "Hello\nWorld"
        );

        // Two adjacent paragraphs.
        $cases[6] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\"><p>Hello</p><p>World</p></div>\n</div>",
            "Hello\nWorld",
            "Hello\nWorld",
            "<p>Hello</p><p>World</p>"
        );

        // A <br> followed by a source line break and a space.
        $cases[7] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\">Hello<br>\n World</div>\n</div>",
            "Hello\nWorld",
            "Hello\nWorld",
            "Hello<br>\n World",
        );

        // Leading and trailing <br> elements.
        $cases[8] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\"><br>Hello<br>World<br></div>\n</div>",
            "Hello\nWorld",
            "Hello\nWorld",
            "<br>Hello<br>World<br>"
        );

        // Three paragraphs separated by source whitespace.
        $cases[9] = array(
            "<div class=\"h-entry\">\n <div class=\"e-content p-name\">\n <p>One</p>\n <p>Two</p>\n <p>Three</p>\n </div>\n</div>",
            "One\nTwo\nThree",
            "One\nTwo\nThree",
            "<p>One</p>\n <p>Two</p>\n <p>Three</p>"
        );

        return $cases;
    }
}

0 comments on commit e8da04f

Please sign in to comment.