Skip to content

Commit

Permalink
Merge pull request #109 from eHtmlu/issue108_base_href
Browse files Browse the repository at this point in the history
Enhancement: Consider baseURI of referringElement (<base href="...">)
  • Loading branch information
dantleech authored Aug 16, 2020
2 parents 6feb284 + 1f360a2 commit acac7cc
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 1 deletion.
7 changes: 7 additions & 0 deletions lib/Model/ReferringElement.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class ReferringElement
{
private $xpath = '';
private $title = '';
private $baseUri = '';

public static function none(): ReferringElement
{
Expand All @@ -19,10 +20,16 @@ public static function fromDOMNode(DOMNode $element): ReferringElement
$new = new self();
$new->xpath = $element->getNodePath();
$new->title = trim($element->nodeValue);
$new->baseUri = $element->baseURI;

return $new;
}

/**
 * Base URI recorded for the referring element.
 *
 * fromDOMNode() copies this value from the DOM node's baseURI property; an
 * instance on which it was never assigned keeps the property's default,
 * the empty string.
 *
 * @return string|null NOTE(review): null presumably occurs when the DOM node
 *                     reports no base URI — confirm against DOMNode::$baseURI.
 */
public function baseUri(): ?string
{
return $this->baseUri;
}

public function title(): string
{
return $this->title;
Expand Down
13 changes: 12 additions & 1 deletion lib/Model/Url.php
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ public function resolveUrl(string $link, ReferringElement $referringElement = nu
throw new InvalidUrl($e->getMessage(), 0, $e);
}

$resolvedLink = FollowRedirects::resolve($this->uri, $parsedLink);
$baseUri = $this->resolveBaseUri($referringElement);

$resolvedLink = FollowRedirects::resolve($baseUri->uri, $parsedLink);

if ('' !== $resolvedLink->getFragment()) {
// unconditionally remove fragments
Expand Down Expand Up @@ -158,4 +160,13 @@ public function referringElement(): ?ReferringElement
{
return $this->referringElement;
}

/**
 * Pick the Url that relative links should be resolved against.
 *
 * When a referring element is given and it reports a non-empty base URI
 * (for example one declared through a <base href="..."> tag), a Url built
 * from that base URI is returned. In every other case this Url is the base.
 *
 * NOTE(review): the base URI is handed to fromUrl() unchanged — presumably it
 * has to be absolute; confirm what happens for a relative <base href>.
 *
 * @param ReferringElement|null $referringElement Element the link was found in, if known.
 *
 * @return Url Url whose URI serves as the base for link resolution.
 */
private function resolveBaseUri(?ReferringElement $referringElement = null): Url
{
    if (null === $referringElement) {
        return $this;
    }

    $baseUri = $referringElement->baseUri();

    // Deliberately a truthiness test: a null or empty base URI falls back to this Url.
    if (!$baseUri) {
        return $this;
    }

    return self::fromUrl($baseUri);
}
}
15 changes: 15 additions & 0 deletions tests/EndToEnd/Command/CrawlCommandTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@ public function testCrawlsUrl()
$this->assertProcessSuccess($process);
}

/**
 * Runs the crawl command against the "base-url" example and checks that the
 * JSON report contains http://example.com/index.html, i.e. that the relative
 * link was resolved against the page's <base href>.
 */
public function testBaseUrl()
{
    $crawl = $this->execute(
        [
            self::EXAMPLE_URL,
            '--output='.$this->workspace()->path('/out.json'),
        ],
        'base-url'
    );
    $this->assertProcessSuccess($crawl);

    $results = $this->parseResults($this->workspace()->path('/out.json'));
    $match = $this->findUrl($results, 'http://example.com/index.html');

    $this->assertNotNull($match);
}

public function testPublishesReport()
{
$process = $this->execute([
Expand Down
9 changes: 9 additions & 0 deletions tests/Example/base-url/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<html>
<head>
<base href="http://example.com/">
</head>
<body>
<a href="index.html">index</a>
</body>
</html>

0 comments on commit acac7cc

Please sign in to comment.