feat: custom link parser #458

Merged
11 commits merged on Jan 2, 2024
20 changes: 20 additions & 0 deletions README.md
@@ -153,6 +153,26 @@ This package comes with three `CrawlProfiles` out of the box:
- `CrawlInternalUrls`: this profile will only crawl the internal URLs on the pages of a host.
- `CrawlSubdomains`: this profile will only crawl the internal URLs and those of its subdomains on the pages of a host.

### Custom link extraction

You can customize how links are extracted from a page by passing a custom `UrlParser` to the crawler.

```php
Crawler::create()
->setUrlParserClass(<class that implements \Spatie\Crawler\UrlParsers\UrlParser>::class)
...
```

By default, the `LinkUrlParser` is used. This parser will extract all links from the `href` attribute of `a` tags.

There is also a built-in `SitemapUrlParser` that will extract and crawl all links from a sitemap, including sitemap index files.

```php
Crawler::create()
->setUrlParserClass(SitemapUrlParser::class)
...
```
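
If neither built-in parser fits, a custom parser only has to implement the `UrlParser` interface: accept the `Crawler` in its constructor and queue whatever URLs it finds in `addFromHtml`. Below is a minimal sketch; the class name, the `data-crawl` attribute, and the omission of the depth-tree and robots checks (which the built-in parsers perform before queueing) are illustrative assumptions, not part of the package.

```php
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Spatie\Crawler\UrlParsers\UrlParser;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

// Hypothetical parser that only follows anchors explicitly marked for crawling.
class DataAttributeUrlParser implements UrlParser
{
    public function __construct(protected Crawler $crawler)
    {
    }

    public function addFromHtml(string $html, UriInterface $foundOnUrl): void
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        $domCrawler->filterXPath('//a[@data-crawl]')->each(function (DomCrawler $node) use ($foundOnUrl) {
            // Resolve the href against the page it was found on and keep the link text.
            $url = new Url($node->link()->getUri(), $node->text());

            $this->crawler->addToCrawlQueue(
                CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText())
            );
        });
    }
}
```

Register it the same way as the built-in parsers: `->setUrlParserClass(DataAttributeUrlParser::class)`.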

### Ignoring robots.txt and robots meta

By default, the crawler will respect robots data. It is possible to disable these checks like so:
8 changes: 4 additions & 4 deletions src/CrawlObservers/CrawlObserver.php
@@ -21,8 +21,8 @@ public function willCrawl(UriInterface $url, ?string $linkText): void
public function crawled(
UriInterface $url,
ResponseInterface $response,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
}

@@ -32,8 +32,8 @@ public function crawled(
public function crawlFailed(
UriInterface $url,
RequestException $requestException,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
}

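The signature changes in this file (and in `CrawlUrl`, `Crawler`, `ResponseWithCachedBody`, and the tests below) replace implicitly nullable parameters with explicitly nullable ones. The behaviour is identical; the explicit form is the spelling PHP now recommends, since deriving nullability from a `null` default is deprecated as of PHP 8.4. A minimal illustration with hypothetical function names:

```php
use Psr\Http\Message\UriInterface;

// Implicitly nullable: the null default used to make the type accept null.
// This spelling is deprecated as of PHP 8.4.
function crawledImplicit(UriInterface $foundOnUrl = null): void {}

// Explicitly nullable: identical behaviour, with the nullability spelled out in the type.
function crawledExplicit(?UriInterface $foundOnUrl = null): void {}
```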
4 changes: 2 additions & 2 deletions src/CrawlUrl.php
@@ -16,9 +16,9 @@ class CrawlUrl

public static function create(
UriInterface $url,
UriInterface $foundOnUrl = null,
?UriInterface $foundOnUrl = null,
$id = null,
string $linkText = null,
?string $linkText = null,
): static {
$static = new static($url, $foundOnUrl, linkText: $linkText);

19 changes: 18 additions & 1 deletion src/Crawler.php
@@ -19,6 +19,7 @@
use Spatie\Crawler\Exceptions\InvalidCrawlRequestHandler;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Crawler\UrlParsers\LinkUrlParser;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;

@@ -62,6 +63,8 @@ class Crawler

protected string $crawlRequestFailedClass;

protected string $urlParserClass;

protected int $delayBetweenRequests = 0;

protected array $allowedMimeTypes = [];
@@ -102,6 +105,8 @@ public function __construct(
$this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;

$this->crawlRequestFailedClass = CrawlRequestFailed::class;

$this->urlParserClass = LinkUrlParser::class;
}

public function getDefaultScheme(): string
@@ -345,6 +350,18 @@ public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): self
return $this;
}

public function setUrlParserClass(string $urlParserClass): self
{
$this->urlParserClass = $urlParserClass;

return $this;
}

public function getUrlParserClass(): string
{
return $this->urlParserClass;
}

public function setBrowsershot(Browsershot $browsershot)
{
$this->browsershot = $browsershot;
@@ -430,7 +447,7 @@ public function startCrawling(UriInterface|string $baseUrl)
}
}

public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node
public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null): ?Node
{
if (is_null($this->maximumDepth)) {
return new Node((string) $url);
9 changes: 5 additions & 4 deletions src/Handlers/CrawlRequestFulfilled.php
@@ -13,16 +13,17 @@
use Spatie\Crawler\CrawlerRobots;
use Spatie\Crawler\CrawlProfiles\CrawlSubdomains;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\LinkAdder;
use Spatie\Crawler\ResponseWithCachedBody;
use Spatie\Crawler\UrlParsers\UrlParser;

class CrawlRequestFulfilled
{
protected LinkAdder $linkAdder;
protected UrlParser $urlParser;

public function __construct(protected Crawler $crawler)
{
$this->linkAdder = new LinkAdder($this->crawler);
$urlParserClass = $this->crawler->getUrlParserClass();
$this->urlParser = new $urlParserClass($this->crawler);
}

public function __invoke(ResponseInterface $response, $index)
@@ -62,7 +63,7 @@ public function __invoke(ResponseInterface $response, $index)

$baseUrl = $this->getBaseUrl($response, $crawlUrl);

$this->linkAdder->addFromHtml($body, $baseUrl);
$this->urlParser->addFromHtml($body, $baseUrl);

usleep($this->crawler->getDelayBetweenRequests());
}
2 changes: 1 addition & 1 deletion src/ResponseWithCachedBody.php
@@ -20,7 +20,7 @@ public static function fromGuzzlePsr7Response(ResponseInterface $response): static
);
}

public function setCachedBody(string $body = null): void
public function setCachedBody(?string $body = null): void
{
$this->cachedBody = $body;
}
9 changes: 6 additions & 3 deletions src/LinkAdder.php → src/UrlParsers/LinkUrlParser.php
@@ -1,15 +1,18 @@
<?php

namespace Spatie\Crawler;
namespace Spatie\Crawler\UrlParsers;

use Illuminate\Support\Collection;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Symfony\Component\DomCrawler\Link;
use Tree\Node\Node;

class LinkAdder
class LinkUrlParser implements UrlParser
{
protected Crawler $crawler;

@@ -66,7 +69,7 @@ protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection

return new Url($link->getUri(), $linkText);
} catch (InvalidArgumentException $exception) {
return;
return null;
}
})
->filter();
89 changes: 89 additions & 0 deletions src/UrlParsers/SitemapUrlParser.php
@@ -0,0 +1,89 @@
<?php

namespace Spatie\Crawler\UrlParsers;

use Illuminate\Support\Collection;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Tree\Node\Node;

class SitemapUrlParser implements UrlParser
{
protected Crawler $crawler;

public function __construct(Crawler $crawler)
{
$this->crawler = $crawler;
}

public function addFromHtml(string $html, UriInterface $foundOnUrl): void
{
$allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

collect($allLinks)
->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
->map(fn (Url $url) => $this->normalizeUrl($url))
->filter(function (Url $url) use ($foundOnUrl) {
if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl)) {
return false;
}

return $this->shouldCrawl($node);
})
->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
->each(function (Url $url) use ($foundOnUrl) {
$crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());

$this->crawler->addToCrawlQueue($crawlUrl);
});
}

protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
{
$domCrawler = new DomCrawler($html, $foundOnUrl);

return collect($domCrawler->filterXPath('//loc')
->each(function (DomCrawler $node) {
try {
$linkText = $node->text();

if ($linkText) {
$linkText = substr($linkText, 0, 4000);
}

return new Url($linkText, $linkText);
} catch (InvalidArgumentException $exception) {
return null;
}
}));
}

protected function hasCrawlableScheme(UriInterface $uri): bool
{
return in_array($uri->getScheme(), ['http', 'https']);
}

protected function normalizeUrl(UriInterface $url): UriInterface
{
return $url->withFragment('');
}

protected function shouldCrawl(Node $node): bool
{
if ($this->crawler->mustRespectRobots() && ! $this->crawler->getRobotsTxt()->allows($node->getValue(), $this->crawler->getUserAgent())) {
return false;
}

$maximumDepth = $this->crawler->getMaximumDepth();

if (is_null($maximumDepth)) {
return true;
}

return $node->getDepth() <= $maximumDepth;
}
}
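
For context, the `//loc` XPath above matches the `<loc>` elements of both sitemap index files and regular sitemaps, which is why an index file is crawled the same way as an ordinary sitemap: each child sitemap URL is queued, fetched, and parsed again with the same parser. A sketch of the two document shapes involved (the URLs are placeholders):

```php
// Hypothetical sitemap index: every <loc> points at a child sitemap,
// which the parser queues like any other URL.
$sitemapIndex = <<<'XML'
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <sitemap><loc>https://example.com/sitemap1.xml</loc></sitemap>
    <sitemap><loc>https://example.com/sitemap2.xml</loc></sitemap>
</sitemapindex>
XML;

// Hypothetical child sitemap: its <loc> elements are the page URLs that
// end up in the crawl queue.
$sitemap = <<<'XML'
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url><loc>https://example.com/</loc></url>
    <url><loc>https://example.com/link1</loc></url>
</urlset>
XML;
```
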
13 changes: 13 additions & 0 deletions src/UrlParsers/UrlParser.php
@@ -0,0 +1,13 @@
<?php

namespace Spatie\Crawler\UrlParsers;

use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;

interface UrlParser
{
public function __construct(Crawler $crawler);

public function addFromHtml(string $html, UriInterface $foundOnUrl): void;
}
8 changes: 4 additions & 4 deletions tests/CrawlObserverCollectionTest.php
@@ -20,17 +20,17 @@
public function crawled(
UriInterface $url,
ResponseInterface $response,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->crawled = true;
}

public function crawlFailed(
UriInterface $url,
RequestException $requestException,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->failed = true;
}
46 changes: 46 additions & 0 deletions tests/SitemapUrlParserTest.php
@@ -0,0 +1,46 @@
<?php

use Spatie\Crawler\Test\TestClasses\Log;
use Spatie\Crawler\UrlParsers\SitemapUrlParser;

beforeEach(function () {
skipIfTestServerIsNotRunning();

Log::reset();
});

it('should extract child sitemaps from sitemap index', function () {
createCrawler()
->setUrlParserClass(SitemapUrlParser::class)
->startCrawling('http://localhost:8080/sitemap_index.xml');

expect(['url' => 'http://localhost:8080/sitemap1.xml', 'foundOn' => 'http://localhost:8080/sitemap_index.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/sitemap2.xml', 'foundOn' => 'http://localhost:8080/sitemap_index.xml'])
->toBeCrawledOnce();
});

it('should extract urls from sitemaps through sitemap index', function () {
createCrawler()
->setUrlParserClass(SitemapUrlParser::class)
->startCrawling('http://localhost:8080/sitemap_index.xml');

expect(['url' => 'http://localhost:8080/', 'foundOn' => 'http://localhost:8080/sitemap1.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link1', 'foundOn' => 'http://localhost:8080/sitemap1.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link1-next', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link1-prev', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link2', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();

expect(['url' => 'http://localhost:8080/link3', 'foundOn' => 'http://localhost:8080/sitemap2.xml'])
->toBeCrawledOnce();
});
10 changes: 5 additions & 5 deletions tests/TestClasses/CrawlLogger.php
@@ -34,22 +34,22 @@ public function willCrawl(UriInterface $url, ?string $linkText): void
public function crawled(
UriInterface $url,
ResponseInterface $response,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->logCrawl($url, $foundOnUrl, $linkText);
}

public function crawlFailed(
UriInterface $url,
RequestException $requestException,
UriInterface $foundOnUrl = null,
string $linkText = null,
?UriInterface $foundOnUrl = null,
?string $linkText = null,
): void {
$this->logCrawl($url, $foundOnUrl, $linkText);
}

protected function logCrawl(UriInterface $url, ?UriInterface $foundOnUrl, string $linkText = null)
protected function logCrawl(UriInterface $url, ?UriInterface $foundOnUrl, ?string $linkText = null)
{
$logText = "{$this->observerId}hasBeenCrawled: {$url}";
