diff --git a/README.md b/README.md index 55e82bd..47a2f04 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,12 @@ More informations here https://support.google.com/webmasters/answer/178636?hl=en $ python main.py --domain https://blog.lesite.us --output sitemap.xml --images ``` +#### Allow fetching content from iframes + + ``` + $ python main.py --domain https://blog.lesite.us --output sitemap.xml --fetch-iframes + ``` + #### Enable report for print summary of the crawl: ``` diff --git a/crawler.py b/crawler.py index 5b27bf3..6f180c5 100644 --- a/crawler.py +++ b/crawler.py @@ -51,6 +51,8 @@ class Crawler: # TODO also search for window.location={.*?} linkregex = re.compile(b']*href=[\'|"](.*?)[\'"][^>]*?>') imageregex = re.compile (b']*src=[\'|"](.*?)[\'"].*?>') + iframeregex = re.compile (b'