diff --git a/README.md b/README.md
index 5f60a60..c2581f1 100644
--- a/README.md
+++ b/README.md
@@ -24,10 +24,14 @@ Skip url (by extension) (skip pdf AND xml url):
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
 
+Drop attribute from url (regexp) :
+
+	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+
 Exclude url by filter a part of it :
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit"
 
 Read the robots.txt to ignore some url:
 
-	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
\ No newline at end of file
+	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
diff --git a/main.py b/main.py
index 4b3eb97..f06e43e 100755
--- a/main.py
+++ b/main.py
@@ -47,23 +47,24 @@ def exclude_url(exclude, link):
 parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
 parser.add_argument('--output', action="store", default=None, help="Output file")
 parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
+parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
 
 group = parser.add_mutually_exclusive_group()
 group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
 group.add_argument('--domain', action="store", default="", help="Target domain (ex: http://blog.lesite.us)")
 
 arg = parser.parse_args()
-
 # Read the config file if needed
 if arg.config is not None:
 	try:
 		config_data=open(arg.config,'r')
 		config = json.load(config_data)
 		config_data.close()
-	except:
+	except Exception as e:
 		if arg.debug:
 			logging.debug ("Bad or unavailable config file")
 		config = {}
+		print(e)
 else:
 	config = {}
 
@@ -101,6 +102,7 @@ def exclude_url(exclude, link):
 
 tocrawl = set([arg.domain])
 crawled = set([])
+excluded = set([])
 
 # TODO also search for window.location={.*?}
 linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')
@@ -128,32 +130,53 @@ def exclude_url(exclude, link):
 	rp.read()
 
 responseCode={}
+nbUrl=1
+nbRp=0
 
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
+	url = urlparse(crawling)
+	crawled.add(crawling)
+
 	try:
 		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
+		# TODO : in Python 2 this call is urllib2.urlopen(); in Python 3 it is urllib.request.urlopen()
 		response = urlopen(request)
+	except Exception as e:
+		if hasattr(e,'code'):
+			if e.code in responseCode:
+				responseCode[e.code]+=1
+			else:
+				responseCode[e.code]=1
+		#else:
+		#	responseCode['erreur']+=1
+		if arg.debug:
+			logging.debug ("{1} ==> {0}".format(e, crawling))
+		response.close()
+		continue
+
+	# Read the response
+	try:
+		msg = response.read()
 		if response.getcode() in responseCode:
 			responseCode[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()] = 0
-		if response.getcode()==200:
-			msg = response.read()
-		else:
-			msg = ""
-
+			responseCode[response.getcode()]=1
 		response.close()
 	except Exception as e:
 		if arg.debug:
-			logging.debug ("{1} ==> {0}".format(e, crawling))
+			logging.debug ("{1} ===> {0}".format(e, crawling))
 		continue
-
+
+	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
+	if output_file:
+		output_file.flush()
+
+	# Found links
 	links = linkregex.findall(msg)
-	crawled.add(crawling)
 	for link in links:
 		link = link.decode("utf-8")
 		if link.startswith('/'):
@@ -167,18 +190,52 @@ def exclude_url(exclude, link):
 
 		if "#" in link:
 			link = link[:link.index('#')]
 
+		# Drop attributes if needed
+		if arg.drop is not None:
+			for toDrop in arg.drop:
+				link=re.sub(toDrop,'',link)
+
 		# Parse the url to get domain and file extension
 		parsed_link = urlparse(link)
 		domain_link = parsed_link.netloc
 		target_extension = os.path.splitext(parsed_link.path)[1][1:]
 
-		if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
-			print ("<url><loc>"+link+"</loc></url>", file=output_file)
-			tocrawl.add(link)
+		if (link in crawled):
+			continue
+		if (link in tocrawl):
+			continue
+		if (link in excluded):
+			continue
+		if (domain_link != target_domain):
+			continue
+		if ("javascript" in link):
+			continue
+
+		# Count one more URL
+		nbUrl+=1
+
+		if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+			if link not in excluded:
+				excluded.add(link)
+			nbRp+=1
+			continue
+		if (target_extension in arg.skipext):
+			if link not in excluded:
+				excluded.add(link)
+			continue
+		if (exclude_url(arg.exclude, link)==False):
+			if link not in excluded:
+				excluded.add(link)
+			continue
+
+		tocrawl.add(link)
 
 print (footer, file=output_file)
 if arg.debug:
+	logging.debug ("Number of found URL : {0}".format(nbUrl))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
+	if arg.parserobots:
+		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
 	for code in responseCode:
 		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
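
For context, the new `--drop` option applies each supplied regexp to every discovered link with `re.sub` before the other filters run. A minimal standalone sketch of that behaviour, assuming an illustrative URL and the `id=[0-9]{5}` pattern from the README example:

	import re

	# Patterns collected by argparse when --drop "id=[0-9]{5}" is passed
	drop_patterns = ["id=[0-9]{5}"]

	# Illustrative link, not taken from the project
	link = "http://blog.lesite.us/article?id=12345&page=2"

	for toDrop in drop_patterns:
	    link = re.sub(toDrop, '', link)  # same call as in main.py

	print(link)  # -> http://blog.lesite.us/article?&page=2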
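
The reworked error handling tallies failed fetches by status code; it relies on `urlopen()` raising `urllib.error.HTTPError`, which exposes the HTTP status as a `code` attribute (that is what the `hasattr(e,'code')` test checks). A rough sketch of that counting logic, with an illustrative URL:

	from urllib.request import Request, urlopen

	responseCode = {}
	try:
	    # Illustrative URL; any 4xx/5xx response makes urlopen() raise HTTPError
	    request = Request("http://blog.lesite.us/missing-page", headers={"User-Agent": "Sitemap crawler"})
	    response = urlopen(request)
	except Exception as e:
	    if hasattr(e, 'code'):  # HTTPError carries the status code; URLError does not
	        responseCode[e.code] = responseCode.get(e.code, 0) + 1

	print(responseCode)  # e.g. {404: 1} if the page does not exist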