From 13f058dffd62a4dee4c22e6c9175e2f1d2398688 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Sun, 12 Aug 2012 12:20:13 +0200
Subject: [PATCH] Remove CamelCase. Add comments. Rework the if statements to
 drop the == false comparisons.

---
 README.md |  4 +++-
 main.py   | 41 +++++++++++++++++++++++------------------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index c2581f1..1b0c0b3 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,11 @@ Skip url (by extension) (skip pdf AND xml url):
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
 
-Drop attribute from url (regexp) :
+Drop url via regexp :
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+	or (remove the index.html from the sitemap)
+	>>> python main.py --domain http://blog.lesite.us --drop "index.[a-z]{4}"
 
 Exclude url by filter a part of it :
 
diff --git a/main.py b/main.py
index f06e43e..17ef6a9 100755
--- a/main.py
+++ b/main.py
@@ -129,9 +129,9 @@ def exclude_url(exclude, link):
 	rp.set_url(arg.domain+"robots.txt")
 	rp.read()
 
-responseCode={}
-nbUrl=1
-nbRp=0
+response_code={}
+nb_url=1 # Number of URLs found.
+nb_rp=0 # Number of URLs blocked by the robots.txt
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
@@ -146,12 +146,12 @@ def exclude_url(exclude, link):
 		response = urlopen(request)
 	except Exception as e:
 		if hasattr(e,'code'):
-			if e.code in responseCode:
-				responseCode[e.code]+=1
+			if e.code in response_code:
+				response_code[e.code]+=1
 			else:
-				responseCode[e.code]=1
+				response_code[e.code]=1
 		#else:
-		#	responseCode['erreur']+=1
+		#	response_code['erreur']+=1
 		if arg.debug:
 			logging.debug ("{1} ==> {0}".format(e, crawling))
 		response.close()
@@ -160,10 +160,10 @@ def exclude_url(exclude, link):
 	# Read the response
 	try:
 		msg = response.read()
-		if response.getcode() in responseCode:
-			responseCode[response.getcode()]+=1
+		if response.getcode() in response_code:
+			response_code[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()]=1
+			response_code[response.getcode()]=1
 		response.close()
 	except Exception as e:
 		if arg.debug:
@@ -212,18 +212,23 @@ def exclude_url(exclude, link):
 			continue
 
 		# Count one more URL
-		nbUrl+=1
+		nb_url+=1
 
-		if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+		# Check if crawling this link is allowed by the robots.txt
+		if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
 			if link not in excluded:
 				excluded.add(link)
-			nbRp+=1
+			nb_rp+=1
 			continue
+
+		# Check if the current file extension is allowed or not.
 		if (target_extension in arg.skipext):
 			if link not in excluded:
 				excluded.add(link)
 			continue
-		if (exclude_url(arg.exclude, link)==False):
+
+		# Check if the current url doesn't contain an excluded word
+		if (not exclude_url(arg.exclude, link)):
 			if link not in excluded:
 				excluded.add(link)
 			continue
@@ -232,13 +237,13 @@ def exclude_url(exclude, link):
 print (footer, file=output_file)
 
 if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nbUrl))
+	logging.debug ("Number of found URL : {0}".format(nb_url))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
 	if arg.parserobots:
-		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
+		logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))
 
-	for code in responseCode:
-		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
+	for code in response_code:
+		logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
 
 if output_file:
 	output_file.close()
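
Not part of the commit itself: a minimal, self-contained sketch of the idioms this patch applies (snake_case counters, a dict tally of HTTP response codes, and truthiness tests written as "not x" rather than "x == False"). The fetch_allowed() helper and the sample data below are hypothetical stand-ins, not code from main.py.

    # Sketch only: hypothetical names, not part of main.py.
    def fetch_allowed(link, blocked_prefixes=("/private",)):
        # Stand-in for can_fetch(): True when no blocked prefix matches.
        return not any(link.startswith(prefix) for prefix in blocked_prefixes)

    response_code = {}   # HTTP status -> number of occurrences
    nb_url = 0           # Number of URLs seen
    nb_rp = 0            # Number of URLs rejected by the robots.txt-style check

    for link, status in [("/index.html", 200), ("/private/a", 403), ("/b", 200)]:
        nb_url += 1

        # Truthiness test: "not fetch_allowed(...)" instead of "fetch_allowed(...) == False".
        if not fetch_allowed(link):
            nb_rp += 1
            continue

        # dict.get() collapses the "if key in dict: ... else: ..." tally pattern.
        response_code[status] = response_code.get(status, 0) + 1

    print(nb_url, nb_rp, response_code)   # 3 1 {200: 2}

The dict.get() form is a slight simplification of the if/else tally that the patch itself keeps as-is.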