Merge pull request #4 from c4software/master
A few adjustments
sebclick committed Aug 12, 2012
2 parents 3e1f065 + 13f058d commit 56a2d7e
Showing 2 changed files with 26 additions and 19 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -24,9 +24,11 @@ Skip url (by extension) (skip pdf AND xml url):

 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml

-Drop attribute from url (regexp) :
+Drop url via regexp :

 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+or (remove the index.html in the sitemap)
+>>> python main.py --domain http://blog.lesite.us --drop "index.[a-z]{4}"

 Exclude url by filter a part of it :

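The two --drop examples above read like substring removal: the matched fragment is stripped from the URL before it is written to the sitemap. A minimal sketch of that reading, assuming the option boils down to re.sub (the function name and URLs are illustrative, not the script's own code):

    import re

    def apply_drop(url, drop_patterns):
        # Remove every fragment matched by a --drop regexp from the url.
        for pattern in drop_patterns:
            url = re.sub(pattern, "", url)
        return url

    # "index.html" is stripped, so only the bare directory URL is listed:
    print(apply_drop("http://blog.lesite.us/index.html", ["index.[a-z]{4}"]))
    # -> http://blog.lesite.us/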
41 changes: 23 additions & 18 deletions main.py
@@ -129,9 +129,9 @@ def exclude_url(exclude, link):
 rp.set_url(arg.domain+"robots.txt")
 rp.read()

-responseCode={}
-nbUrl=1
-nbRp=0
+response_code={}
+nb_url=1 # Number of url.
+nb_rp=0 # Number of url blocked by the robots.txt
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
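For context, rp is Python 3's built-in robots.txt parser; this hunk only renames the counters around it. The set_url/read cycle can be exercised on its own (the domain is a placeholder):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url("http://blog.lesite.us/robots.txt")
    rp.read()
    # The script's can_fetch() wrapper presumably delegates to this method.
    print(rp.can_fetch("*", "http://blog.lesite.us/private/page.html"))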
@@ -146,12 +146,12 @@ def exclude_url(exclude, link):
 	response = urlopen(request)
 except Exception as e:
 	if hasattr(e,'code'):
-		if e.code in responseCode:
-			responseCode[e.code]+=1
+		if e.code in response_code:
+			response_code[e.code]+=1
 		else:
-			responseCode[e.code]=1
+			response_code[e.code]=1
 	#else:
-	#	responseCode['erreur']+=1
+	#	response_code['erreur']+=1
 	if arg.debug:
 		logging.debug ("{1} ==> {0}".format(e, crawling))
 	response.close()
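The response_code dict is a hand-rolled tally: check for the key, then increment or initialise it. Not something this commit changes, but the same bookkeeping collapses to one line with collections.Counter:

    from collections import Counter

    response_code = Counter()
    for code in (200, 404, 200, 301):
        response_code[code] += 1  # missing keys default to 0, no membership test
    print(response_code)  # Counter({200: 2, 404: 1, 301: 1})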
@@ -160,10 +160,10 @@ def exclude_url(exclude, link):
 # Read the response
 try:
 	msg = response.read()
-	if response.getcode() in responseCode:
-		responseCode[response.getcode()]+=1
+	if response.getcode() in response_code:
+		response_code[response.getcode()]+=1
 	else:
-		responseCode[response.getcode()]=1
+		response_code[response.getcode()]=1
 	response.close()
 except Exception as e:
 	if arg.debug:
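Together, this hunk and the previous one cover the two outcomes of a fetch: on failure the status comes from the exception's code attribute, on success from response.getcode(). A compressed sketch of that pattern, with a placeholder URL:

    from urllib.request import Request, urlopen
    from urllib.error import HTTPError

    url = "http://blog.lesite.us/"  # placeholder
    try:
        response = urlopen(Request(url))
        msg = response.read()       # read the body, as the script does
        code = response.getcode()   # success path
        response.close()
    except HTTPError as e:
        code = e.code               # error path: why the script tests hasattr(e, 'code')
    print(code)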
@@ -212,18 +212,23 @@ def exclude_url(exclude, link):
 	continue

 # Count one more URL
-nbUrl+=1
+nb_url+=1

-if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+# Check if the navigation is allowed by the robots.txt
+if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
 	if link not in excluded:
 		excluded.add(link)
-	nbRp+=1
+	nb_rp+=1
 	continue
+
+# Check if the current file extension is allowed or not.
 if (target_extension in arg.skipext):
 	if link not in excluded:
 		excluded.add(link)
 	continue
-if (exclude_url(arg.exclude, link)==False):
+
+# Check if the current url doesn't contain an excluded word
+if (not exclude_url(arg.exclude, link)):
 	if link not in excluded:
 		excluded.add(link)
 	continue
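After this hunk, every candidate link passes through three gates before being queued: the robots.txt check, the extension skip list (--skipext), and the exclusion filter (--exclude). The same chain condensed into one hypothetical helper; the names and the substring test for exclude_url are assumptions, not the script's code:

    import os

    def is_excluded(link, rp, skipext, exclude_words):
        # Gate 1: is crawling allowed by robots.txt?
        if not rp.can_fetch("*", link):
            return True
        # Gate 2: is the file extension on the skip list?
        if os.path.splitext(link)[1].lstrip(".") in skipext:
            return True
        # Gate 3: does the url contain an excluded word? (assumed substring test)
        return any(word in link for word in exclude_words)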
@@ -232,13 +237,13 @@ def exclude_url(exclude, link):
 print (footer, file=output_file)

 if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nbUrl))
+	logging.debug ("Number of found URL : {0}".format(nb_url))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
 	if arg.parserobots:
-		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
+		logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))

-	for code in responseCode:
-		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
+	for code in response_code:
+		logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))

 if output_file:
 	output_file.close()
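The main.py hunks all orbit one crawl loop and its counters. A stripped-down, hypothetical skeleton of that loop, just to show where the renamed counters live:

    tocrawl = {"http://blog.lesite.us/"}  # placeholder seed URL
    crawled, excluded = set(), set()
    nb_url, nb_rp = 1, 0
    response_code = {}

    while tocrawl:
        crawling = tocrawl.pop()
        if crawling in crawled:
            continue
        crawled.add(crawling)
        # ... fetch the page, tally response_code, extract links, run each
        # link through the robots.txt / skipext / exclude gates (bumping
        # nb_url and nb_rp), and push the survivors back into tocrawl ...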