Merge pull request #4 from c4software/master
A few adjustments
sebclick committed Aug 12, 2012
2 parents 3e1f065 + 13f058d commit 56a2d7e
Showing 2 changed files with 26 additions and 19 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -24,9 +24,11 @@ Skip url (by extension) (skip pdf AND xml url):

 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml

-Drop attribute from url (regexp) :
+Drop url via regexp :

 >>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+or (remove the index.html in the sitemap)
+>>> python main.py --domain http://blog.lesite.us --drop "index.[a-z]{4}"

 Exclude url by filter a part of it :

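The two --drop examples above read like substring removal: the matched fragment is stripped from the URL before it is written to the sitemap. A minimal sketch of that reading, assuming the option boils down to re.sub (the function name and URLs are illustrative, not the script's own code):

    import re

    def apply_drop(url, drop_patterns):
        # Remove every fragment matched by a --drop regexp from the url.
        for pattern in drop_patterns:
            url = re.sub(pattern, "", url)
        return url

    # "index.html" is stripped, so only the bare directory URL is listed:
    print(apply_drop("http://blog.lesite.us/index.html", ["index.[a-z]{4}"]))
    # -> http://blog.lesite.us/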
41 changes: 23 additions & 18 deletions main.py
@@ -129,9 +129,9 @@ def exclude_url(exclude, link):
 rp.set_url(arg.domain+"robots.txt")
 rp.read()

-responseCode={}
-nbUrl=1
-nbRp=0
+response_code={}
+nb_url=1 # Number of url.
+nb_rp=0 # Number of url blocked by the robots.txt
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
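For context, rp is Python 3's built-in robots.txt parser; this hunk only renames the counters around it. The set_url/read cycle can be exercised on its own (the domain is a placeholder):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url("http://blog.lesite.us/robots.txt")
    rp.read()
    # The script's can_fetch() wrapper presumably delegates to this method.
    print(rp.can_fetch("*", "http://blog.lesite.us/private/page.html"))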
@@ -146,12 +146,12 @@ def exclude_url(exclude, link):
 	response = urlopen(request)
 except Exception as e:
 	if hasattr(e,'code'):
-		if e.code in responseCode:
-			responseCode[e.code]+=1
+		if e.code in response_code:
+			response_code[e.code]+=1
 		else:
-			responseCode[e.code]=1
+			response_code[e.code]=1
 	#else:
-	#	responseCode['erreur']+=1
+	#	response_code['erreur']+=1
 	if arg.debug:
 		logging.debug ("{1} ==> {0}".format(e, crawling))
 	response.close()
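The response_code dict is a hand-rolled tally: check for the key, then increment or initialise it. Not something this commit changes, but the same bookkeeping collapses to one line with collections.Counter:

    from collections import Counter

    response_code = Counter()
    for code in (200, 404, 200, 301):
        response_code[code] += 1  # missing keys default to 0, no membership test
    print(response_code)  # Counter({200: 2, 404: 1, 301: 1})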
@@ -160,10 +160,10 @@ def exclude_url(exclude, link):
 # Read the response
 try:
 	msg = response.read()
-	if response.getcode() in responseCode:
-		responseCode[response.getcode()]+=1
+	if response.getcode() in response_code:
+		response_code[response.getcode()]+=1
 	else:
-		responseCode[response.getcode()]=1
+		response_code[response.getcode()]=1
 	response.close()
 except Exception as e:
 	if arg.debug:
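Together, this hunk and the previous one cover the two outcomes of a fetch: on failure the status comes from the exception's code attribute, on success from response.getcode(). A compressed sketch of that pattern, with a placeholder URL:

    from urllib.request import Request, urlopen
    from urllib.error import HTTPError

    url = "http://blog.lesite.us/"  # placeholder
    try:
        response = urlopen(Request(url))
        msg = response.read()       # read the body, as the script does
        code = response.getcode()   # success path
        response.close()
    except HTTPError as e:
        code = e.code               # error path: why the script tests hasattr(e, 'code')
    print(code)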
@@ -212,18 +212,23 @@ def exclude_url(exclude, link):
 	continue

 # Count one more URL
-nbUrl+=1
+nb_url+=1

-if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+# Check if the navigation is allowed by the robots.txt
+if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
 	if link not in excluded:
 		excluded.add(link)
-	nbRp+=1
+	nb_rp+=1
 	continue
+
+# Check if the current file extension is allowed or not.
 if (target_extension in arg.skipext):
 	if link not in excluded:
 		excluded.add(link)
 	continue
-if (exclude_url(arg.exclude, link)==False):
+
+# Check if the current url doesn't contain an excluded word
+if (not exclude_url(arg.exclude, link)):
 	if link not in excluded:
 		excluded.add(link)
 	continue
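After this hunk, every candidate link passes through three gates before being queued: the robots.txt check, the extension skip list (--skipext), and the exclusion filter (--exclude). The same chain condensed into one hypothetical helper; the names and the substring test for exclude_url are assumptions, not the script's code:

    import os

    def is_excluded(link, rp, skipext, exclude_words):
        # Gate 1: is crawling allowed by robots.txt?
        if not rp.can_fetch("*", link):
            return True
        # Gate 2: is the file extension on the skip list?
        if os.path.splitext(link)[1].lstrip(".") in skipext:
            return True
        # Gate 3: does the url contain an excluded word? (assumed substring test)
        return any(word in link for word in exclude_words)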
@@ -232,13 +237,13 @@ def exclude_url(exclude, link):
 print (footer, file=output_file)

 if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nbUrl))
+	logging.debug ("Number of found URL : {0}".format(nb_url))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
 	if arg.parserobots:
-		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
+		logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))

-	for code in responseCode:
-		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
+	for code in response_code:
+		logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))

 if output_file:
 	output_file.close()
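The main.py hunks all orbit one crawl loop and its counters. A stripped-down, hypothetical skeleton of that loop, just to show where the renamed counters live:

    tocrawl = {"http://blog.lesite.us/"}  # placeholder seed URL
    crawled, excluded = set(), set()
    nb_url, nb_rp = 1, 0
    response_code = {}

    while tocrawl:
        crawling = tocrawl.pop()
        if crawling in crawled:
            continue
        crawled.add(crawling)
        # ... fetch the page, tally response_code, extract links, run each
        # link through the robots.txt / skipext / exclude gates (bumping
        # nb_url and nb_rp), and push the survivors back into tocrawl ...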