From 13f058dffd62a4dee4c22e6c9175e2f1d2398688 Mon Sep 17 00:00:00 2001
From: Brosseau Valentin
Date: Sun, 12 Aug 2012 12:20:13 +0200
Subject: [PATCH] Remove CamelCase. Add comments. Rework the if statements to
 drop the == false comparisons.

---
 README.md |  4 +++-
 main.py   | 41 +++++++++++++++++++++++------------------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index c2581f1..1b0c0b3 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,11 @@ Skip url (by extension) (skip pdf AND xml url):
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml
 
-Drop attribute from url (regexp) :
+Drop url via regexp :
 
 	>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
+	or (remove the index.html from the sitemap)
+	>>> python main.py --domain http://blog.lesite.us --drop "index.[a-z]{4}"
 
 Exclude url by filter a part of it :
 
diff --git a/main.py b/main.py
index f06e43e..17ef6a9 100755
--- a/main.py
+++ b/main.py
@@ -129,9 +129,9 @@ def exclude_url(exclude, link):
 	rp.set_url(arg.domain+"robots.txt")
 	rp.read()
 
-responseCode={}
-nbUrl=1
-nbRp=0
+response_code={}
+nb_url=1 # Number of URLs found.
+nb_rp=0 # Number of URLs blocked by the robots.txt
 print (header, file=output_file)
 while tocrawl:
 	crawling = tocrawl.pop()
@@ -146,12 +146,12 @@ def exclude_url(exclude, link):
 		response = urlopen(request)
 	except Exception as e:
 		if hasattr(e,'code'):
-			if e.code in responseCode:
-				responseCode[e.code]+=1
+			if e.code in response_code:
+				response_code[e.code]+=1
 			else:
-				responseCode[e.code]=1
+				response_code[e.code]=1
 		#else:
-		#	responseCode['erreur']+=1
+		#	response_code['erreur']+=1
 		if arg.debug:
 			logging.debug ("{1} ==> {0}".format(e, crawling))
 		response.close()
@@ -160,10 +160,10 @@ def exclude_url(exclude, link):
 	# Read the response
 	try:
 		msg = response.read()
-		if response.getcode() in responseCode:
-			responseCode[response.getcode()]+=1
+		if response.getcode() in response_code:
+			response_code[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()]=1
+			response_code[response.getcode()]=1
 		response.close()
 	except Exception as e:
 		if arg.debug:
@@ -212,18 +212,23 @@ def exclude_url(exclude, link):
 			continue
 
 		# Count one more URL
-		nbUrl+=1
+		nb_url+=1
 
-		if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
+		# Check if crawling this link is allowed by the robots.txt
+		if (not can_fetch(arg.parserobots, rp, link, arg.debug)):
 			if link not in excluded:
 				excluded.add(link)
-			nbRp+=1
+			nb_rp+=1
 			continue
+
+		# Check if the current file extension is allowed or not.
 		if (target_extension in arg.skipext):
 			if link not in excluded:
 				excluded.add(link)
 			continue
-		if (exclude_url(arg.exclude, link)==False):
+
+		# Check if the current url doesn't contain an excluded word
+		if (not exclude_url(arg.exclude, link)):
 			if link not in excluded:
 				excluded.add(link)
 			continue
@@ -232,13 +237,13 @@ def exclude_url(exclude, link):
 print (footer, file=output_file)
 
 if arg.debug:
-	logging.debug ("Number of found URL : {0}".format(nbUrl))
+	logging.debug ("Number of found URL : {0}".format(nb_url))
 	logging.debug ("Number of link crawled : {0}".format(len(crawled)))
 	if arg.parserobots:
-		logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))
+		logging.debug ("Number of link block by robots.txt : {0}".format(nb_rp))
 
-	for code in responseCode:
-		logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))
+	for code in response_code:
+		logging.debug ("Nb Code HTTP {0} : {1}".format(code, response_code[code]))
 
 if output_file:
 	output_file.close()
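
Not part of the commit itself: a minimal, self-contained sketch of the idioms this patch applies (snake_case counters, a dict tally of HTTP response codes, and truthiness tests written as "not x" rather than "x == False"). The fetch_allowed() helper and the sample data below are hypothetical stand-ins, not code from main.py.

    # Sketch only: hypothetical names, not part of main.py.
    def fetch_allowed(link, blocked_prefixes=("/private",)):
        # Stand-in for can_fetch(): True when no blocked prefix matches.
        return not any(link.startswith(prefix) for prefix in blocked_prefixes)

    response_code = {}   # HTTP status -> number of occurrences
    nb_url = 0           # Number of URLs seen
    nb_rp = 0            # Number of URLs rejected by the robots.txt-style check

    for link, status in [("/index.html", 200), ("/private/a", 403), ("/b", 200)]:
        nb_url += 1

        # Truthiness test: "not fetch_allowed(...)" instead of "fetch_allowed(...) == False".
        if not fetch_allowed(link):
            nb_rp += 1
            continue

        # dict.get() collapses the "if key in dict: ... else: ..." tally pattern.
        response_code[status] = response_code.get(status, 0) + 1

    print(nb_url, nb_rp, response_code)   # 3 1 {200: 2}

The dict.get() form is a slight simplification of the if/else tally that the patch itself keeps as-is.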