Add "drop" parameter and crawler summary at the end #7

Merged
merged 7 commits
Aug 12, 2012
6 changes: 5 additions & 1 deletion README.md
@@ -24,10 +24,14 @@ Skip url (by extension) (skip pdf AND xml url):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml

Drop an attribute from URLs (regexp):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
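
With the pattern above, any five-digit "id" attribute is stripped from matched URLs before they are written to the sitemap. A quick illustration of the same substitution in a Python shell (illustrative URL; the crawler applies the pattern with re.sub):

>>> import re
>>> re.sub("id=[0-9]{5}", "", "http://blog.lesite.us/article?id=12345&page=2")
'http://blog.lesite.us/article?&page=2'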

Exclude URLs that contain a given string:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit"
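
The filter is a substring test. A minimal sketch of what the exclude_url helper presumably does (its real definition lives earlier in main.py and is not shown in this diff):

>>> def exclude_url(exclude, link):
...     return not any(pattern in link for pattern in exclude)
...
>>> exclude_url(["action=edit"], "http://blog.lesite.us/?title=Foo&action=edit")
False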

Read robots.txt to ignore some URLs:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
85 changes: 71 additions & 14 deletions main.py
Expand Up @@ -47,23 +47,24 @@ def exclude_url(exclude, link):
parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude URL if it contains this string")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop this pattern (regexp) from URLs")

group = parser.add_mutually_exclusive_group()
group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
group.add_argument('--domain', action="store", default="", help="Target domain (ex: http://blog.lesite.us)")

arg = parser.parse_args()

# Read the config file if needed
if arg.config is not None:
try:
config_data=open(arg.config,'r')
config = json.load(config_data)
config_data.close()
except:
except Exception as e:
if arg.debug:
logging.debug ("Bad or unavailable config file")
config = {}
print(e)
else:
config = {}

@@ -101,6 +102,7 @@ def exclude_url(exclude, link):

tocrawl = set([arg.domain])
crawled = set([])
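# Links rejected below (robots.txt, skipped extension, exclude filter) are remembered here so each is only examined once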
excluded = set([])
# TODO also search for window.location={.*?}
linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

@@ -128,32 +130,53 @@ def exclude_url(exclude, link):
rp.read()

responseCode={}
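# Counters for the summary printed at the end: nbUrl = URLs found, nbRp = URLs blocked by robots.txt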
nbUrl=1
nbRp=0
print (header, file=output_file)
while tocrawl:
crawling = tocrawl.pop()


url = urlparse(crawling)
crawled.add(crawling)

try:
request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
# TODO : in Python 3, urlopen() comes from urllib.request (urllib2 no longer exists)
response = urlopen(request)
except Exception as e:
if hasattr(e,'code'):
if e.code in responseCode:
responseCode[e.code]+=1
else:
responseCode[e.code]=1
#else:
# responseCode['erreur']+=1
if arg.debug:
logging.debug ("{1} ==> {0}".format(e, crawling))
continue

# Read the response
try:
msg = response.read()
if response.getcode() in responseCode:
responseCode[response.getcode()]+=1
else:
responseCode[response.getcode()] = 0
if response.getcode()==200:
msg = response.read()
else:
msg = ""

responseCode[response.getcode()]=1
response.close()
except Exception as e:
if arg.debug:
logging.debug ("{1} ==> {0}".format(e, crawling))
logging.debug ("{1} ===> {0}".format(e, crawling))
continue



print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
if output_file:
output_file.flush()

# Found links
links = linkregex.findall(msg)
crawled.add(crawling)
for link in links:
link = link.decode("utf-8")
if link.startswith('/'):
@@ -167,18 +190,52 @@ def exclude_url(exclude, link):
if "#" in link:
link = link[:link.index('#')]

# Drop attributes if needed
if arg.drop is not None:
for toDrop in arg.drop:
link=re.sub(toDrop,'',link)

# Parse the url to get domain and file extension
parsed_link = urlparse(link)
domain_link = parsed_link.netloc
target_extension = os.path.splitext(parsed_link.path)[1][1:]

if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
print ("<url><loc>"+link+"</loc></url>", file=output_file)
tocrawl.add(link)
if (link in crawled):
continue
if (link in tocrawl):
continue
if (link in excluded):
continue
if (domain_link != target_domain):
continue
if ("javascript" in link):
continue

# Count one more URL
nbUrl+=1

if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
if link not in excluded:
excluded.add(link)
nbRp+=1
continue
if (target_extension in arg.skipext):
if link not in excluded:
excluded.add(link)
continue
if (exclude_url(arg.exclude, link)==False):
if link not in excluded:
excluded.add(link)
continue

tocrawl.add(link)
print (footer, file=output_file)

if arg.debug:
logging.debug ("Number of found URL : {0}".format(nbUrl))
logging.debug ("Number of link crawled : {0}".format(len(crawled)))
if arg.parserobots:
logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))

for code in responseCode:
logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))