Merge pull request #7 from c4software/master
Update from origin c4software
sebclick authored Apr 16, 2017
2 parents ab49738 + dda18bc commit aab7e47
Showing 8 changed files with 789 additions and 34 deletions.
35 changes: 35 additions & 0 deletions .gitignore
@@ -0,0 +1,35 @@
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
675 changes: 675 additions & 0 deletions LICENSE.txt

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion README.md
@@ -16,10 +16,14 @@ Read a config file to set parameters:

>>> python main.py --config config.json

-Enable debug :
+Enable debug:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --debug

Enable verbose output:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --verbose

Enable report to print a summary of the crawl:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --report
Binary file removed __pycache__/config.cpython-32.pyc
Binary file removed __pycache__/crawler.cpython-32.pyc
6 changes: 3 additions & 3 deletions config.json
@@ -4,10 +4,10 @@
"pdf",
"xml"
],
-"parserobots":true,
+"parserobots":false,
"debug":true,
-"output":false,
+"output":"sitemap.xml",
"exclude": [
"action=edit"
]
-}
+}
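With this change the sample config stops parsing robots.txt by default and writes the sitemap to sitemap.xml instead of disabling file output. As a rough sketch only (the real wiring lives in main.py via --config and is not shown in this diff; the domain below is just the README example), these keys map onto the Crawler constructor updated in crawler.py:

import json

import crawler

with open("config.json") as f:
    conf = json.load(f)

c = crawler.Crawler(
    domain="http://blog.lesite.us",              # assumed target domain
    output=conf.get("output"),                   # "sitemap.xml" after this change
    parserobots=conf.get("parserobots", False),  # now False by default
    exclude=conf.get("exclude", []),             # e.g. ["action=edit"]
    debug=conf.get("debug", False),
    verbose=False,
    report=False,
)
c.run()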
96 changes: 68 additions & 28 deletions crawler.py
@@ -1,15 +1,17 @@
import config
import logging
from urllib.parse import urljoin

import re
from urllib.parse import urlparse
from urllib.request import urlopen, Request
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from datetime import datetime

import os

class Crawler():

# Variables
parserobots = False
output = None
@@ -21,26 +23,30 @@ class Crawler():
exclude = []
skipext = []
drop = []

debug = False

tocrawl = set([])
crawled = set([])
excluded = set([])

marked = {}

# TODO also search for window.location={.*?}
-linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
+linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')

rp = None
response_code={}
nb_url=1 # Number of url.
nb_rp=0 # Number of url blocked by the robots.txt
nb_exclude=0 # Number of url excluded by extension or word

output_file = None

target_domain = ""

-def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
+def __init__(self, parserobots=False, output=None, report=False ,domain="",
+		exclude=[], skipext=[], drop=[], debug=False, verbose=False):
self.parserobots = parserobots
self.output = output
self.report = report
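The tightened linkregex above matters more than it looks: the old pattern only matched anchors whose very first attribute is href, so tags such as <a class="nav" href="..."> were skipped entirely. A minimal comparison (the HTML below is made-up sample data):

import re

old = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
new = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')

html = b'<a class="nav" href="/about.html">About</a> <a href="/index.html">Home</a>'

print(old.findall(html))  # [b'/index.html'], the first link is missed
print(new.findall(html))  # [b'/about.html', b'/index.html']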
@@ -49,31 +55,44 @@ def __init__(self, parserobots=False, output=None, report=False ,domain="", excl
self.skipext = skipext
self.drop = drop
self.debug = debug
self.verbose = verbose

if self.debug:
-logging.basicConfig(level=logging.DEBUG)
+log_level = logging.DEBUG
elif self.verbose:
log_level = logging.INFO
else:
log_level = logging.ERROR

logging.basicConfig(level=log_level)

self.tocrawl = set([domain])

try:
self.target_domain = urlparse(domain)[1]
except:
logging.error("Invalide domain")
raise ("Invalid domain")


if self.output:
try:
self.output_file = open(self.output, 'w')
except:
-logging.debug ("Output file not available.")
+logging.error ("Output file not available.")
exit(255)

def run(self):
-print (config.xml_header, file=self.output_file)
+print(config.xml_header, file=self.output_file)

if self.parserobots:
self.check_robots()

logging.info("Start the crawling process")

while len(self.tocrawl) != 0:
self.__crawling()

-logging.debug("Start the crawling process")
-self.__crawling()
-logging.debug("Crawling as reach the end of all found link")
+logging.info("Crawling has reached end of all found links")

print (config.xml_footer, file=self.output_file)
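run() now drains the tocrawl set in a loop instead of relying on recursive calls, and the constructor picks a log level: --debug wins over --verbose, and with neither flag only errors are logged. A standalone sketch of that branch (pick_level is an illustrative helper, not part of the commit):

import logging

def pick_level(debug, verbose):
    if debug:
        return logging.DEBUG
    elif verbose:
        return logging.INFO
    return logging.ERROR

print(pick_level(debug=True, verbose=False) == logging.DEBUG)   # True
print(pick_level(debug=False, verbose=True) == logging.INFO)    # True
print(pick_level(debug=False, verbose=False) == logging.ERROR)  # True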

@@ -83,18 +102,26 @@ def __crawling(self):

url = urlparse(crawling)
self.crawled.add(crawling)

logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
-request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})

try:
+request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
response = urlopen(request)
except Exception as e:
if hasattr(e,'code'):
if e.code in self.response_code:
self.response_code[e.code]+=1
else:
self.response_code[e.code]=1

# Handle marked URLs for reporting
if self.report:
if e.code in self.marked:
self.marked[e.code].append(crawling)
else:
self.marked[e.code] = [crawling]

logging.debug ("{1} ==> {0}".format(e, crawling))
response.close()
return self.__continue_crawling()

# Read the response
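The enlarged except branch above does two pieces of bookkeeping: response_code counts every HTTP status seen, and, when --report is enabled, marked keeps the failing URLs grouped by status so make_report() can list them later. An isolated sketch of that logic (record_error is an illustrative helper and the URLs are sample data):

response_code = {}
marked = {}

def record_error(code, url, report=True):
    if code in response_code:
        response_code[code] += 1
    else:
        response_code[code] = 1
    if report:
        if code in marked:
            marked[code].append(url)
        else:
            marked[code] = [url]

record_error(404, "http://blog.lesite.us/old-post")
record_error(404, "http://blog.lesite.us/missing.png")
print(response_code)  # {404: 2}
print(marked)         # {404: ['http://blog.lesite.us/old-post', 'http://blog.lesite.us/missing.png']}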
@@ -104,28 +131,38 @@ def __crawling(self):
self.response_code[response.getcode()]+=1
else:
self.response_code[response.getcode()]=1

response.close()

# Get the last modify date
if 'last-modified' in response.headers:
date = response.headers['Last-Modified']
else:
date = response.headers['Date']

date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

except Exception as e:
logging.debug ("{1} ===> {0}".format(e, crawling))
-return self.__continue_crawling()
+return None


-print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file=self.output_file)
if self.output_file:
self.output_file.flush()

# Found links
links = self.linkregex.findall(msg)
for link in links:
link = link.decode("utf-8")
-#logging.debug("Found : {0}".format(link))
+logging.debug("Found : {0}".format(link))
if link.startswith('/'):
link = 'http://' + url[1] + link
elif link.startswith('#'):
link = 'http://' + url[1] + url[2] + link
elif not link.startswith('http'):
link = 'http://' + url[1] + '/' + link

# Remove the anchor part if needed
if "#" in link:
link = link[:link.index('#')]
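Each sitemap entry now carries a <lastmod> element derived from the response's Last-Modified header (falling back to Date). A worked example using the two format strings from the diff (the header value is sample data):

from datetime import datetime

header = "Sun, 16 Apr 2017 08:30:00 GMT"
date = datetime.strptime(header, '%a, %d %b %Y %H:%M:%S %Z')
print(date.strftime('%Y-%m-%dT%H:%M:%S+00:00'))  # 2017-04-16T08:30:00+00:00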
@@ -149,7 +186,7 @@ def __crawling(self):
continue
if ("javascript" in link):
continue

# Count one more URL
self.nb_url+=1

@@ -173,7 +210,7 @@ def __crawling(self):

self.tocrawl.add(link)

-return self.__continue_crawling()
+return None

def __continue_crawling(self):
if self.tocrawl:
@@ -183,12 +220,10 @@ def exclude_link(self,link):
if link not in self.excluded:
self.excluded.add(link)

-def checkRobots(self):
-	if self.domain[len(self.domain)-1] != "/":
-		self.domain += "/"
-	request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
+def check_robots(self):
+	robots_url = urljoin(self.domain, "robots.txt")
self.rp = RobotFileParser()
-self.rp.set_url(self.domain+"robots.txt")
+self.rp.set_url(robots_url)
self.rp.read()

def can_fetch(self, link):
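check_robots() replaces the old checkRobots(): urljoin builds the robots.txt URL without manual trailing-slash handling, and the resulting RobotFileParser is what the crawler's can_fetch() consults. A self-contained sketch (the rules are sample data, not the site's real robots.txt):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

print(urljoin("http://blog.lesite.us", "robots.txt"))   # http://blog.lesite.us/robots.txt
print(urljoin("http://blog.lesite.us/", "robots.txt"))  # http://blog.lesite.us/robots.txt

rp = RobotFileParser()
rp.parse(["User-agent: *", "Disallow: /private/"])
print(rp.can_fetch("*", "http://blog.lesite.us/private/page.html"))  # False
print(rp.can_fetch("*", "http://blog.lesite.us/index.html"))         # True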
@@ -224,4 +259,9 @@ def make_report(self):
print ("Number of link exclude : {0}".format(self.nb_exclude))

for code in self.response_code:
-print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
+print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

for code in self.marked:
print ("Link with status {0}:".format(code))
for uri in self.marked[code]:
print ("\t- {0}".format(uri))
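For illustration, this is the kind of summary the two new report loops print for sample data (the counts and URLs below are made up):

response_code = {200: 12, 404: 2}
marked = {404: ["http://blog.lesite.us/old-post", "http://blog.lesite.us/missing.png"]}

for code in response_code:
    print("Nb Code HTTP {0} : {1}".format(code, response_code[code]))

for code in marked:
    print("Link with status {0}:".format(code))
    for uri in marked[code]:
        print("\t- {0}".format(uri))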
5 changes: 3 additions & 2 deletions main.py
@@ -6,11 +6,12 @@
import crawler

# Parameter handling
-parser = argparse.ArgumentParser(version="0.1",description='Crawler pour la creation de site map')
+parser = argparse.ArgumentParser(description='Crawler pour la creation de site map')

parser.add_argument('--skipext', action="append", default=[], required=False, help="File extension to skip")
parser.add_argument('--parserobots', action="store_true", default=False, required=False, help="Ignore file defined in robots.txt")
parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
parser.add_argument('-v', '--verbose', action="store_true", help="Enable verbose output")
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
@@ -51,4 +52,4 @@
crawl.run()

if arg.report:
-crawl.make_report()
+crawl.make_report()
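Dropping version="0.1" is necessary because Python 3's argparse does not accept a version keyword on ArgumentParser; the supported replacement is a dedicated --version argument. A hedged sketch of the new option handling (the --version argument is illustrative and not part of this commit):

import argparse

parser = argparse.ArgumentParser(description='Crawler for sitemap creation')
parser.add_argument('--version', action='version', version='0.1')
parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose output")
parser.add_argument('--debug', action='store_true', help="Enable debug mode")

args = parser.parse_args(['--verbose'])
print(args.verbose, args.debug)  # True False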
