Merge pull request #7 from c4software/master
Update from origin c4software
sebclick authored Apr 16, 2017
2 parents ab49738 + dda18bc commit aab7e47
Showing 8 changed files with 789 additions and 34 deletions.
35 changes: 35 additions & 0 deletions .gitignore
@@ -0,0 +1,35 @@
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
675 changes: 675 additions & 0 deletions LICENSE.txt

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion README.md
@@ -16,10 +16,14 @@ Read a config file to set parameters:

>>> python main.py --config config.json

-Enable debug :
+Enable debug:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --debug

Enable verbose output:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --verbose

Enable report to print a summary of the crawl:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --report
Binary file removed __pycache__/config.cpython-32.pyc
Binary file removed __pycache__/crawler.cpython-32.pyc
6 changes: 3 additions & 3 deletions config.json
@@ -4,10 +4,10 @@
"pdf",
"xml"
],
-"parserobots":true,
+"parserobots":false,
"debug":true,
-"output":false,
+"output":"sitemap.xml",
"exclude": [
"action=edit"
]
-}
+}
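With this change the sample config stops parsing robots.txt by default and writes the sitemap to sitemap.xml instead of disabling file output. As a rough sketch only (the real wiring lives in main.py via --config and is not shown in this diff; the domain below is just the README example), these keys map onto the Crawler constructor updated in crawler.py:

import json

import crawler

with open("config.json") as f:
    conf = json.load(f)

c = crawler.Crawler(
    domain="http://blog.lesite.us",              # assumed target domain
    output=conf.get("output"),                   # "sitemap.xml" after this change
    parserobots=conf.get("parserobots", False),  # now False by default
    exclude=conf.get("exclude", []),             # e.g. ["action=edit"]
    debug=conf.get("debug", False),
    verbose=False,
    report=False,
)
c.run()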
96 changes: 68 additions & 28 deletions crawler.py
@@ -1,15 +1,17 @@
import config
import logging
from urllib.parse import urljoin

import re
from urllib.parse import urlparse
from urllib.request import urlopen, Request
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
from datetime import datetime

import os

class Crawler():

# Variables
parserobots = False
output = None
@@ -21,26 +23,30 @@ class Crawler():
exclude = []
skipext = []
drop = []

debug = False

tocrawl = set([])
crawled = set([])
excluded = set([])

marked = {}

# TODO also search for window.location={.*?}
-linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
+linkregex = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')

rp = None
response_code={}
nb_url=1 # Number of url.
nb_rp=0 # Number of url blocked by the robots.txt
nb_exclude=0 # Number of url excluded by extension or word

output_file = None

target_domain = ""

-def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
+def __init__(self, parserobots=False, output=None, report=False ,domain="",
+		exclude=[], skipext=[], drop=[], debug=False, verbose=False):
self.parserobots = parserobots
self.output = output
self.report = report
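The tightened linkregex above matters more than it looks: the old pattern only matched anchors whose very first attribute is href, so tags such as <a class="nav" href="..."> were skipped entirely. A minimal comparison (the HTML below is made-up sample data):

import re

old = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')
new = re.compile(b'<a [^>]*href=[\'|"](.*?)[\'"].*?>')

html = b'<a class="nav" href="/about.html">About</a> <a href="/index.html">Home</a>'

print(old.findall(html))  # [b'/index.html'], the first link is missed
print(new.findall(html))  # [b'/about.html', b'/index.html']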
@@ -49,31 +55,44 @@ def __init__(self, parserobots=False, output=None, report=False ,domain="", excl
self.skipext = skipext
self.drop = drop
self.debug = debug
self.verbose = verbose

if self.debug:
-logging.basicConfig(level=logging.DEBUG)
+log_level = logging.DEBUG
elif self.verbose:
log_level = logging.INFO
else:
log_level = logging.ERROR

logging.basicConfig(level=log_level)

self.tocrawl = set([domain])

try:
self.target_domain = urlparse(domain)[1]
except:
logging.error("Invalide domain")
raise ("Invalid domain")


if self.output:
try:
self.output_file = open(self.output, 'w')
except:
-logging.debug ("Output file not available.")
+logging.error ("Output file not available.")
exit(255)

def run(self):
-print (config.xml_header, file=self.output_file)
+print(config.xml_header, file=self.output_file)

if self.parserobots:
self.check_robots()

logging.info("Start the crawling process")

while len(self.tocrawl) != 0:
self.__crawling()

-logging.debug("Start the crawling process")
-self.__crawling()
-logging.debug("Crawling as reach the end of all found link")
+logging.info("Crawling has reached end of all found links")

print (config.xml_footer, file=self.output_file)
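run() now drains the tocrawl set in a loop instead of relying on recursive calls, and the constructor picks a log level: --debug wins over --verbose, and with neither flag only errors are logged. A standalone sketch of that branch (pick_level is an illustrative helper, not part of the commit):

import logging

def pick_level(debug, verbose):
    if debug:
        return logging.DEBUG
    elif verbose:
        return logging.INFO
    return logging.ERROR

print(pick_level(debug=True, verbose=False) == logging.DEBUG)   # True
print(pick_level(debug=False, verbose=True) == logging.INFO)    # True
print(pick_level(debug=False, verbose=False) == logging.ERROR)  # True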

@@ -83,18 +102,26 @@ def __crawling(self):

url = urlparse(crawling)
self.crawled.add(crawling)

logging.info("Crawling #{}: {}".format(len(self.crawled), url.geturl()))
-request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})

try:
+request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
response = urlopen(request)
except Exception as e:
if hasattr(e,'code'):
if e.code in self.response_code:
self.response_code[e.code]+=1
else:
self.response_code[e.code]=1

# Handle marked URLs for reporting
if self.report:
if e.code in self.marked:
self.marked[e.code].append(crawling)
else:
self.marked[e.code] = [crawling]

logging.debug ("{1} ==> {0}".format(e, crawling))
response.close()
return self.__continue_crawling()

# Read the response
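The enlarged except branch above does two pieces of bookkeeping: response_code counts every HTTP status seen, and, when --report is enabled, marked keeps the failing URLs grouped by status so make_report() can list them later. An isolated sketch of that logic (record_error is an illustrative helper and the URLs are sample data):

response_code = {}
marked = {}

def record_error(code, url, report=True):
    if code in response_code:
        response_code[code] += 1
    else:
        response_code[code] = 1
    if report:
        if code in marked:
            marked[code].append(url)
        else:
            marked[code] = [url]

record_error(404, "http://blog.lesite.us/old-post")
record_error(404, "http://blog.lesite.us/missing.png")
print(response_code)  # {404: 2}
print(marked)         # {404: ['http://blog.lesite.us/old-post', 'http://blog.lesite.us/missing.png']}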
@@ -104,28 +131,38 @@ def __crawling(self):
self.response_code[response.getcode()]+=1
else:
self.response_code[response.getcode()]=1

response.close()

# Get the last modify date
if 'last-modified' in response.headers:
date = response.headers['Last-Modified']
else:
date = response.headers['Date']

date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

except Exception as e:
logging.debug ("{1} ===> {0}".format(e, crawling))
-return self.__continue_crawling()
+return None


-print ("<url><loc>"+url.geturl()+"</loc></url>", file=self.output_file)
+print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file=self.output_file)
if self.output_file:
self.output_file.flush()

# Found links
links = self.linkregex.findall(msg)
for link in links:
link = link.decode("utf-8")
-#logging.debug("Found : {0}".format(link))
+logging.debug("Found : {0}".format(link))
if link.startswith('/'):
link = 'http://' + url[1] + link
elif link.startswith('#'):
link = 'http://' + url[1] + url[2] + link
elif not link.startswith('http'):
link = 'http://' + url[1] + '/' + link

# Remove the anchor part if needed
if "#" in link:
link = link[:link.index('#')]
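Each sitemap entry now carries a <lastmod> element derived from the response's Last-Modified header (falling back to Date). A worked example using the two format strings from the diff (the header value is sample data):

from datetime import datetime

header = "Sun, 16 Apr 2017 08:30:00 GMT"
date = datetime.strptime(header, '%a, %d %b %Y %H:%M:%S %Z')
print(date.strftime('%Y-%m-%dT%H:%M:%S+00:00'))  # 2017-04-16T08:30:00+00:00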
@@ -149,7 +186,7 @@ def __crawling(self):
continue
if ("javascript" in link):
continue

# Count one more URL
self.nb_url+=1

@@ -173,7 +210,7 @@ def __crawling(self):

self.tocrawl.add(link)

-return self.__continue_crawling()
+return None

def __continue_crawling(self):
if self.tocrawl:
@@ -183,12 +220,10 @@ def exclude_link(self,link):
if link not in self.excluded:
self.excluded.add(link)

-def checkRobots(self):
-	if self.domain[len(self.domain)-1] != "/":
-		self.domain += "/"
-	request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
+def check_robots(self):
+	robots_url = urljoin(self.domain, "robots.txt")
self.rp = RobotFileParser()
-self.rp.set_url(self.domain+"robots.txt")
+self.rp.set_url(robots_url)
self.rp.read()

def can_fetch(self, link):
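check_robots() replaces the old checkRobots(): urljoin builds the robots.txt URL without manual trailing-slash handling, and the resulting RobotFileParser is what the crawler's can_fetch() consults. A self-contained sketch (the rules are sample data, not the site's real robots.txt):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

print(urljoin("http://blog.lesite.us", "robots.txt"))   # http://blog.lesite.us/robots.txt
print(urljoin("http://blog.lesite.us/", "robots.txt"))  # http://blog.lesite.us/robots.txt

rp = RobotFileParser()
rp.parse(["User-agent: *", "Disallow: /private/"])
print(rp.can_fetch("*", "http://blog.lesite.us/private/page.html"))  # False
print(rp.can_fetch("*", "http://blog.lesite.us/index.html"))         # True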
@@ -224,4 +259,9 @@ def make_report(self):
print ("Number of link exclude : {0}".format(self.nb_exclude))

for code in self.response_code:
-print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))
+print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

for code in self.marked:
print ("Link with status {0}:".format(code))
for uri in self.marked[code]:
print ("\t- {0}".format(uri))
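For illustration, this is the kind of summary the two new report loops print for sample data (the counts and URLs below are made up):

response_code = {200: 12, 404: 2}
marked = {404: ["http://blog.lesite.us/old-post", "http://blog.lesite.us/missing.png"]}

for code in response_code:
    print("Nb Code HTTP {0} : {1}".format(code, response_code[code]))

for code in marked:
    print("Link with status {0}:".format(code))
    for uri in marked[code]:
        print("\t- {0}".format(uri))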
5 changes: 3 additions & 2 deletions main.py
@@ -6,11 +6,12 @@
import crawler

# Parameter handling
-parser = argparse.ArgumentParser(version="0.1",description='Crawler pour la creation de site map')
+parser = argparse.ArgumentParser(description='Crawler pour la creation de site map')

parser.add_argument('--skipext', action="append", default=[], required=False, help="File extension to skip")
parser.add_argument('--parserobots', action="store_true", default=False, required=False, help="Ignore file defined in robots.txt")
parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
parser.add_argument('-v', '--verbose', action="store_true", help="Enable verbose output")
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop a string from the url")
@@ -51,4 +52,4 @@
crawl.run()

if arg.report:
-crawl.make_report()
+crawl.make_report()
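Dropping version="0.1" is necessary because Python 3's argparse does not accept a version keyword on ArgumentParser; the supported replacement is a dedicated --version argument. A hedged sketch of the new option handling (the --version argument is illustrative and not part of this commit):

import argparse

parser = argparse.ArgumentParser(description='Crawler for sitemap creation')
parser.add_argument('--version', action='version', version='0.1')
parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose output")
parser.add_argument('--debug', action='store_true', help="Enable debug mode")

args = parser.parse_args(['--verbose'])
print(args.verbose, args.debug)  # True False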
