Add "drop" parameter and crawler summary at the end #7

Merged
merged 7 commits
Aug 12, 2012
6 changes: 5 additions & 1 deletion README.md
@@ -24,10 +24,14 @@ Skip url (by extension) (skip pdf AND xml url):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml

Drop an attribute from URLs (regexp):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --drop "id=[0-9]{5}"
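
With the pattern above, any five-digit "id" attribute is stripped from matched URLs before they are written to the sitemap. A quick illustration of the same substitution in a Python shell (illustrative URL; the crawler applies the pattern with re.sub):

>>> import re
>>> re.sub("id=[0-9]{5}", "", "http://blog.lesite.us/article?id=12345&page=2")
'http://blog.lesite.us/article?&page=2'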

Exclude URLs that contain a given string:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit"
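
The filter is a substring test. A minimal sketch of what the exclude_url helper presumably does (its real definition lives earlier in main.py and is not shown in this diff):

>>> def exclude_url(exclude, link):
...     return not any(pattern in link for pattern in exclude)
...
>>> exclude_url(["action=edit"], "http://blog.lesite.us/?title=Foo&action=edit")
False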

Read robots.txt to ignore some URLs:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
85 changes: 71 additions & 14 deletions main.py
Expand Up @@ -47,23 +47,24 @@ def exclude_url(exclude, link):
parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude URL if it contains this string")
parser.add_argument('--drop', action="append", default=[], required=False, help="Drop this pattern (regexp) from URLs")

group = parser.add_mutually_exclusive_group()
group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
group.add_argument('--domain', action="store", default="", help="Target domain (ex: http://blog.lesite.us)")

arg = parser.parse_args()

# Read the config file if needed
if arg.config is not None:
try:
config_data=open(arg.config,'r')
config = json.load(config_data)
config_data.close()
except:
except Exception as e:
if arg.debug:
logging.debug ("Bad or unavailable config file")
config = {}
print(e)
else:
config = {}

@@ -101,6 +102,7 @@ def exclude_url(exclude, link):

tocrawl = set([arg.domain])
crawled = set([])
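# Links rejected below (robots.txt, skipped extension, exclude filter) are remembered here so each is only examined once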
excluded = set([])
# TODO also search for window.location={.*?}
linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

@@ -128,32 +130,53 @@ def exclude_url(exclude, link):
rp.read()

responseCode={}
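# Counters for the summary printed at the end: nbUrl = URLs found, nbRp = URLs blocked by robots.txt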
nbUrl=1
nbRp=0
print (header, file=output_file)
while tocrawl:
crawling = tocrawl.pop()


url = urlparse(crawling)
crawled.add(crawling)

try:
request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
# TODO : in Python 3, urlopen() comes from urllib.request (urllib2 no longer exists)
response = urlopen(request)
except Exception as e:
if hasattr(e,'code'):
if e.code in responseCode:
responseCode[e.code]+=1
else:
responseCode[e.code]=1
#else:
# responseCode['erreur']+=1
if arg.debug:
logging.debug ("{1} ==> {0}".format(e, crawling))
continue

# Read the response
try:
msg = response.read()
if response.getcode() in responseCode:
responseCode[response.getcode()]+=1
else:
responseCode[response.getcode()] = 0
if response.getcode()==200:
msg = response.read()
else:
msg = ""

responseCode[response.getcode()]=1
response.close()
except Exception as e:
if arg.debug:
logging.debug ("{1} ==> {0}".format(e, crawling))
logging.debug ("{1} ===> {0}".format(e, crawling))
continue



print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
if output_file:
output_file.flush()

# Found links
links = linkregex.findall(msg)
crawled.add(crawling)
for link in links:
link = link.decode("utf-8")
if link.startswith('/'):
@@ -167,18 +190,52 @@ def exclude_url(exclude, link):
if "#" in link:
link = link[:link.index('#')]

# Drop attributes if needed
if arg.drop is not None:
for toDrop in arg.drop:
link=re.sub(toDrop,'',link)

# Parse the url to get domain and file extension
parsed_link = urlparse(link)
domain_link = parsed_link.netloc
target_extension = os.path.splitext(parsed_link.path)[1][1:]

if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
print ("<url><loc>"+link+"</loc></url>", file=output_file)
tocrawl.add(link)
if (link in crawled):
continue
if (link in tocrawl):
continue
if (link in excluded):
continue
if (domain_link != target_domain):
continue
if ("javascript" in link):
continue

# Count one more URL
nbUrl+=1

if (can_fetch(arg.parserobots, rp, link, arg.debug) == False):
if link not in excluded:
excluded.add(link)
nbRp+=1
continue
if (target_extension in arg.skipext):
if link not in excluded:
excluded.add(link)
continue
if (exclude_url(arg.exclude, link)==False):
if link not in excluded:
excluded.add(link)
continue

tocrawl.add(link)
print (footer, file=output_file)

if arg.debug:
logging.debug ("Number of found URL : {0}".format(nbUrl))
logging.debug ("Number of link crawled : {0}".format(len(crawled)))
if arg.parserobots:
logging.debug ("Number of link block by robots.txt : {0}".format(nbRp))

for code in responseCode:
logging.debug ("Nb Code HTTP {0} : {1}".format(code, responseCode[code]))