Hop, resync your fork #1

Merged
merged 9 commits
Aug 5, 2012
9 changes: 7 additions & 2 deletions README.md
@@ -11,6 +11,11 @@ Simple usage
Advanced usage
--------------

Read a config file to set parameters:
***You can override (or, for list parameters, add to) any parameter defined in config.json.***

>>> python main.py --config config.json
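Command-line flags can be combined with a config file; for list options such as --skipext, the values from both sources are merged. A hypothetical example, using the config.json added in this pull request:

>>> python main.py --config config.json --skipext doc

With the sample config this skips pdf and xml (from the config) as well as doc (from the flag).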

Enable debug:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --debug
@@ -19,10 +24,10 @@ Skip url (by extension) (skip pdf AND xml url):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml

Exclude url :
Exclude URLs by filtering on part of the URL:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit"

Read the robots.txt to ignore some URLs:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
13 changes: 13 additions & 0 deletions config.json
@@ -0,0 +1,13 @@
{
"domain":"http://blog.lesite.us",
"skipext": [
"pdf",
"xml"
],
"parserobots":true,
"debug":false,
"output":false,
"exclude": [
"action=edit"
]
}
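For reference, crawling with this sample configuration is roughly equivalent to the following invocation (no output file is set, so the sitemap is written to stdout):

>>> python main.py --domain http://blog.lesite.us --skipext pdf --skipext xml --parserobots --exclude "action=edit"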
67 changes: 54 additions & 13 deletions main.py
@@ -6,13 +6,15 @@
import argparse
import os

def can_fetch(parserobots, rp, link):
import json

def can_fetch(parserobots, rp, link, debug=False):
    try:
        if parserobots:
            if rp.can_fetch("*", link):
                return True
            else:
                if arg.debug:
                if debug:
                    print ("Crawling of {0} disabled by robots.txt".format(link))
                return False

@@ -22,7 +24,7 @@ def can_fetch(parserobots, rp, link):
        return True
    except:
        # On error continue!
        if arg.debug:
        if debug:
            print ("Error during parsing robots.txt")
        return True
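For context, `rp` is a robots.txt parser built elsewhere in main.py, outside this diff. A minimal sketch of the standard-library object that can_fetch() expects, with a placeholder URL (the real script derives it from --domain):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://blog.lesite.us/robots.txt")  # placeholder URL for illustration
rp.read()

# rp.can_fetch("*", link) is what can_fetch() above consults when --parserobots is set
print(rp.can_fetch("*", "http://blog.lesite.us/wp-admin/"))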

@@ -38,27 +40,63 @@ def exclude_url(exclude, link):

# Gestion des parametres
parser = argparse.ArgumentParser(version="0.1",description='Crawler pour la creation de site map')
parser.add_argument('--domain', action="store", default="",required=True, help="Target domain (ex: http://blog.lesite.us)")

parser.add_argument('--skipext', action="append", default=[], required=False, help="File extension to skip")
parser.add_argument('--parserobots', action="store_true", default=False, required=False, help="Ignore file defined in robots.txt")
parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
parser.add_argument('--output', action="store", default=None, help="Output file")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Regular expression for exclude URL")
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude Url if contain")

group = parser.add_mutually_exclusive_group()
group.add_argument('--config', action="store", default=None, help="Configuration file in json format")
group.add_argument('--domain', action="store", default="", help="Target domain (ex: http://blog.lesite.us)")

arg = parser.parse_args()

outputFile = None
if arg.output is not None:
# Read the config file if needed
if arg.config is not None:
    try:
        config_data=open(arg.config,'r')
        config = json.load(config_data)
        config_data.close()
    except:
        if arg.debug:
            print ("Bad or unavailable config file")
        config = {}
else:
    config = {}

# Overload config with flag parameters
dict_arg = arg.__dict__
for argument in dict_arg:
    if argument in config:
        if type(config[argument]).__name__ == 'list':
            dict_arg[argument].extend(config[argument])
        else:
            dict_arg[argument] = config[argument]
# if dict_arg[argument] is not (None or ""):
# # try:
# if "argument" in config and type(config[argument]).__name__ == 'list':
# config[argument].extend(dict_arg[argument])
# elif "argument" in config:
# config[argument] = dict_arg[argument]
# # except:
# # pass
if arg.debug:
print ("Configuration : ")
print (arg)

output_file = None
if arg.output:
    try:
        outputFile = open(arg.output, 'w')
        output_file = open(arg.output, 'w')
    except:
        if not arg.debug:
            print ("Output file not available.")
            exit(255)
        else:
            print ("Continue without output file.")


tocrawl = set([arg.domain])
crawled = set([])
# TODO also search for window.location={.*?}
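As an aside, the merge loop added above extends list-type flags with the config values and lets a scalar config value replace the command-line value. A minimal, self-contained sketch of that behavior (flag names reused from the script, argument values invented for illustration):

import argparse
import json

parser = argparse.ArgumentParser()
parser.add_argument('--skipext', action="append", default=[])
parser.add_argument('--debug', action="store_true", default=False)
arg = parser.parse_args(['--skipext', 'doc', '--debug'])

config = json.loads('{"skipext": ["pdf", "xml"], "debug": false}')

dict_arg = arg.__dict__
for argument in dict_arg:
    if argument in config:
        if isinstance(config[argument], list):
            dict_arg[argument].extend(config[argument])
        else:
            dict_arg[argument] = config[argument]

print(arg.skipext)  # ['doc', 'pdf', 'xml'] -- the list is merged
print(arg.debug)    # False -- the config value replaces --debug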
@@ -89,7 +127,7 @@ def exclude_url(exclude, link):
    rp.read()


print (header, file=outputFile)
print (header, file=output_file)
while tocrawl:
    crawling = tocrawl.pop()

@@ -125,10 +163,13 @@ def exclude_url(exclude, link):
        domain_link = parsed_link.netloc
        target_extension = os.path.splitext(parsed_link.path)[1][1:]

        if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
            print ("<url><loc>"+link+"</loc></url>", file=outputFile)
        if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
            print ("<url><loc>"+link+"</loc></url>", file=output_file)
            tocrawl.add(link)
print (footer, file=outputFile)
print (footer, file=output_file)

if arg.debug:
    print ("Number of link crawled : {0}".format(len(crawled)))

if output_file:
    output_file.close()
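The header and footer written around the <url> entries are defined earlier in main.py and are not part of this diff. Assuming they are the standard sitemap urlset wrapper, a standalone sketch of what the generated file would roughly look like (links invented for illustration):

# Hypothetical header/footer values; the real ones live outside this diff.
header = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
footer = '</urlset>'
links = ["http://blog.lesite.us/", "http://blog.lesite.us/about/"]

with open("sitemap.xml", "w") as output_file:
    print(header, file=output_file)
    for link in links:
        print("<url><loc>" + link + "</loc></url>", file=output_file)
    print(footer, file=output_file)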