spidy v1.3
Improved file saving: All files saved and opened with UTF-8 encoding, and UnicodeErrors are ignored.
rivermont committed Sep 14, 2017
1 parent e8c9405 commit 6fb72be
Showing 5 changed files with 91 additions and 70 deletions.
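The core change is a single file-handling pattern applied throughout crawler.py: every open() call now passes `encoding='utf-8'` and `errors='ignore'`, save_page() writes decoded text (`page.text`) instead of raw bytes, and save_files() keeps its per-line try/except UnicodeError as an extra fallback. A minimal sketch of the pattern, assuming a placeholder file name and URL list that are not part of the commit:

```python
# Sketch of the open() pattern this commit applies; the file name and URLs
# below are placeholders for illustration, not values from crawler.py.
urls = ['https://example.com/café', 'https://example.com/plain']

with open('example_todo.txt', 'w', encoding='utf-8', errors='ignore') as f:
    for url in urls:
        # With errors='ignore', anything that cannot be encoded is dropped
        # silently instead of raising UnicodeError and aborting the save.
        f.write(url + '\n')
```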
9 changes: 5 additions & 4 deletions README.md
@@ -7,20 +7,21 @@ Pretty simple!
Developed by [rivermont](https://github.com/rivermont) (/rɪvɜːrmɒnt/) and [FalconWarriorr](https://github.com/Casillas-) (/fælcʌnraɪjɔːr/).<br>
Looking for technical documentation? Check out [docs.md](https://github.com/rivermont/spidy/blob/master/docs.md)

[![Version: 1.2.0](https://img.shields.io/badge/version-1.2.0-brightgreen.svg)](https://github.com/rivermont/spidy/releases)
[![Version: 1.3.0](https://img.shields.io/badge/version-1.2.0-brightgreen.svg)](https://github.com/rivermont/spidy/releases)
[![License: GPL v3](https://img.shields.io/badge/license-GPLv3.0-blue.svg)](http://www.gnu.org/licenses/gpl-3.0)
[![Python: 3.5](https://img.shields.io/badge/python-3.5-brightgreen.svg)](https://docs.python.org/3/)
[![Python: 3](https://img.shields.io/badge/python-3-lightgrey.svg)](https://docs.python.org/3/)
<br>
[![Lines of Code: 933](https://img.shields.io/badge/lines%20of%20code-933-green.svg)](#)
[![Lines of Docs: 537](https://img.shields.io/badge/lines%20of%20docs-537-orange.svg)](#)
[![Lines of Code: 930](https://img.shields.io/badge/lines%20of%20code-930-green.svg)](#)
[![Lines of Docs: 538](https://img.shields.io/badge/lines%20of%20docs-538-orange.svg)](#)

--------------------

# New Features!

### Domain Limiting - #[e229b01](https://github.com/rivermont/spidy/commit/e229b01eed7e1f95530d06afc671e40dbf4dac53)
Scrape only a single site instead of the whole internet. May use slightly less space on your disk.
Scrape only a single site instead of the whole internet. May use slightly less space on your disk.<br>
See `/config/wsj.cfg` for an example.

### Release v1.0!
[spidy Web Crawler Release 1.0](https://github.com/rivermont/spidy/releases/tag/1.0)
2 changes: 1 addition & 1 deletion config/rivermont.cfg
@@ -1,5 +1,5 @@
OVERWRITE = False
RAISE_ERRORS = True
RAISE_ERRORS = False
SAVE_PAGES = True
ZIP_FILES = False
SAVE_WORDS = False
23 changes: 23 additions & 0 deletions config/wsj.cfg
@@ -0,0 +1,23 @@
OVERWRITE = False
RAISE_ERRORS = False
SAVE_PAGES = True
SAVE_WORDS = False
ZIP_FILES = False

# Whether to restrict crawling to a single domain or not.
RESTRICT = True

# The domain within which to restrict crawling.
DOMAIN = 'wsj.com/'

TODO_FILE = 'wsj_todo.txt'
DONE_FILE = 'wsj_done.txt'
WORD_FILE = 'wsj_words.txt'
BAD_FILE = 'wsj_bad.txt'
SAVE_COUNT = 60
HEADER = HEADERS['spidy']
MAX_NEW_ERRORS = 100
MAX_KNOWN_ERRORS = 100
MAX_HTTP_ERRORS = 100
MAX_NEW_MIMES = 5
START = ['https://www.wsj.com/']
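The new wsj.cfg demonstrates the domain-limiting feature mentioned in the README: `RESTRICT` switches the filter on and `DOMAIN` is the string that crawled links must contain. The filtering code itself is not part of this diff; the sketch below shows one plausible way such a check could be applied when queuing links (the `allowed()` helper and the sample link list are hypothetical, not spidy's actual implementation):

```python
# Hypothetical sketch of domain-limited link filtering; not code from spidy.
RESTRICT = True
DOMAIN = 'wsj.com/'

def allowed(link):
    # Keep a link if restriction is off, or if the configured domain
    # appears somewhere in the URL (mirroring DOMAIN = 'wsj.com/' above).
    return (not RESTRICT) or (DOMAIN in link)

links = ['https://www.wsj.com/news/world', 'https://example.org/elsewhere']
todo = [link for link in links if allowed(link)]
print(todo)  # ['https://www.wsj.com/news/world']
```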
127 changes: 62 additions & 65 deletions crawler.py
@@ -2,7 +2,7 @@
spidy Web Crawler
Built by rivermont and FalconWarriorr
"""
VERSION = '1.2.0'
VERSION = '1.3.0'

##########
# IMPORT #
@@ -20,6 +20,7 @@ def get_time():
def get_full_time():
return t.strftime('%H:%M:%S, %A %b %Y')


START_TIME = int(t.time())
START_TIME_LONG = get_time()

@@ -34,7 +35,7 @@ def get_full_time():
except OSError:
pass # Assumes only OSError will complain if logs/ already exists

LOG_FILE = open('{0}\\logs\\spidy_log_{1}.txt'.format(CRAWLER_DIR, START_TIME), 'w+')
LOG_FILE = open('{0}\\logs\\spidy_log_{1}.txt'.format(CRAWLER_DIR, START_TIME), 'w+', encoding='utf-8', errors='ignore')
LOG_FILE_NAME = 'logs\\spidy_log_{0}'.format(START_TIME)


@@ -47,6 +48,7 @@ def write_log(message):
print(message)
LOG_FILE.write('\n' + message)


write_log('[INIT]: Starting spidy Web Crawler version {0}'.format(VERSION))
write_log('[INIT]: Importing required libraries...')

@@ -131,7 +133,7 @@ def make_words(site):
"""
Returns list of all valid words in page.
"""
page = str(site.content) # Get page content
page = site.text # Get page content
word_list = page.split() # Split content into lists of words, as separated by spaces
del page
word_list = list(set(word_list)) # Remove duplicates
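make_words() now splits `site.text` rather than `str(site.content)`. In requests, `.content` is the raw bytes of the response, so wrapping it in str() yields a bytes repr full of escape sequences, while `.text` is the body decoded to a proper str. A small offline illustration of the difference, using a hard-coded byte string as a stand-in for a response body:

```python
# Stand-in for requests' Response.content (bytes) vs Response.text (str).
raw = 'Café menu'.encode('utf-8')      # what .content holds

print(str(raw))             # prints b'Caf\xc3\xa9 menu' -- bytes repr, escape noise in the word list
print(raw.decode('utf-8'))  # prints Café menu           -- what .text provides after decoding
```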
@@ -146,15 +148,15 @@ def save_files():
Saves the TODO, done, word, and bad lists into their respective files.
Also logs the action to the console.
"""
with open(TODO_FILE, 'w') as todoList:
with open(TODO_FILE, 'w', encoding='utf-8', errors='ignore') as todoList:
for site in TODO:
try:
todoList.write(site + '\n') # Save TODO list
except UnicodeError:
continue
write_log('[LOG]: Saved TODO list to {0}'.format(TODO_FILE))

with open(DONE_FILE, 'w') as done_list:
with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list:
for site in DONE:
try:
done_list.write(site + '\n') # Save done list
@@ -219,16 +221,16 @@ def save_page(url, page):
file_path = '{0}\\saved\\{1}{2}'.format(CRAWLER_DIR, cropped_url, ext)

# Save file
with open(file_path, 'wb+') as file:
file.write(bytes('''<!-- "{0}" -->
with open(file_path, 'w', encoding='utf-8', errors='ignore') as file:
file.write('''<!-- "{0}" -->
<!-- Downloaded with the spidy Web Crawler -->
<!-- https://github.com/rivermont/spidy -->
'''.format(url), 'ascii'))
file.write(page.content)
'''.format(url))
file.write(page.text)


def update_file(file, content, file_type):
with open(file, 'r+') as open_file: # Open save file for reading and writing
with open(file, 'r+', encoding='utf-8', errors='ignore') as open_file: # Open save file for reading and writing
file_content = open_file.readlines() # Make list of all lines in file
contents = []
for x in file_content:
@@ -263,7 +265,7 @@ def log(message):
Logs a single message to the error log file.
Prints message verbatim, so message must be formatted correctly in the function call.
"""
with open(ERR_LOG_FILE, 'a') as open_file:
with open(ERR_LOG_FILE, 'a', encoding='utf-8', errors='ignore') as open_file:
open_file.write('\n\n======LOG======') # Write opening line
open_file.write('\nTIME: {0}'.format(get_full_time())) # Write current time
open_file.write(message) # Write message
@@ -293,7 +295,7 @@ def err_log(url, error1, error2):
error2 is the extended text of the error.
"""
time = t.strftime('%H:%M:%S, %A %b %Y') # Get the current time
with open(ERR_LOG_FILE, 'a') as work_log:
with open(ERR_LOG_FILE, 'a', encoding='utf-8', errors='ignore') as work_log:
work_log.write('\n\n=====ERROR=====') # Write opening line
work_log.write('\nTIME: {0}\nURL: {1}\nERROR: {2}\nEXT: {3}'.format(time, url, error1, str(error2)))
work_log.write(LOG_END) # Write closing line
@@ -539,7 +541,7 @@ def init():
else:
file_path = 'config\\{0}.cfg'.format(input_)
write_log('[INFO]: Loading configuration settings from {0}'.format(file_path))
with open(file_path, 'r') as file:
with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
for line in file.readlines():
exec(line, globals())
except FileNotFoundError:
@@ -730,15 +732,15 @@ def init():
write_log('[INIT]: Loading save files...')
# Import saved TODO file data
try:
with open(TODO_FILE, 'r') as f:
with open(TODO_FILE, 'r', encoding='utf-8', errors='ignore') as f:
contents = f.readlines()
except FileNotFoundError: # If no TODO file is present
contents = []
for line in contents:
TODO.append(line.strip())
# Import saved done file data
try:
with open(DONE_FILE, 'r') as f:
with open(DONE_FILE, 'r', encoding='utf-8', errors='ignore') as f:
contents = f.readlines()
except FileNotFoundError: # If no DONE file is present
contents = []
@@ -775,10 +777,10 @@ def main():
pass # Assumes only OSError will complain if saved/ already exists

# Create required files
with open(WORD_FILE, 'w'):
with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'):
pass

with open(BAD_FILE, 'w'):
with open(BAD_FILE, 'w', encoding='utf-8', errors='ignore'):
pass

write_log('[INIT]: Successfully started spidy Web Crawler version {0}...'.format(VERSION))
@@ -817,7 +819,7 @@ def main():
word_list = make_words(page) # Get all words from page
WORDS.update(word_list) # Add words to word list
try:
links = [link for element, attribute, link, pos in html.iterlinks(page.content)]
links = [link for element, attribute, link, pos in html.iterlinks(page.text)]
except (etree.XMLSyntaxError, etree.ParserError):
links = []
links = list(set(links)) # Remove duplicates and shuffle links
Expand All @@ -844,58 +846,53 @@ def main():
write_log('[INFO]: An error was raised trying to process {0}'.format(link))
err_mro = type(e).mro()

# HTTP Errors
if str(e) == 'HTTP Error 403: Forbidden':
write_log('[ERR]: HTTP 403: Access Forbidden.')
BAD_LINKS.add(link)

elif str(e) == 'HTTP Error 429: Too Many Requests':
write_log('[ERR]: HTTP 429: Too Many Requests.')
TODO += TODO[0] # Move link to end of TODO list

elif etree.XMLSyntaxError in err_mro or etree.ParserError in err_mro: # Error processing html/xml
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: An XMLSyntaxError occurred. A web dev screwed up somewhere.')
err_log(link, 'XMLSyntaxError', e)

elif UnicodeError in err_mro: # Error trying to convert foreign characters to Unicode
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: A UnicodeError occurred. URL had a foreign character or something.')
err_log(link, 'UnicodeError', e)

elif requests.exceptions.SSLError in err_mro: # Invalid SSL certificate
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: An SSLError occurred. Site is using an invalid certificate.')
err_log(link, 'SSLError', e)
BAD_LINKS.add(link)

elif requests.exceptions.ConnectionError in err_mro: # Error connecting to page
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: A ConnectionError occurred. There\'s something wrong with somebody\'s network.')
err_log(link, 'ConnectionError', e)

elif requests.exceptions.TooManyRedirects in err_mro: # Exceeded 30 redirects.
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: A TooManyRedirects error occurred. Page is probably part of a redirect loop.')
err_log(link, 'TooManyRedirects', e)
BAD_LINKS.add(link)

elif requests.exceptions.ContentDecodingError in err_mro:
# Received response with content-encoding: gzip, but failed to decode it.
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: A ContentDecodingError occurred. Probably just a zip bomb, nothing to worry about.')
err_log(link, 'ContentDecodingError', e)

elif OSError in err_mro:
if OSError in err_mro:
KNOWN_ERROR_COUNT += 1
write_log('[ERR]: An OSError occurred.')
err_log(link, 'OSError', e)
BAD_LINKS.add(link)

elif 'Unknown MIME type' in str(e):
NEW_MIME_COUNT += 1
write_log('[ERR]: Unknown MIME type: {0}'.format(str(e)[18:]))
err_log(link, 'Unknown MIME', e)
# HTTP Errors
# elif str(e) == 'HTTP Error 403: Forbidden':
# write_log('[ERR]: HTTP 403: Access Forbidden.')
# BAD_LINKS.add(link)

# elif str(e) == 'HTTP Error 429: Too Many Requests':
# write_log('[ERR]: HTTP 429: Too Many Requests.')
# TODO += TODO[0] # Move link to end of TODO list

# elif etree.XMLSyntaxError in err_mro or etree.ParserError in err_mro: # Error processing html/xml
# KNOWN_ERROR_COUNT += 1
# write_log('[ERR]: An XMLSyntaxError occurred. A web dev screwed up somewhere.')
# err_log(link, 'XMLSyntaxError', e)

# elif requests.exceptions.SSLError in err_mro: # Invalid SSL certificate
# KNOWN_ERROR_COUNT += 1
# write_log('[ERR]: An SSLError occurred. Site is using an invalid certificate.')
# err_log(link, 'SSLError', e)
# BAD_LINKS.add(link)

# elif requests.exceptions.ConnectionError in err_mro: # Error connecting to page
# KNOWN_ERROR_COUNT += 1
# write_log('[ERR]: A ConnectionError occurred. There\'s something wrong with somebody\'s network.')
# err_log(link, 'ConnectionError', e)

# elif requests.exceptions.TooManyRedirects in err_mro: # Exceeded 30 redirects.
# KNOWN_ERROR_COUNT += 1
# write_log('[ERR]: A TooManyRedirects error occurred. Page is probably part of a redirect loop.')
# err_log(link, 'TooManyRedirects', e)
# BAD_LINKS.add(link)

# elif requests.exceptions.ContentDecodingError in err_mro:
# # Received response with content-encoding: gzip, but failed to decode it.
# KNOWN_ERROR_COUNT += 1
# write_log('[ERR]: A ContentDecodingError occurred. Probably just a zip bomb, nothing to worry about.')
# err_log(link, 'ContentDecodingError', e)

# elif 'Unknown MIME type' in str(e):
# NEW_MIME_COUNT += 1
# write_log('[ERR]: Unknown MIME type: {0}'.format(str(e)[18:]))
# err_log(link, 'Unknown MIME', e)

else: # Any other error
NEW_ERROR_COUNT += 1
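This revision also narrows the live error handling in main(): only the `if OSError in err_mro` branch and the final generic else remain active, while the HTTP, XML, SSL, connection, redirect, decoding, and MIME-specific handlers are kept only as commented-out reference (the old UnicodeError branch disappears entirely, consistent with the commit's errors='ignore' change). Dispatch still relies on checking the exception's method resolution order, as in `err_mro = type(e).mro()` above. A small self-contained sketch of that dispatch style (the `handle()` function and sample exceptions are illustrative only, not spidy code):

```python
# Illustrative sketch of mro()-based exception dispatch, similar in spirit to
# the `if OSError in err_mro` check in crawler.py; handle() is not spidy code.
def handle(e):
    err_mro = type(e).mro()      # e.g. [ConnectionError, OSError, Exception, ...]
    if OSError in err_mro:       # matches OSError and every subclass of it
        return 'known error: {0}'.format(type(e).__name__)
    return 'new error: {0}'.format(type(e).__name__)

print(handle(ConnectionError('connection refused')))  # known error: ConnectionError
print(handle(ValueError('bad value')))                # new error: ValueError
```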
Binary file removed media/physics.dll.png
