From 0de72834b86dc06f8a4f859b0150434d03473afd Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Mon, 14 Mar 2022 23:09:21 +0000 Subject: [PATCH 1/4] Resolve pep 8 violations. WIP - Resolves PEP8 violations in modules/checker.py - Resolves PEP8 violations in torcrawler.py. --- .gitignore | 1 + modules/checker.py | 133 +++++++++++------- modules/crawler.py | 296 +++++++++++++++++++++------------------ modules/extractor.py | 201 +++++++++++++++------------ torcrawl.py | 321 ++++++++++++++++++++++--------------------- 5 files changed, 521 insertions(+), 431 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..723ef36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea \ No newline at end of file diff --git a/modules/checker.py b/modules/checker.py index c4a2152..abf5741 100644 --- a/modules/checker.py +++ b/modules/checker.py @@ -1,67 +1,96 @@ #!/usr/bin/python -import sys +import os import re import subprocess -import os -from urllib.request import urlopen +import sys from json import load +from urllib.error import HTTPError from urllib.parse import urlparse +from urllib.request import urlopen -def urlcanon(website, verbose): - if not website.startswith("http"): - if not website.startswith("www."): - website = "www." + website - if verbose: - print(("## URL fixed: " + website)) - website = "http://" + website - if verbose: - print(("## URL fixed: " + website)) - return website +def url_canon(website, verbose): + """ + + :param website: String - + :param verbose: Boolean - + :return: String 'website' - + """ + if not website.startswith("http"): + if not website.startswith("www."): + website = "www." + website + if verbose: + print(("## URL fixed: " + website)) + website = "http://" + website + if verbose: + print(("## URL fixed: " + website)) + return website def extract_domain(url, remove_http=True): - uri = urlparse(url) - if remove_http: - domain_name = f"{uri.netloc}" - else: - domain_name = f"{uri.netloc}://{uri.netloc}" - return domain_name + """ + + :param url: String - + :param remove_http: Boolean - + :return: String 'domain_name' - + """ + uri = urlparse(url) + if remove_http: + domain_name = f"{uri.netloc}" + else: + domain_name = f"{uri.netloc}://{uri.netloc}" + return domain_name # Create output path def folder(website, verbose): - outpath = website - if not os.path.exists(outpath): - os.makedirs(outpath) - if verbose: - print(("## Folder created: " + outpath)) - return outpath - - -# Check if TOR service is running -def checktor(verbose): - checkfortor = subprocess.check_output(['ps', '-e']) - - def findwholeword(w): - return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search - - if findwholeword('tor')(str(checkfortor)): - if verbose: - print("## TOR is ready!") - else: - print("## TOR is NOT running!") - print('## Enable tor with \'service tor start\' or add -w argument') - sys.exit(2) - - -# Check your IP from external website -def checkip(): - try: - webipcheck = 'https://api.ipify.org/?format=json' - my_ip = load(urlopen(webipcheck))['ip'] - print(('## Your IP: ' + my_ip)) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## IP can't obtain \n## Is " + webipcheck + "up?")) + """ Creates an output path for the findings. + + :param website: String - URL of website to crawl. + :param verbose: Boolean - Logging level. + :return: String 'out_path' - Path of the output folder. 
+ """ + out_path = website + if not os.path.exists(out_path): + os.makedirs(out_path) + if verbose: + print(f"## Folder created: {out_path}") + return out_path + + +def check_tor(verbose): + """Checks to see if TOR service is running on device. + Will exit if (-w) with argument is provided on application startup and TOR + service is not found to be running on the device. + + :param verbose: Boolean -'verbose' logging argument. + :return: None + """ + check_for_tor = subprocess.check_output(['ps', '-e']) + + def find_whole_word(word): + return re.compile(r'\b({0})\b'.format(word), + flags=re.IGNORECASE).search + + if find_whole_word('tor')(str(check_for_tor)): + if verbose: + print("## TOR is ready!") + else: + print("## TOR is NOT running!") + print('## Enable tor with \'service tor start\' or add -w argument') + sys.exit(2) + + +def check_ip(): + """ Checks users IP from external resource. + :return: None or HTTPError + """ + addr = 'https://api.ipify.org/?format=json' + try: + with load(urlopen(addr))['ip'] as my_ip: + print(f'## Your IP: {my_ip}') + except HTTPError as err: + error = sys.exc_info()[0] + print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? " + f"\n## HTTPError: {err}") diff --git a/modules/crawler.py b/modules/crawler.py index aaf8d38..9eb0b0f 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -1,145 +1,167 @@ #!/usr/bin/python -import sys import re -import urllib.request +import sys import time +import urllib.request +from urllib.error import HTTPError + from bs4 import BeautifulSoup -# Exclude links that we dont need -def excludes(link, website, outpath): - # BUG: For NoneType Exceptions, got to find a solution here - if link is None: - return True - # Links - elif '#' in link: - return True - # External links - elif link.startswith('http') and not link.startswith(website): - lstfile = open(outpath + '/extlinks.txt', 'w+') - lstfile.write(str(link) + '\n') - lstfile.close() - return True - # Telephone Number - elif link.startswith('tel:'): - lstfile = open(outpath + '/telephones.txt', 'w+') - lstfile.write(str(link) + '\n') - lstfile.close() - return True - # Mails - elif link.startswith('mailto:'): - lstfile = open(outpath + '/mails.txt', 'w+') - lstfile.write(str(link) + '\n') - lstfile.close() - return True - # Type of files - elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE): - return True - - -# Canonization of the link +def excludes(link, website, out_path): + """ Excludes links that are not required. 
+ + :param link: + :param website: + :param out_path: + :return: + """ + # BUG: For NoneType Exceptions, got to find a solution here + if link is None: + return True + # Links + elif '#' in link: + return True + # External links + elif link.startswith('http') and not link.startswith(website): + with open(out_path + '/extlinks.txt', 'w+') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Telephone Number + elif link.startswith('tel:'): + with open(out_path + '/telephones.txt', 'w+') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Mails + elif link.startswith('mailto:'): + with open(out_path + '/mails.txt', 'w+') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Type of files + elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE): + return True + + def canonical(link, website): - # Already formatted - if link.startswith(website): - return link - # For relative paths with / in front - elif link.startswith('/'): - if website[-1] == '/': - finalLink = website[:-1] + link - else: - finalLink = website + link - return finalLink - # For relative paths without / - elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, re.IGNORECASE): - # Pass to - if website[-1] == '/': - finalLink = website + link - else: - finalLink = website + "/" + link - return finalLink - # Clean links from '?page=' arguments - - -# Core of crawler -def crawler(website, cdepth, cpause, outpath, logs, verbose): - lst = set() - ordlst = [] - ordlst.insert(0, website) - ordlstind = 0 - - if logs: - global logfile - logfile = open(outpath + '/log.txt', 'w+') - - print(( - "## Crawler started from " + website + - " with " + str(cdepth) + " depth crawl and " + str(cpause) + " second(s) delay:" - )) - - # Depth - for x in range(0, int(cdepth)): - - # For every element of list - for item in ordlst: - - # Check if is the first element - if ordlstind > 0: - try: - if item is not None: - global html_page - html_page = urllib.request.urlopen(item) - except urllib.error.HTTPError as e: - print(e) - else: - html_page = urllib.request.urlopen(website) - ordlstind += 1 - - soup = BeautifulSoup(html_page, features="html.parser") - - # For each tag - for link in soup.findAll('a'): - link = link.get('href') - - if excludes(link, website, outpath): - continue - - verlink = canonical(link, website) - lst.add(verlink) - - # For each tag - for link in soup.findAll('area'): - link = link.get('href') - - if excludes(link, website, outpath): - continue - - verlink = canonical(link, website) - lst.add(verlink) - - # TODO: For images - # TODO: For scripts - - # Pass new on list and re-set it to delete duplicates - ordlst = ordlst + list(set(lst)) - ordlst = list(set(ordlst)) - - if verbose: - sys.stdout.write("-- Results: " + str(len(ordlst)) + "\r") - sys.stdout.flush() - - # Pause time - if (ordlst.index(item) != len(ordlst) - 1) and float(cpause) > 0: - time.sleep(float(cpause)) - - # Keeps logs for every webpage visited - if logs: - itcode = html_page.getcode() - logfile.write("[" + str(itcode) + "] " + str(item) + "\n") - - print(("## Step " + str(x + 1) + " completed with: " + str(len(ordlst)) + " result(s)")) - - if logs: - logfile.close() - - return ordlst + """ Canonization of the link. 
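+    Returns 'link' as-is when it already starts with 'website', otherwise joins relative paths onto 'website'.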
+ + :param link: + :param website: + :return: + """ + # Already formatted + if link.startswith(website): + return link + # For relative paths with / in front + elif link.startswith('/'): + if website[-1] == '/': + final_link = website[:-1] + link + else: + final_link = website + link + return final_link + # For relative paths without / + elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, + re.IGNORECASE): + # Pass to + if website[-1] == '/': + final_link = website + link + else: + final_link = website + "/" + link + return final_link + + +# Clean links from '?page=' arguments + + +def crawler(website, c_depth, c_pause, out_path, logs, verbose): + """ Core of the crawler. + + :param website: + :param c_depth: + :param c_pause: + :param out_path: + :param logs: + :param verbose: + :return: + """ + lst = set() + ord_lst = [] + ord_lst.insert(0, website) + ord_lstind = 0 + + if logs: + global log_file + log_file = open(out_path + '/log.txt', 'w+') + + print(f"## Crawler started from {website} with {str(c_depth)} depth " + f"crawl, and {str(c_pause)} second(s) delay.") + + # Depth + for x in range(0, int(c_depth)): + + # For every element of list + for item in ord_lst: + + # Check if is the first element + if ord_lstind > 0: + try: + if item is not None: + global html_page + html_page = urllib.request.urlopen(item) + except HTTPError as error: + print(error) + else: + html_page = urllib.request.urlopen(website) + ord_lstind += 1 + + soup = BeautifulSoup(html_page, features="html.parser") + + # For each tag + for link in soup.findAll('a'): + link = link.get('href') + + if excludes(link, website, out_path): + continue + + ver_link = canonical(link, website) + lst.add(ver_link) + + # For each tag + for link in soup.findAll('area'): + link = link.get('href') + + if excludes(link, website, out_path): + continue + + ver_link = canonical(link, website) + lst.add(ver_link) + + # TODO: For images + # TODO: For scripts + + # Pass new on list and re-set it to delete duplicates + ord_lst = ord_lst + list(set(lst)) + ord_lst = list(set(ord_lst)) + + if verbose: + sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r") + sys.stdout.flush() + + # Pause time + if (ord_lst.index(item) != len(ord_lst) - 1) and \ + float(c_pause) > 0: + time.sleep(float(c_pause)) + + # Keeps logs for every webpage visited + if logs: + it_code = html_page.getcode() + log_file.write("[" + str(it_code) + "] " + str(item) + "\n") + + print(("## Step " + str(x + 1) + " completed with: " + str( + len(ord_lst)) + " result(s)")) + + if logs: + log_file.close() + + return ord_lst diff --git a/modules/extractor.py b/modules/extractor.py index 8c446dc..5df4be1 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -2,91 +2,118 @@ import os import sys -import urllib.request, urllib.parse, urllib.error - - -# Input links from file and extract them into path/files -def cinex(inputFile, outpath): - try: - global f - f = open(inputFile, 'r') - # print f - except IOError: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## Can't open " + inputFile)) - - for line in f: - - # Generate name for every file - try: - pagename = line.rsplit('/', 1) - clpagename = str(pagename[1]) - clpagename = clpagename[:-1] - if len(clpagename) == 0: - outputFile = "index.htm" - else: - outputFile = clpagename - except IndexError as e: - print("Error: %s" % e) - continue - - # Extract page to file - try: - f = open(outpath + "/" + outputFile, 'wb') - f.write(urllib.request.urlopen(line).read()) - f.close() - print(("## File created on 
" + os.getcwd() + "/" + outpath + "/" + outputFile)) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n Can't write on file " + outputFile)) - - -# Input links from file and extract them into terminal -def intermex(inputFile): - try: - f = open(inputFile, 'r') - for line in f: - print((urllib.request.urlopen(line).read())) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## Not valid file")) - - -# Output webpage into a file -def outex(website, outputFile, outpath): - # Extract page to file - try: - outputFile = outpath + "/" + outputFile - f = open(outputFile, 'wb') - f.write(urllib.request.urlopen(website).read()) - f.close() - print(("## File created on " + os.getcwd() + "/" + outputFile)) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n Can't write on file " + outputFile)) - - -# Output to terminal +import urllib.error +import urllib.parse +import urllib.request + + +def cinex(input_file, out_path): + """ Ingests the input links from file and extract them into path/files. + + :param input_file: + :param out_path: + :return: + """ + try: + global f + f = open(input_file, 'r') + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n## Can't open: {input_file}") + + for line in f: + + # Generate the name for every file. + try: + page_name = line.rsplit('/', 1) + cl_page_name = str(page_name[1]) + cl_page_name = cl_page_name[:-1] + if len(cl_page_name) == 0: + output_file = "index.htm" + else: + output_file = cl_page_name + except IndexError as error: + print(f"Error: {error}") + continue + + # Extract page to file + try: + with open(out_path + "/" + output_file, 'wb') as f: + f.write(urllib.request.urlopen(line).read()) + print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}") + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n Can't write on file: {output_file}") + + +def intermex(input_file): + """ Input links from file and extract them into terminal. + + :param input_file: + :return: + """ + try: + with open(input_file, 'r') as f: + for line in f: + print((urllib.request.urlopen(line).read())) + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n## Not valid file") + + +def outex(website, output_file, out_path): + """ Output the contents of the webpage into a file. + + :param website: + :param output_file: + :param out_path: + :return: + """ + # Extract page to file + try: + output_file = out_path + "/" + output_file + with open(output_file, 'wb') as f: + f.write(urllib.request.urlopen(website).read()) + print(f"## File created on: {os.getcwd()}/{output_file}") + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n Can't write on file: {output_file}") + + def termex(website): - try: - print((urllib.request.urlopen(website).read())) - except (urllib.error.HTTPError, urllib.error.URLError) as e: - print(("Error: (%s) %s" % (e, website))) - return None - - -def extractor(website, crawl, outputFile, inputFile, outpath): - # TODO: Return output to torcrawl.py - if len(inputFile) > 0: - if crawl: - cinex(inputFile, outpath) - # TODO: Extract from list into a folder - # elif len(outputFile) > 0: - # inoutex(website, inputFile, outputFile) - else: - intermex(inputFile) - else: - if len(outputFile) > 0: - outex(website, outputFile, outpath) - else: - termex(website) + """ Output findings to the terminal. 
+ + :param website: + :return: + """ + try: + print((urllib.request.urlopen(website).read())) + except (urllib.error.HTTPError, urllib.error.URLError) as err: + print(f"Error: ({err}) {website}") + return None + + +def extractor(website, crawl, output_file, input_ile, out_path): + """ + + :param website: + :param crawl: + :param output_file: + :param input_ile: + :param out_path: + :return: + """ + # TODO: Return output to torcrawl.py + if len(input_ile) > 0: + if crawl: + cinex(input_ile, out_path) + # TODO: Extract from list into a folder + # elif len(output_file) > 0: + # inoutex(website, input_ile, output_file) + else: + intermex(input_ile) + else: + if len(output_file) > 0: + outex(website, output_file, out_path) + else: + termex(website) diff --git a/torcrawl.py b/torcrawl.py index 3fd401c..68764e1 100755 --- a/torcrawl.py +++ b/torcrawl.py @@ -1,22 +1,11 @@ #!/usr/bin/python - -import socket -import socks -import argparse - -# TorCrawl Modules -from modules.crawler import crawler -from modules.extractor import extractor -from modules.checker import * - -help = ''' - -TorCrawl.py is a python script to crawl and extract (regular or onion) -webpages through TOR network. +""" +TorCrawl.py is a python script to crawl and extract (regular or onion) +webpages through TOR network. usage: python torcrawl.py [options] -python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion -python torcrawl.py -v -w -u http://www.github.com -o github.htm +python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion +python torcrawl.py -v -w -u http://www.github.com -o github.htm python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5 python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub @@ -24,7 +13,7 @@ -h, --help : Help -v, --verbose : Show more informations about the progress -u, --url *.onion : URL of Webpage to crawl or extract --w, --without : Without the use of Relay TOR +-w, --without : Without the use of Relay TOR Extract: -e, --extract : Extract page's code to terminal or file. @@ -37,156 +26,178 @@ -d, --cdepth : Set depth of crawl's travel (Default: 1) -z, --exclusions : Paths that you don't want to include (TODO) -s, --simultaneous: How many pages to visit at the same time (TODO) --p, --pause : The length of time the crawler will pause +-p, --pause : The length of time the crawler will pause (Default: 0) --f, --folder : The root directory which will contain the +-f, --folder : The root directory which will contain the generated files -l, --log : Log file with visited URLs and their response code. 
GitHub: github.com/MikeMeliz/TorCrawl.py License: GNU General Public License v3.0 -''' - +""" -# Set socket and connection with TOR network -def connecttor(): - try: - port = 9050 - # Set socks proxy and wrap the urllib module - socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port) - socket.socket = socks.socksocket +import argparse +import os +import socket +import sys - # Perform DNS resolution through the socket - def getaddrinfo(*args): - return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))] +import socks # noqa - pysocks - socket.getaddrinfo = getaddrinfo - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## Can't establish connection with TOR")) +from modules.checker import check_ip +from modules.checker import check_tor +from modules.checker import extract_domain +from modules.checker import folder +from modules.checker import url_canon +# TorCrawl Modules +from modules.crawler import crawler +from modules.extractor import extractor -def main(): - # Initialize necessary variables - inputfile = outputfile = '' - cpause = 0 - cdepth = 1 - - # Get arguments with argparse - parser = argparse.ArgumentParser( - description="TorCrawl.py is a python script to crawl and extract (regular or onion) webpages through TOR network.") - - # General - parser.add_argument( - '-v', - '--verbose', - action='store_true', - help='Show more information about the progress' - ) - parser.add_argument( - '-u', - '--url', - help='URL of webpage to crawl or extract' - ) - parser.add_argument( - '-w', - '--without', - action='store_true', - help='Without the use of Relay TOR' - ) - - # Extract - parser.add_argument( - '-e', - '--extract', - action='store_true', - help='Extract page\'s code to terminal or file.' - ) - parser.add_argument( - '-i', - '--input', - help='Input file with URL(s) (seperated by line)' - ) - parser.add_argument( - '-o', - '--output', - help='Output page(s) to file(s) (for one page)' - ) - - # Crawl - parser.add_argument( - '-c', - '--crawl', - action='store_true', - help='Crawl website (Default output on /links.txt)' - ) - parser.add_argument( - '-d', - '--cdepth', - help='Set depth of crawl\'s travel (Default: 1)' - ) - parser.add_argument( - '-p', - '--cpause', - help='The length of time the crawler will pause' - ) - parser.add_argument( - '-l', - '--log', - action='store_true', - help='A save log will let you see which URLs were visited and their response code' - ) - parser.add_argument( - '-f', - '--folder', - help='The root directory which will contain the generated files' - ) - - args = parser.parse_args() - - # Parse arguments to variables - if args.input: - inputfile = args.input - if args.output: - outputfile = args.output - if args.cdepth: - cdepth = args.cdepth - if args.cpause: - cpause = args.cpause - - # Connect to TOR - if args.without is False: - checktor(args.verbose) - connecttor() - - if args.verbose: - checkip() - print(('## URL: ' + args.url)) - - # Canon/ion of website and create path for output - if len(args.url) > 0: - global website - global outpath - website = urlcanon(args.url, args.verbose) - if args.folder is not None: - outpath = folder(args.folder, args.verbose) - else: - outpath = folder(extract_domain(website), args.verbose) - - if args.crawl: - lst = crawler(website, cdepth, cpause, outpath, args.log, args.verbose) - lstfile = open(outpath + '/links.txt', 'w+') - for item in lst: - lstfile.write("%s\n" % item) - lstfile.close() - print(("## File created on " + os.getcwd() + "/" + outpath + "/links.txt")) - if 
args.extract: - inputfile = outpath + "/links.txt" - extractor(website, args.crawl, outputfile, inputfile, outpath) - else: - extractor(website, args.crawl, outputfile, inputfile, outpath) +# Set socket and connection with TOR network +def connect_tor(): + """ Connect to TOR via DNS resolution through a socket. + :return: None or HTTPError. + """ + try: + port = 9050 + # Set socks proxy and wrap the urllib module + socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port) + socket.socket = socks.socksocket + + # Perform DNS resolution through the socket + def getaddrinfo(*args): # noqa + return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', + (args[0], args[1]))] + + socket.getaddrinfo = getaddrinfo # noqa + except socks.HTTPError as err: + error = sys.exc_info()[0] + print(f"Error: {error} \n## Cannot establish connection with TOR\n" + f"HTTPError: {err}") +def main(): + """ Main method of TorCrawl application. Collects and parses arguments and + instructs the rest of the application on how to run. + + :return: None + """ + + # Get arguments with argparse. + parser = argparse.ArgumentParser( + description="TorCrawl.py is a python script to crawl and extract " + "(regular or onion) webpages through TOR network.") + + # General + parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Show more information about the progress' + ) + parser.add_argument( + '-u', + '--url', + help='URL of webpage to crawl or extract' + ) + parser.add_argument( + '-w', + '--without', + action='store_true', + help='Without the use of Relay TOR' + ) + + # Extract + parser.add_argument( + '-e', + '--extract', + action='store_true', + help='Extract page\'s code to terminal or file.' + ) + parser.add_argument( + '-i', + '--input', + help='Input file with URL(s) (seperated by line)' + ) + parser.add_argument( + '-o', + '--output', + help='Output page(s) to file(s) (for one page)' + ) + + # Crawl + parser.add_argument( + '-c', + '--crawl', + action='store_true', + help='Crawl website (Default output on /links.txt)' + ) + parser.add_argument( + '-d', + '--cdepth', + help='Set depth of crawl\'s travel (Default: 1)' + ) + parser.add_argument( + '-p', + '--cpause', + help='The length of time the crawler will pause' + ) + parser.add_argument( + '-l', + '--log', + action='store_true', + help='A save log will let you see which URLs were visited and their ' + 'response code' + ) + parser.add_argument( + '-f', + '--folder', + help='The root directory which will contain the generated files' + ) + + args = parser.parse_args() + + # Parse arguments to variables else initiate variables. 
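+    # Fallback values are used for any flag that was not supplied on the command line.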
+ input_file = args.input if args.input else '' + output_file = args.output if args.output else '' + c_depth = args.cdepth if args.cdepth else 0 + c_pause = args.cpause if args.cpause else 1 + + # Connect to TOR + if args.without is False: + check_tor(args.verbose) + connect_tor() + + if args.verbose: + check_ip() + print(('## URL: ' + args.url)) + + website = '' + out_path = '' + + # Canon/ion of website and create path for output + if len(args.url) > 0: + website = url_canon(args.url, args.verbose) + if args.folder is not None: + out_path = folder(args.folder, args.verbose) + else: + out_path = folder(extract_domain(website), args.verbose) + + if args.crawl: + lst = crawler(website, c_depth, c_pause, out_path, args.log, + args.verbose) + with open(out_path + '/links.txt', 'w+', encoding='UTF-8') as file: + for item in lst: + file.write(f"{item}\n") + print(f"## File created on {os.getcwd()}/{out_path}/links.txt") + if args.extract: + input_file = out_path + "/links.txt" + extractor(website, args.crawl, output_file, input_file, out_path) + else: + extractor(website, args.crawl, output_file, input_file, out_path) + + +# Stub to call main method. if __name__ == "__main__": - main() + main() From cbbf9dfab70e18b94ee25bcfadd8528f13ce6f98 Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Tue, 15 Mar 2022 22:39:25 +0000 Subject: [PATCH 2/4] Resolve PEP8 Violations - Reverts use of 'with' statement in check_ip function. modules/checker.py. - Refactors modules/crawler.py to implement Crawler function. - Refactors previous modules/crawler.py crawler method into 'crawl' method. - Resolves PEP8 violations in modules/extractor.py - Refactors use of string formating to enforce use of new string format convention. - Ammends Try/Catch statements to handles additional HTTP error cases within extractor methods. - Refactors torcrawl.py to utilise Crawler class and crawl method. resolve-pep8-violations --- modules/checker.py | 4 +- modules/crawler.py | 310 +++++++++++++++++++++---------------------- modules/extractor.py | 97 ++++++++------ torcrawl.py | 7 +- 4 files changed, 214 insertions(+), 204 deletions(-) diff --git a/modules/checker.py b/modules/checker.py index abf5741..127fe68 100644 --- a/modules/checker.py +++ b/modules/checker.py @@ -88,8 +88,8 @@ def check_ip(): """ addr = 'https://api.ipify.org/?format=json' try: - with load(urlopen(addr))['ip'] as my_ip: - print(f'## Your IP: {my_ip}') + my_ip = load(urlopen(addr))['ip'] + print(f'## Your IP: {my_ip}') except HTTPError as err: error = sys.exc_info()[0] print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? " diff --git a/modules/crawler.py b/modules/crawler.py index 9eb0b0f..2f7f591 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -1,5 +1,6 @@ #!/usr/bin/python - +import http.client +import os import re import sys import time @@ -9,159 +10,156 @@ from bs4 import BeautifulSoup -def excludes(link, website, out_path): - """ Excludes links that are not required. 
- - :param link: - :param website: - :param out_path: - :return: - """ - # BUG: For NoneType Exceptions, got to find a solution here - if link is None: - return True - # Links - elif '#' in link: - return True - # External links - elif link.startswith('http') and not link.startswith(website): - with open(out_path + '/extlinks.txt', 'w+') as lst_file: - lst_file.write(str(link) + '\n') - return True - # Telephone Number - elif link.startswith('tel:'): - with open(out_path + '/telephones.txt', 'w+') as lst_file: - lst_file.write(str(link) + '\n') - return True - # Mails - elif link.startswith('mailto:'): - with open(out_path + '/mails.txt', 'w+') as lst_file: - lst_file.write(str(link) + '\n') - return True - # Type of files - elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE): - return True - - -def canonical(link, website): - """ Canonization of the link. - - :param link: - :param website: - :return: - """ - # Already formatted - if link.startswith(website): - return link - # For relative paths with / in front - elif link.startswith('/'): - if website[-1] == '/': - final_link = website[:-1] + link - else: - final_link = website + link - return final_link - # For relative paths without / - elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, - re.IGNORECASE): - # Pass to - if website[-1] == '/': - final_link = website + link - else: - final_link = website + "/" + link - return final_link - - -# Clean links from '?page=' arguments - - -def crawler(website, c_depth, c_pause, out_path, logs, verbose): - """ Core of the crawler. - - :param website: - :param c_depth: - :param c_pause: - :param out_path: - :param logs: - :param verbose: - :return: - """ - lst = set() - ord_lst = [] - ord_lst.insert(0, website) - ord_lstind = 0 - - if logs: - global log_file - log_file = open(out_path + '/log.txt', 'w+') - - print(f"## Crawler started from {website} with {str(c_depth)} depth " - f"crawl, and {str(c_pause)} second(s) delay.") - - # Depth - for x in range(0, int(c_depth)): - - # For every element of list - for item in ord_lst: - - # Check if is the first element - if ord_lstind > 0: - try: - if item is not None: - global html_page - html_page = urllib.request.urlopen(item) - except HTTPError as error: - print(error) +class Crawler: + def __init__(self, website, c_depth, c_pause, out_path, logs, verbose): + self.website = website + self.c_depth = c_depth + self.c_pause = c_pause + self.out_path = out_path + self.logs = logs + self.verbose = verbose + + def excludes(self, link): + """ Excludes links that are not required. 
+ + :param link: + :return: Boolean + """ + # BUG: For NoneType Exceptions, got to find a solution here + if link is None: + return True + # Links + elif '#' in link: + return True + # External links + elif link.startswith('http') and not link.startswith(self.website): + file_path = self.out_path + '/extlinks.txt' + with open(file_path, 'w+', encoding='UTF-8') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Telephone Number + elif link.startswith('tel:'): + file_path = self.out_path + '/telephones.txt' + with open(file_path, 'w+', encoding='UTF-8') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Mails + elif link.startswith('mailto:'): + file_path = self.out_path + '/mails.txt' + with open(file_path, 'w+', encoding='UTF-8') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Type of files + elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link, + re.IGNORECASE): + return True + + def canonical(self, link): + """ Canonization of the link. + + :param link: + :return: + """ + # Already formatted + if link.startswith(self.website): + return link + # For relative paths with / in front + elif link.startswith('/'): + if self.website[-1] == '/': + final_link = self.website[:-1] + link else: - html_page = urllib.request.urlopen(website) - ord_lstind += 1 - - soup = BeautifulSoup(html_page, features="html.parser") - - # For each tag - for link in soup.findAll('a'): - link = link.get('href') - - if excludes(link, website, out_path): - continue - - ver_link = canonical(link, website) - lst.add(ver_link) - - # For each tag - for link in soup.findAll('area'): - link = link.get('href') - - if excludes(link, website, out_path): - continue - - ver_link = canonical(link, website) - lst.add(ver_link) - - # TODO: For images - # TODO: For scripts - - # Pass new on list and re-set it to delete duplicates - ord_lst = ord_lst + list(set(lst)) - ord_lst = list(set(ord_lst)) - - if verbose: - sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r") - sys.stdout.flush() - - # Pause time - if (ord_lst.index(item) != len(ord_lst) - 1) and \ - float(c_pause) > 0: - time.sleep(float(c_pause)) - - # Keeps logs for every webpage visited - if logs: - it_code = html_page.getcode() - log_file.write("[" + str(it_code) + "] " + str(item) + "\n") - - print(("## Step " + str(x + 1) + " completed with: " + str( - len(ord_lst)) + " result(s)")) - - if logs: - log_file.close() - - return ord_lst + final_link = self.website + link + return final_link + # For relative paths without / + elif re.search('^.*\\.(html|htm|aspx|php|doc|css|js|less)$', link, + re.IGNORECASE): + # Pass to + if self.website[-1] == '/': + final_link = self.website + link + else: + final_link = self.website + "/" + link + return final_link + + def crawl(self): + """ Core of the crawler. + :return: List (ord_lst) - List of crawled links. + """ + lst = set() + ord_lst = [] + ord_lst.insert(0, self.website) + ord_lst_ind = 0 + log_path = self.out_path + '/log.txt' + + if self.logs is True and os.access(log_path, os.W_OK) is False: + print(f"## Unable to write to {self.out_path}/log.txt - Exiting") + sys.exit(2) + + print(f"## Crawler started from {self.website} with " + f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} " + f"second(s) delay.") + + # Depth + for index in range(0, int(self.c_depth)): + + # For every element of list. 
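+            # Each item is fetched and parsed; its <a>/<area> href values are canonicalised and the results extend ord_lst for the next depth pass.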
+ for item in ord_lst: + html_page = http.client.HTTPResponse + # Check if is the first element + if ord_lst_ind > 0: + try: + if item is not None: + html_page = urllib.request.urlopen(item) + except HTTPError as error: + print(error) + else: + html_page = urllib.request.urlopen(self.website) + ord_lst_ind += 1 + + soup = BeautifulSoup(html_page, features="html.parser") + + # For each tag. + for link in soup.findAll('a'): + link = link.get('href') + + if self.excludes(link): + continue + + ver_link = self.canonical(link) + lst.add(ver_link) + + # For each tag. + for link in soup.findAll('area'): + link = link.get('href') + + if self.excludes(link): + continue + + ver_link = self.canonical(link) + lst.add(ver_link) + + # TODO: For images + # TODO: For scripts + + # Pass new on list and re-set it to delete duplicates. + ord_lst = ord_lst + list(set(lst)) + ord_lst = list(set(ord_lst)) + + if self.verbose: + sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r") + sys.stdout.flush() + + # Pause time. + if (ord_lst.index(item) != len(ord_lst) - 1) and \ + float(self.c_pause) > 0: + time.sleep(float(self.c_pause)) + + # Keeps logs for every webpage visited. + if self.logs: + it_code = html_page.getcode() + with open(log_path, 'w+', encoding='UTF-8') as log_file: + log_file.write(f"[{str(it_code)}] {str(item)} \n") + + print(f"## Step {str(index + 1)} completed \n\t " + f"with: {str(len(ord_lst))} result(s)") + + return ord_lst diff --git a/modules/extractor.py b/modules/extractor.py index 5df4be1..5c2e422 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -1,27 +1,31 @@ #!/usr/bin/python - +import io import os import sys import urllib.error import urllib.parse import urllib.request +from urllib.error import HTTPError +from urllib.error import URLError def cinex(input_file, out_path): - """ Ingests the input links from file and extract them into path/files. + """ Ingests the crawled links from the input_file, + scrapes the contents of the resulting web pages and writes the contents to + the into out_path/{url_address}. - :param input_file: - :param out_path: - :return: + :param input_file: String: Filename of the crawled Urls. + :param out_path: String: Pathname of results. + :return: None """ + file = io.TextIOWrapper try: - global f - f = open(input_file, 'r') + file = open(input_file, 'r') except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\n## Can't open: {input_file}") + # error = sys.exc_info()[0] + print(f"Error: {err}\n## Can't open: {input_file}") - for line in f: + for line in file: # Generate the name for every file. try: @@ -38,80 +42,87 @@ def cinex(input_file, out_path): # Extract page to file try: - with open(out_path + "/" + output_file, 'wb') as f: - f.write(urllib.request.urlopen(line).read()) + with open(out_path + "/" + output_file, 'wb') as results: + results.write(urllib.request.urlopen(line).read()) print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}") except IOError as err: error = sys.exc_info()[0] - print(f"Error: {error}\n Can't write on file: {output_file}") + print(f"Error: {error}\nCan't write on file: {output_file}") + file.close() def intermex(input_file): """ Input links from file and extract them into terminal. - :param input_file: - :return: + :param input_file: String: File name of links file. 
+ :return: None """ try: - with open(input_file, 'r') as f: - for line in f: + with open(input_file, 'r') as file: + for line in file: print((urllib.request.urlopen(line).read())) + except (HTTPError, URLError) as err: + print(f"HTTPError: {err}") except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\n## Not valid file") + # error = sys.exc_info()[0] + print(f"Error: {err}\n## Not valid file") def outex(website, output_file, out_path): - """ Output the contents of the webpage into a file. + """ Scrapes the contents of the provided web address and outputs the + contents to file. - :param website: - :param output_file: - :param out_path: - :return: + :param website: String: Url of web address to scrape. + :param output_file: String: Filename of the results. + :param out_path: String: Folder name of the output findings. + :return: None """ # Extract page to file try: output_file = out_path + "/" + output_file - with open(output_file, 'wb') as f: - f.write(urllib.request.urlopen(website).read()) + with open(output_file, 'wb') as file: + file.write(urllib.request.urlopen(website).read()) print(f"## File created on: {os.getcwd()}/{output_file}") + except (HTTPError, URLError) as err: + print(f"HTTPError: {err}") except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\n Can't write on file: {output_file}") + # error = sys.exc_info()[0] + print(f"Error: {err}\n Can't write on file: {output_file}") def termex(website): - """ Output findings to the terminal. + """ Scrapes provided web address and prints the results to the terminal. - :param website: - :return: + :param website: String: URL of website to scrape. + :return: None """ try: print((urllib.request.urlopen(website).read())) except (urllib.error.HTTPError, urllib.error.URLError) as err: print(f"Error: ({err}) {website}") - return None + return -def extractor(website, crawl, output_file, input_ile, out_path): - """ +def extractor(website, crawl, output_file, input_file, out_path): + """ Extractor - scrapes the resulting website or discovered links. - :param website: - :param crawl: - :param output_file: - :param input_ile: - :param out_path: - :return: + :param website: String: URL of website to scrape. + :param crawl: Boolean: Cinex trigger. + If used iteratively scrape the urls from input_file. + :param output_file: String: Filename of resulting output from scrape. + :param input_file: String: Filename of crawled/discovered URLs + :param out_path: String: Dir path for output files. 
+ :return: None """ # TODO: Return output to torcrawl.py - if len(input_ile) > 0: + if len(input_file) > 0: if crawl: - cinex(input_ile, out_path) + cinex(input_file, out_path) # TODO: Extract from list into a folder # elif len(output_file) > 0: # inoutex(website, input_ile, output_file) else: - intermex(input_ile) + intermex(input_file) else: if len(output_file) > 0: outex(website, output_file, out_path) diff --git a/torcrawl.py b/torcrawl.py index 68764e1..a7a0157 100755 --- a/torcrawl.py +++ b/torcrawl.py @@ -50,7 +50,7 @@ from modules.checker import folder from modules.checker import url_canon # TorCrawl Modules -from modules.crawler import crawler +from modules.crawler import Crawler from modules.extractor import extractor @@ -185,8 +185,9 @@ def main(): out_path = folder(extract_domain(website), args.verbose) if args.crawl: - lst = crawler(website, c_depth, c_pause, out_path, args.log, - args.verbose) + crawler = Crawler(website, c_depth, c_pause, out_path, args.log, + args.verbose) + lst = crawler.crawl() with open(out_path + '/links.txt', 'w+', encoding='UTF-8') as file: for item in lst: file.write(f"{item}\n") From 52bcf14988ea173fe29076c8355166da7f4ed7a2 Mon Sep 17 00:00:00 2001 From: Aaron Bishop <93538312+the-siegfried@users.noreply.github.com> Date: Tue, 15 Mar 2022 22:46:10 +0000 Subject: [PATCH 3/4] Delete .gitignore file --- .gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 723ef36..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea \ No newline at end of file From 83763130662be6e5b6edd23ce65cfff84a1bcfcf Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Wed, 16 Mar 2022 21:13:27 +0000 Subject: [PATCH 4/4] Resolve PEP8 violations - Implements Error handling for uncaught http exceptions. - Implements TypeError handling for uncaught exceptions from BeautifulSoup. resolve-pep8-violations --- modules/crawler.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/modules/crawler.py b/modules/crawler.py index 2f7f591..6594cf8 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -111,11 +111,22 @@ def crawl(self): html_page = urllib.request.urlopen(item) except HTTPError as error: print(error) + continue else: - html_page = urllib.request.urlopen(self.website) - ord_lst_ind += 1 + try: + html_page = urllib.request.urlopen(self.website) + ord_lst_ind += 1 + except HTTPError as error: + print(error) + ord_lst_ind += 1 + continue - soup = BeautifulSoup(html_page, features="html.parser") + try: + soup = BeautifulSoup(html_page, features="html.parser") + except TypeError as err: + print(f"## Soup Error Encountered:: could to parse " + f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}") + continue # For each tag. for link in soup.findAll('a'):