From 0de72834b86dc06f8a4f859b0150434d03473afd Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Mon, 14 Mar 2022 23:09:21 +0000 Subject: [PATCH 1/4] Resolve pep 8 violations. WIP - Resolves PEP8 violations in modules/checker.py - Resolves PEP8 violations in torcrawler.py. --- .gitignore | 1 + modules/checker.py | 133 +++++++++++------- modules/crawler.py | 296 +++++++++++++++++++++------------------ modules/extractor.py | 201 +++++++++++++++------------ torcrawl.py | 321 ++++++++++++++++++++++--------------------- 5 files changed, 521 insertions(+), 431 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..723ef36 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.idea \ No newline at end of file diff --git a/modules/checker.py b/modules/checker.py index c4a2152..abf5741 100644 --- a/modules/checker.py +++ b/modules/checker.py @@ -1,67 +1,96 @@ #!/usr/bin/python -import sys +import os import re import subprocess -import os -from urllib.request import urlopen +import sys from json import load +from urllib.error import HTTPError from urllib.parse import urlparse +from urllib.request import urlopen -def urlcanon(website, verbose): - if not website.startswith("http"): - if not website.startswith("www."): - website = "www." + website - if verbose: - print(("## URL fixed: " + website)) - website = "http://" + website - if verbose: - print(("## URL fixed: " + website)) - return website +def url_canon(website, verbose): + """ + + :param website: String - + :param verbose: Boolean - + :return: String 'website' - + """ + if not website.startswith("http"): + if not website.startswith("www."): + website = "www." + website + if verbose: + print(("## URL fixed: " + website)) + website = "http://" + website + if verbose: + print(("## URL fixed: " + website)) + return website def extract_domain(url, remove_http=True): - uri = urlparse(url) - if remove_http: - domain_name = f"{uri.netloc}" - else: - domain_name = f"{uri.netloc}://{uri.netloc}" - return domain_name + """ + + :param url: String - + :param remove_http: Boolean - + :return: String 'domain_name' - + """ + uri = urlparse(url) + if remove_http: + domain_name = f"{uri.netloc}" + else: + domain_name = f"{uri.netloc}://{uri.netloc}" + return domain_name # Create output path def folder(website, verbose): - outpath = website - if not os.path.exists(outpath): - os.makedirs(outpath) - if verbose: - print(("## Folder created: " + outpath)) - return outpath - - -# Check if TOR service is running -def checktor(verbose): - checkfortor = subprocess.check_output(['ps', '-e']) - - def findwholeword(w): - return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search - - if findwholeword('tor')(str(checkfortor)): - if verbose: - print("## TOR is ready!") - else: - print("## TOR is NOT running!") - print('## Enable tor with \'service tor start\' or add -w argument') - sys.exit(2) - - -# Check your IP from external website -def checkip(): - try: - webipcheck = 'https://api.ipify.org/?format=json' - my_ip = load(urlopen(webipcheck))['ip'] - print(('## Your IP: ' + my_ip)) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## IP can't obtain \n## Is " + webipcheck + "up?")) + """ Creates an output path for the findings. + + :param website: String - URL of website to crawl. + :param verbose: Boolean - Logging level. + :return: String 'out_path' - Path of the output folder. 
+ """ + out_path = website + if not os.path.exists(out_path): + os.makedirs(out_path) + if verbose: + print(f"## Folder created: {out_path}") + return out_path + + +def check_tor(verbose): + """Checks to see if TOR service is running on device. + Will exit if (-w) with argument is provided on application startup and TOR + service is not found to be running on the device. + + :param verbose: Boolean -'verbose' logging argument. + :return: None + """ + check_for_tor = subprocess.check_output(['ps', '-e']) + + def find_whole_word(word): + return re.compile(r'\b({0})\b'.format(word), + flags=re.IGNORECASE).search + + if find_whole_word('tor')(str(check_for_tor)): + if verbose: + print("## TOR is ready!") + else: + print("## TOR is NOT running!") + print('## Enable tor with \'service tor start\' or add -w argument') + sys.exit(2) + + +def check_ip(): + """ Checks users IP from external resource. + :return: None or HTTPError + """ + addr = 'https://api.ipify.org/?format=json' + try: + with load(urlopen(addr))['ip'] as my_ip: + print(f'## Your IP: {my_ip}') + except HTTPError as err: + error = sys.exc_info()[0] + print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? " + f"\n## HTTPError: {err}") diff --git a/modules/crawler.py b/modules/crawler.py index aaf8d38..9eb0b0f 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -1,145 +1,167 @@ #!/usr/bin/python -import sys import re -import urllib.request +import sys import time +import urllib.request +from urllib.error import HTTPError + from bs4 import BeautifulSoup -# Exclude links that we dont need -def excludes(link, website, outpath): - # BUG: For NoneType Exceptions, got to find a solution here - if link is None: - return True - # Links - elif '#' in link: - return True - # External links - elif link.startswith('http') and not link.startswith(website): - lstfile = open(outpath + '/extlinks.txt', 'w+') - lstfile.write(str(link) + '\n') - lstfile.close() - return True - # Telephone Number - elif link.startswith('tel:'): - lstfile = open(outpath + '/telephones.txt', 'w+') - lstfile.write(str(link) + '\n') - lstfile.close() - return True - # Mails - elif link.startswith('mailto:'): - lstfile = open(outpath + '/mails.txt', 'w+') - lstfile.write(str(link) + '\n') - lstfile.close() - return True - # Type of files - elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE): - return True - - -# Canonization of the link +def excludes(link, website, out_path): + """ Excludes links that are not required. 
+ + :param link: + :param website: + :param out_path: + :return: + """ + # BUG: For NoneType Exceptions, got to find a solution here + if link is None: + return True + # Links + elif '#' in link: + return True + # External links + elif link.startswith('http') and not link.startswith(website): + with open(out_path + '/extlinks.txt', 'w+') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Telephone Number + elif link.startswith('tel:'): + with open(out_path + '/telephones.txt', 'w+') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Mails + elif link.startswith('mailto:'): + with open(out_path + '/mails.txt', 'w+') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Type of files + elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE): + return True + + def canonical(link, website): - # Already formatted - if link.startswith(website): - return link - # For relative paths with / in front - elif link.startswith('/'): - if website[-1] == '/': - finalLink = website[:-1] + link - else: - finalLink = website + link - return finalLink - # For relative paths without / - elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, re.IGNORECASE): - # Pass to - if website[-1] == '/': - finalLink = website + link - else: - finalLink = website + "/" + link - return finalLink - # Clean links from '?page=' arguments - - -# Core of crawler -def crawler(website, cdepth, cpause, outpath, logs, verbose): - lst = set() - ordlst = [] - ordlst.insert(0, website) - ordlstind = 0 - - if logs: - global logfile - logfile = open(outpath + '/log.txt', 'w+') - - print(( - "## Crawler started from " + website + - " with " + str(cdepth) + " depth crawl and " + str(cpause) + " second(s) delay:" - )) - - # Depth - for x in range(0, int(cdepth)): - - # For every element of list - for item in ordlst: - - # Check if is the first element - if ordlstind > 0: - try: - if item is not None: - global html_page - html_page = urllib.request.urlopen(item) - except urllib.error.HTTPError as e: - print(e) - else: - html_page = urllib.request.urlopen(website) - ordlstind += 1 - - soup = BeautifulSoup(html_page, features="html.parser") - - # For each tag - for link in soup.findAll('a'): - link = link.get('href') - - if excludes(link, website, outpath): - continue - - verlink = canonical(link, website) - lst.add(verlink) - - # For each tag - for link in soup.findAll('area'): - link = link.get('href') - - if excludes(link, website, outpath): - continue - - verlink = canonical(link, website) - lst.add(verlink) - - # TODO: For images - # TODO: For scripts - - # Pass new on list and re-set it to delete duplicates - ordlst = ordlst + list(set(lst)) - ordlst = list(set(ordlst)) - - if verbose: - sys.stdout.write("-- Results: " + str(len(ordlst)) + "\r") - sys.stdout.flush() - - # Pause time - if (ordlst.index(item) != len(ordlst) - 1) and float(cpause) > 0: - time.sleep(float(cpause)) - - # Keeps logs for every webpage visited - if logs: - itcode = html_page.getcode() - logfile.write("[" + str(itcode) + "] " + str(item) + "\n") - - print(("## Step " + str(x + 1) + " completed with: " + str(len(ordlst)) + " result(s)")) - - if logs: - logfile.close() - - return ordlst + """ Canonization of the link. 
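+    Returns 'link' as-is when it already starts with 'website', otherwise joins relative paths onto 'website'.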
+ + :param link: + :param website: + :return: + """ + # Already formatted + if link.startswith(website): + return link + # For relative paths with / in front + elif link.startswith('/'): + if website[-1] == '/': + final_link = website[:-1] + link + else: + final_link = website + link + return final_link + # For relative paths without / + elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, + re.IGNORECASE): + # Pass to + if website[-1] == '/': + final_link = website + link + else: + final_link = website + "/" + link + return final_link + + +# Clean links from '?page=' arguments + + +def crawler(website, c_depth, c_pause, out_path, logs, verbose): + """ Core of the crawler. + + :param website: + :param c_depth: + :param c_pause: + :param out_path: + :param logs: + :param verbose: + :return: + """ + lst = set() + ord_lst = [] + ord_lst.insert(0, website) + ord_lstind = 0 + + if logs: + global log_file + log_file = open(out_path + '/log.txt', 'w+') + + print(f"## Crawler started from {website} with {str(c_depth)} depth " + f"crawl, and {str(c_pause)} second(s) delay.") + + # Depth + for x in range(0, int(c_depth)): + + # For every element of list + for item in ord_lst: + + # Check if is the first element + if ord_lstind > 0: + try: + if item is not None: + global html_page + html_page = urllib.request.urlopen(item) + except HTTPError as error: + print(error) + else: + html_page = urllib.request.urlopen(website) + ord_lstind += 1 + + soup = BeautifulSoup(html_page, features="html.parser") + + # For each tag + for link in soup.findAll('a'): + link = link.get('href') + + if excludes(link, website, out_path): + continue + + ver_link = canonical(link, website) + lst.add(ver_link) + + # For each tag + for link in soup.findAll('area'): + link = link.get('href') + + if excludes(link, website, out_path): + continue + + ver_link = canonical(link, website) + lst.add(ver_link) + + # TODO: For images + # TODO: For scripts + + # Pass new on list and re-set it to delete duplicates + ord_lst = ord_lst + list(set(lst)) + ord_lst = list(set(ord_lst)) + + if verbose: + sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r") + sys.stdout.flush() + + # Pause time + if (ord_lst.index(item) != len(ord_lst) - 1) and \ + float(c_pause) > 0: + time.sleep(float(c_pause)) + + # Keeps logs for every webpage visited + if logs: + it_code = html_page.getcode() + log_file.write("[" + str(it_code) + "] " + str(item) + "\n") + + print(("## Step " + str(x + 1) + " completed with: " + str( + len(ord_lst)) + " result(s)")) + + if logs: + log_file.close() + + return ord_lst diff --git a/modules/extractor.py b/modules/extractor.py index 8c446dc..5df4be1 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -2,91 +2,118 @@ import os import sys -import urllib.request, urllib.parse, urllib.error - - -# Input links from file and extract them into path/files -def cinex(inputFile, outpath): - try: - global f - f = open(inputFile, 'r') - # print f - except IOError: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## Can't open " + inputFile)) - - for line in f: - - # Generate name for every file - try: - pagename = line.rsplit('/', 1) - clpagename = str(pagename[1]) - clpagename = clpagename[:-1] - if len(clpagename) == 0: - outputFile = "index.htm" - else: - outputFile = clpagename - except IndexError as e: - print("Error: %s" % e) - continue - - # Extract page to file - try: - f = open(outpath + "/" + outputFile, 'wb') - f.write(urllib.request.urlopen(line).read()) - f.close() - print(("## File created on 
" + os.getcwd() + "/" + outpath + "/" + outputFile)) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n Can't write on file " + outputFile)) - - -# Input links from file and extract them into terminal -def intermex(inputFile): - try: - f = open(inputFile, 'r') - for line in f: - print((urllib.request.urlopen(line).read())) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## Not valid file")) - - -# Output webpage into a file -def outex(website, outputFile, outpath): - # Extract page to file - try: - outputFile = outpath + "/" + outputFile - f = open(outputFile, 'wb') - f.write(urllib.request.urlopen(website).read()) - f.close() - print(("## File created on " + os.getcwd() + "/" + outputFile)) - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n Can't write on file " + outputFile)) - - -# Output to terminal +import urllib.error +import urllib.parse +import urllib.request + + +def cinex(input_file, out_path): + """ Ingests the input links from file and extract them into path/files. + + :param input_file: + :param out_path: + :return: + """ + try: + global f + f = open(input_file, 'r') + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n## Can't open: {input_file}") + + for line in f: + + # Generate the name for every file. + try: + page_name = line.rsplit('/', 1) + cl_page_name = str(page_name[1]) + cl_page_name = cl_page_name[:-1] + if len(cl_page_name) == 0: + output_file = "index.htm" + else: + output_file = cl_page_name + except IndexError as error: + print(f"Error: {error}") + continue + + # Extract page to file + try: + with open(out_path + "/" + output_file, 'wb') as f: + f.write(urllib.request.urlopen(line).read()) + print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}") + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n Can't write on file: {output_file}") + + +def intermex(input_file): + """ Input links from file and extract them into terminal. + + :param input_file: + :return: + """ + try: + with open(input_file, 'r') as f: + for line in f: + print((urllib.request.urlopen(line).read())) + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n## Not valid file") + + +def outex(website, output_file, out_path): + """ Output the contents of the webpage into a file. + + :param website: + :param output_file: + :param out_path: + :return: + """ + # Extract page to file + try: + output_file = out_path + "/" + output_file + with open(output_file, 'wb') as f: + f.write(urllib.request.urlopen(website).read()) + print(f"## File created on: {os.getcwd()}/{output_file}") + except IOError as err: + error = sys.exc_info()[0] + print(f"Error: {error}\n Can't write on file: {output_file}") + + def termex(website): - try: - print((urllib.request.urlopen(website).read())) - except (urllib.error.HTTPError, urllib.error.URLError) as e: - print(("Error: (%s) %s" % (e, website))) - return None - - -def extractor(website, crawl, outputFile, inputFile, outpath): - # TODO: Return output to torcrawl.py - if len(inputFile) > 0: - if crawl: - cinex(inputFile, outpath) - # TODO: Extract from list into a folder - # elif len(outputFile) > 0: - # inoutex(website, inputFile, outputFile) - else: - intermex(inputFile) - else: - if len(outputFile) > 0: - outex(website, outputFile, outpath) - else: - termex(website) + """ Output findings to the terminal. 
+ + :param website: + :return: + """ + try: + print((urllib.request.urlopen(website).read())) + except (urllib.error.HTTPError, urllib.error.URLError) as err: + print(f"Error: ({err}) {website}") + return None + + +def extractor(website, crawl, output_file, input_ile, out_path): + """ + + :param website: + :param crawl: + :param output_file: + :param input_ile: + :param out_path: + :return: + """ + # TODO: Return output to torcrawl.py + if len(input_ile) > 0: + if crawl: + cinex(input_ile, out_path) + # TODO: Extract from list into a folder + # elif len(output_file) > 0: + # inoutex(website, input_ile, output_file) + else: + intermex(input_ile) + else: + if len(output_file) > 0: + outex(website, output_file, out_path) + else: + termex(website) diff --git a/torcrawl.py b/torcrawl.py index 3fd401c..68764e1 100755 --- a/torcrawl.py +++ b/torcrawl.py @@ -1,22 +1,11 @@ #!/usr/bin/python - -import socket -import socks -import argparse - -# TorCrawl Modules -from modules.crawler import crawler -from modules.extractor import extractor -from modules.checker import * - -help = ''' - -TorCrawl.py is a python script to crawl and extract (regular or onion) -webpages through TOR network. +""" +TorCrawl.py is a python script to crawl and extract (regular or onion) +webpages through TOR network. usage: python torcrawl.py [options] -python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion -python torcrawl.py -v -w -u http://www.github.com -o github.htm +python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion +python torcrawl.py -v -w -u http://www.github.com -o github.htm python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5 python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub @@ -24,7 +13,7 @@ -h, --help : Help -v, --verbose : Show more informations about the progress -u, --url *.onion : URL of Webpage to crawl or extract --w, --without : Without the use of Relay TOR +-w, --without : Without the use of Relay TOR Extract: -e, --extract : Extract page's code to terminal or file. @@ -37,156 +26,178 @@ -d, --cdepth : Set depth of crawl's travel (Default: 1) -z, --exclusions : Paths that you don't want to include (TODO) -s, --simultaneous: How many pages to visit at the same time (TODO) --p, --pause : The length of time the crawler will pause +-p, --pause : The length of time the crawler will pause (Default: 0) --f, --folder : The root directory which will contain the +-f, --folder : The root directory which will contain the generated files -l, --log : Log file with visited URLs and their response code. 
GitHub: github.com/MikeMeliz/TorCrawl.py License: GNU General Public License v3.0 -''' - +""" -# Set socket and connection with TOR network -def connecttor(): - try: - port = 9050 - # Set socks proxy and wrap the urllib module - socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port) - socket.socket = socks.socksocket +import argparse +import os +import socket +import sys - # Perform DNS resolution through the socket - def getaddrinfo(*args): - return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))] +import socks # noqa - pysocks - socket.getaddrinfo = getaddrinfo - except: - e = sys.exc_info()[0] - print(("Error: %s" % e + "\n## Can't establish connection with TOR")) +from modules.checker import check_ip +from modules.checker import check_tor +from modules.checker import extract_domain +from modules.checker import folder +from modules.checker import url_canon +# TorCrawl Modules +from modules.crawler import crawler +from modules.extractor import extractor -def main(): - # Initialize necessary variables - inputfile = outputfile = '' - cpause = 0 - cdepth = 1 - - # Get arguments with argparse - parser = argparse.ArgumentParser( - description="TorCrawl.py is a python script to crawl and extract (regular or onion) webpages through TOR network.") - - # General - parser.add_argument( - '-v', - '--verbose', - action='store_true', - help='Show more information about the progress' - ) - parser.add_argument( - '-u', - '--url', - help='URL of webpage to crawl or extract' - ) - parser.add_argument( - '-w', - '--without', - action='store_true', - help='Without the use of Relay TOR' - ) - - # Extract - parser.add_argument( - '-e', - '--extract', - action='store_true', - help='Extract page\'s code to terminal or file.' - ) - parser.add_argument( - '-i', - '--input', - help='Input file with URL(s) (seperated by line)' - ) - parser.add_argument( - '-o', - '--output', - help='Output page(s) to file(s) (for one page)' - ) - - # Crawl - parser.add_argument( - '-c', - '--crawl', - action='store_true', - help='Crawl website (Default output on /links.txt)' - ) - parser.add_argument( - '-d', - '--cdepth', - help='Set depth of crawl\'s travel (Default: 1)' - ) - parser.add_argument( - '-p', - '--cpause', - help='The length of time the crawler will pause' - ) - parser.add_argument( - '-l', - '--log', - action='store_true', - help='A save log will let you see which URLs were visited and their response code' - ) - parser.add_argument( - '-f', - '--folder', - help='The root directory which will contain the generated files' - ) - - args = parser.parse_args() - - # Parse arguments to variables - if args.input: - inputfile = args.input - if args.output: - outputfile = args.output - if args.cdepth: - cdepth = args.cdepth - if args.cpause: - cpause = args.cpause - - # Connect to TOR - if args.without is False: - checktor(args.verbose) - connecttor() - - if args.verbose: - checkip() - print(('## URL: ' + args.url)) - - # Canon/ion of website and create path for output - if len(args.url) > 0: - global website - global outpath - website = urlcanon(args.url, args.verbose) - if args.folder is not None: - outpath = folder(args.folder, args.verbose) - else: - outpath = folder(extract_domain(website), args.verbose) - - if args.crawl: - lst = crawler(website, cdepth, cpause, outpath, args.log, args.verbose) - lstfile = open(outpath + '/links.txt', 'w+') - for item in lst: - lstfile.write("%s\n" % item) - lstfile.close() - print(("## File created on " + os.getcwd() + "/" + outpath + "/links.txt")) - if 
args.extract: - inputfile = outpath + "/links.txt" - extractor(website, args.crawl, outputfile, inputfile, outpath) - else: - extractor(website, args.crawl, outputfile, inputfile, outpath) +# Set socket and connection with TOR network +def connect_tor(): + """ Connect to TOR via DNS resolution through a socket. + :return: None or HTTPError. + """ + try: + port = 9050 + # Set socks proxy and wrap the urllib module + socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port) + socket.socket = socks.socksocket + + # Perform DNS resolution through the socket + def getaddrinfo(*args): # noqa + return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', + (args[0], args[1]))] + + socket.getaddrinfo = getaddrinfo # noqa + except socks.HTTPError as err: + error = sys.exc_info()[0] + print(f"Error: {error} \n## Cannot establish connection with TOR\n" + f"HTTPError: {err}") +def main(): + """ Main method of TorCrawl application. Collects and parses arguments and + instructs the rest of the application on how to run. + + :return: None + """ + + # Get arguments with argparse. + parser = argparse.ArgumentParser( + description="TorCrawl.py is a python script to crawl and extract " + "(regular or onion) webpages through TOR network.") + + # General + parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Show more information about the progress' + ) + parser.add_argument( + '-u', + '--url', + help='URL of webpage to crawl or extract' + ) + parser.add_argument( + '-w', + '--without', + action='store_true', + help='Without the use of Relay TOR' + ) + + # Extract + parser.add_argument( + '-e', + '--extract', + action='store_true', + help='Extract page\'s code to terminal or file.' + ) + parser.add_argument( + '-i', + '--input', + help='Input file with URL(s) (seperated by line)' + ) + parser.add_argument( + '-o', + '--output', + help='Output page(s) to file(s) (for one page)' + ) + + # Crawl + parser.add_argument( + '-c', + '--crawl', + action='store_true', + help='Crawl website (Default output on /links.txt)' + ) + parser.add_argument( + '-d', + '--cdepth', + help='Set depth of crawl\'s travel (Default: 1)' + ) + parser.add_argument( + '-p', + '--cpause', + help='The length of time the crawler will pause' + ) + parser.add_argument( + '-l', + '--log', + action='store_true', + help='A save log will let you see which URLs were visited and their ' + 'response code' + ) + parser.add_argument( + '-f', + '--folder', + help='The root directory which will contain the generated files' + ) + + args = parser.parse_args() + + # Parse arguments to variables else initiate variables. 
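+    # Fallback values are used for any flag that was not supplied on the command line.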
+ input_file = args.input if args.input else '' + output_file = args.output if args.output else '' + c_depth = args.cdepth if args.cdepth else 0 + c_pause = args.cpause if args.cpause else 1 + + # Connect to TOR + if args.without is False: + check_tor(args.verbose) + connect_tor() + + if args.verbose: + check_ip() + print(('## URL: ' + args.url)) + + website = '' + out_path = '' + + # Canon/ion of website and create path for output + if len(args.url) > 0: + website = url_canon(args.url, args.verbose) + if args.folder is not None: + out_path = folder(args.folder, args.verbose) + else: + out_path = folder(extract_domain(website), args.verbose) + + if args.crawl: + lst = crawler(website, c_depth, c_pause, out_path, args.log, + args.verbose) + with open(out_path + '/links.txt', 'w+', encoding='UTF-8') as file: + for item in lst: + file.write(f"{item}\n") + print(f"## File created on {os.getcwd()}/{out_path}/links.txt") + if args.extract: + input_file = out_path + "/links.txt" + extractor(website, args.crawl, output_file, input_file, out_path) + else: + extractor(website, args.crawl, output_file, input_file, out_path) + + +# Stub to call main method. if __name__ == "__main__": - main() + main() From cbbf9dfab70e18b94ee25bcfadd8528f13ce6f98 Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Tue, 15 Mar 2022 22:39:25 +0000 Subject: [PATCH 2/4] Resolve PEP8 Violations - Reverts use of 'with' statement in check_ip function. modules/checker.py. - Refactors modules/crawler.py to implement Crawler function. - Refactors previous modules/crawler.py crawler method into 'crawl' method. - Resolves PEP8 violations in modules/extractor.py - Refactors use of string formating to enforce use of new string format convention. - Ammends Try/Catch statements to handles additional HTTP error cases within extractor methods. - Refactors torcrawl.py to utilise Crawler class and crawl method. resolve-pep8-violations --- modules/checker.py | 4 +- modules/crawler.py | 310 +++++++++++++++++++++---------------------- modules/extractor.py | 97 ++++++++------ torcrawl.py | 7 +- 4 files changed, 214 insertions(+), 204 deletions(-) diff --git a/modules/checker.py b/modules/checker.py index abf5741..127fe68 100644 --- a/modules/checker.py +++ b/modules/checker.py @@ -88,8 +88,8 @@ def check_ip(): """ addr = 'https://api.ipify.org/?format=json' try: - with load(urlopen(addr))['ip'] as my_ip: - print(f'## Your IP: {my_ip}') + my_ip = load(urlopen(addr))['ip'] + print(f'## Your IP: {my_ip}') except HTTPError as err: error = sys.exc_info()[0] print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? " diff --git a/modules/crawler.py b/modules/crawler.py index 9eb0b0f..2f7f591 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -1,5 +1,6 @@ #!/usr/bin/python - +import http.client +import os import re import sys import time @@ -9,159 +10,156 @@ from bs4 import BeautifulSoup -def excludes(link, website, out_path): - """ Excludes links that are not required. 
- - :param link: - :param website: - :param out_path: - :return: - """ - # BUG: For NoneType Exceptions, got to find a solution here - if link is None: - return True - # Links - elif '#' in link: - return True - # External links - elif link.startswith('http') and not link.startswith(website): - with open(out_path + '/extlinks.txt', 'w+') as lst_file: - lst_file.write(str(link) + '\n') - return True - # Telephone Number - elif link.startswith('tel:'): - with open(out_path + '/telephones.txt', 'w+') as lst_file: - lst_file.write(str(link) + '\n') - return True - # Mails - elif link.startswith('mailto:'): - with open(out_path + '/mails.txt', 'w+') as lst_file: - lst_file.write(str(link) + '\n') - return True - # Type of files - elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE): - return True - - -def canonical(link, website): - """ Canonization of the link. - - :param link: - :param website: - :return: - """ - # Already formatted - if link.startswith(website): - return link - # For relative paths with / in front - elif link.startswith('/'): - if website[-1] == '/': - final_link = website[:-1] + link - else: - final_link = website + link - return final_link - # For relative paths without / - elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, - re.IGNORECASE): - # Pass to - if website[-1] == '/': - final_link = website + link - else: - final_link = website + "/" + link - return final_link - - -# Clean links from '?page=' arguments - - -def crawler(website, c_depth, c_pause, out_path, logs, verbose): - """ Core of the crawler. - - :param website: - :param c_depth: - :param c_pause: - :param out_path: - :param logs: - :param verbose: - :return: - """ - lst = set() - ord_lst = [] - ord_lst.insert(0, website) - ord_lstind = 0 - - if logs: - global log_file - log_file = open(out_path + '/log.txt', 'w+') - - print(f"## Crawler started from {website} with {str(c_depth)} depth " - f"crawl, and {str(c_pause)} second(s) delay.") - - # Depth - for x in range(0, int(c_depth)): - - # For every element of list - for item in ord_lst: - - # Check if is the first element - if ord_lstind > 0: - try: - if item is not None: - global html_page - html_page = urllib.request.urlopen(item) - except HTTPError as error: - print(error) +class Crawler: + def __init__(self, website, c_depth, c_pause, out_path, logs, verbose): + self.website = website + self.c_depth = c_depth + self.c_pause = c_pause + self.out_path = out_path + self.logs = logs + self.verbose = verbose + + def excludes(self, link): + """ Excludes links that are not required. 
+ + :param link: + :return: Boolean + """ + # BUG: For NoneType Exceptions, got to find a solution here + if link is None: + return True + # Links + elif '#' in link: + return True + # External links + elif link.startswith('http') and not link.startswith(self.website): + file_path = self.out_path + '/extlinks.txt' + with open(file_path, 'w+', encoding='UTF-8') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Telephone Number + elif link.startswith('tel:'): + file_path = self.out_path + '/telephones.txt' + with open(file_path, 'w+', encoding='UTF-8') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Mails + elif link.startswith('mailto:'): + file_path = self.out_path + '/mails.txt' + with open(file_path, 'w+', encoding='UTF-8') as lst_file: + lst_file.write(str(link) + '\n') + return True + # Type of files + elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link, + re.IGNORECASE): + return True + + def canonical(self, link): + """ Canonization of the link. + + :param link: + :return: + """ + # Already formatted + if link.startswith(self.website): + return link + # For relative paths with / in front + elif link.startswith('/'): + if self.website[-1] == '/': + final_link = self.website[:-1] + link else: - html_page = urllib.request.urlopen(website) - ord_lstind += 1 - - soup = BeautifulSoup(html_page, features="html.parser") - - # For each tag - for link in soup.findAll('a'): - link = link.get('href') - - if excludes(link, website, out_path): - continue - - ver_link = canonical(link, website) - lst.add(ver_link) - - # For each tag - for link in soup.findAll('area'): - link = link.get('href') - - if excludes(link, website, out_path): - continue - - ver_link = canonical(link, website) - lst.add(ver_link) - - # TODO: For images - # TODO: For scripts - - # Pass new on list and re-set it to delete duplicates - ord_lst = ord_lst + list(set(lst)) - ord_lst = list(set(ord_lst)) - - if verbose: - sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r") - sys.stdout.flush() - - # Pause time - if (ord_lst.index(item) != len(ord_lst) - 1) and \ - float(c_pause) > 0: - time.sleep(float(c_pause)) - - # Keeps logs for every webpage visited - if logs: - it_code = html_page.getcode() - log_file.write("[" + str(it_code) + "] " + str(item) + "\n") - - print(("## Step " + str(x + 1) + " completed with: " + str( - len(ord_lst)) + " result(s)")) - - if logs: - log_file.close() - - return ord_lst + final_link = self.website + link + return final_link + # For relative paths without / + elif re.search('^.*\\.(html|htm|aspx|php|doc|css|js|less)$', link, + re.IGNORECASE): + # Pass to + if self.website[-1] == '/': + final_link = self.website + link + else: + final_link = self.website + "/" + link + return final_link + + def crawl(self): + """ Core of the crawler. + :return: List (ord_lst) - List of crawled links. + """ + lst = set() + ord_lst = [] + ord_lst.insert(0, self.website) + ord_lst_ind = 0 + log_path = self.out_path + '/log.txt' + + if self.logs is True and os.access(log_path, os.W_OK) is False: + print(f"## Unable to write to {self.out_path}/log.txt - Exiting") + sys.exit(2) + + print(f"## Crawler started from {self.website} with " + f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} " + f"second(s) delay.") + + # Depth + for index in range(0, int(self.c_depth)): + + # For every element of list. 
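+            # Each item is fetched and parsed; its <a>/<area> href values are canonicalised and the results extend ord_lst for the next depth pass.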
+ for item in ord_lst: + html_page = http.client.HTTPResponse + # Check if is the first element + if ord_lst_ind > 0: + try: + if item is not None: + html_page = urllib.request.urlopen(item) + except HTTPError as error: + print(error) + else: + html_page = urllib.request.urlopen(self.website) + ord_lst_ind += 1 + + soup = BeautifulSoup(html_page, features="html.parser") + + # For each tag. + for link in soup.findAll('a'): + link = link.get('href') + + if self.excludes(link): + continue + + ver_link = self.canonical(link) + lst.add(ver_link) + + # For each tag. + for link in soup.findAll('area'): + link = link.get('href') + + if self.excludes(link): + continue + + ver_link = self.canonical(link) + lst.add(ver_link) + + # TODO: For images + # TODO: For scripts + + # Pass new on list and re-set it to delete duplicates. + ord_lst = ord_lst + list(set(lst)) + ord_lst = list(set(ord_lst)) + + if self.verbose: + sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r") + sys.stdout.flush() + + # Pause time. + if (ord_lst.index(item) != len(ord_lst) - 1) and \ + float(self.c_pause) > 0: + time.sleep(float(self.c_pause)) + + # Keeps logs for every webpage visited. + if self.logs: + it_code = html_page.getcode() + with open(log_path, 'w+', encoding='UTF-8') as log_file: + log_file.write(f"[{str(it_code)}] {str(item)} \n") + + print(f"## Step {str(index + 1)} completed \n\t " + f"with: {str(len(ord_lst))} result(s)") + + return ord_lst diff --git a/modules/extractor.py b/modules/extractor.py index 5df4be1..5c2e422 100644 --- a/modules/extractor.py +++ b/modules/extractor.py @@ -1,27 +1,31 @@ #!/usr/bin/python - +import io import os import sys import urllib.error import urllib.parse import urllib.request +from urllib.error import HTTPError +from urllib.error import URLError def cinex(input_file, out_path): - """ Ingests the input links from file and extract them into path/files. + """ Ingests the crawled links from the input_file, + scrapes the contents of the resulting web pages and writes the contents to + the into out_path/{url_address}. - :param input_file: - :param out_path: - :return: + :param input_file: String: Filename of the crawled Urls. + :param out_path: String: Pathname of results. + :return: None """ + file = io.TextIOWrapper try: - global f - f = open(input_file, 'r') + file = open(input_file, 'r') except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\n## Can't open: {input_file}") + # error = sys.exc_info()[0] + print(f"Error: {err}\n## Can't open: {input_file}") - for line in f: + for line in file: # Generate the name for every file. try: @@ -38,80 +42,87 @@ def cinex(input_file, out_path): # Extract page to file try: - with open(out_path + "/" + output_file, 'wb') as f: - f.write(urllib.request.urlopen(line).read()) + with open(out_path + "/" + output_file, 'wb') as results: + results.write(urllib.request.urlopen(line).read()) print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}") except IOError as err: error = sys.exc_info()[0] - print(f"Error: {error}\n Can't write on file: {output_file}") + print(f"Error: {error}\nCan't write on file: {output_file}") + file.close() def intermex(input_file): """ Input links from file and extract them into terminal. - :param input_file: - :return: + :param input_file: String: File name of links file. 
+ :return: None """ try: - with open(input_file, 'r') as f: - for line in f: + with open(input_file, 'r') as file: + for line in file: print((urllib.request.urlopen(line).read())) + except (HTTPError, URLError) as err: + print(f"HTTPError: {err}") except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\n## Not valid file") + # error = sys.exc_info()[0] + print(f"Error: {err}\n## Not valid file") def outex(website, output_file, out_path): - """ Output the contents of the webpage into a file. + """ Scrapes the contents of the provided web address and outputs the + contents to file. - :param website: - :param output_file: - :param out_path: - :return: + :param website: String: Url of web address to scrape. + :param output_file: String: Filename of the results. + :param out_path: String: Folder name of the output findings. + :return: None """ # Extract page to file try: output_file = out_path + "/" + output_file - with open(output_file, 'wb') as f: - f.write(urllib.request.urlopen(website).read()) + with open(output_file, 'wb') as file: + file.write(urllib.request.urlopen(website).read()) print(f"## File created on: {os.getcwd()}/{output_file}") + except (HTTPError, URLError) as err: + print(f"HTTPError: {err}") except IOError as err: - error = sys.exc_info()[0] - print(f"Error: {error}\n Can't write on file: {output_file}") + # error = sys.exc_info()[0] + print(f"Error: {err}\n Can't write on file: {output_file}") def termex(website): - """ Output findings to the terminal. + """ Scrapes provided web address and prints the results to the terminal. - :param website: - :return: + :param website: String: URL of website to scrape. + :return: None """ try: print((urllib.request.urlopen(website).read())) except (urllib.error.HTTPError, urllib.error.URLError) as err: print(f"Error: ({err}) {website}") - return None + return -def extractor(website, crawl, output_file, input_ile, out_path): - """ +def extractor(website, crawl, output_file, input_file, out_path): + """ Extractor - scrapes the resulting website or discovered links. - :param website: - :param crawl: - :param output_file: - :param input_ile: - :param out_path: - :return: + :param website: String: URL of website to scrape. + :param crawl: Boolean: Cinex trigger. + If used iteratively scrape the urls from input_file. + :param output_file: String: Filename of resulting output from scrape. + :param input_file: String: Filename of crawled/discovered URLs + :param out_path: String: Dir path for output files. 
+ :return: None """ # TODO: Return output to torcrawl.py - if len(input_ile) > 0: + if len(input_file) > 0: if crawl: - cinex(input_ile, out_path) + cinex(input_file, out_path) # TODO: Extract from list into a folder # elif len(output_file) > 0: # inoutex(website, input_ile, output_file) else: - intermex(input_ile) + intermex(input_file) else: if len(output_file) > 0: outex(website, output_file, out_path) diff --git a/torcrawl.py b/torcrawl.py index 68764e1..a7a0157 100755 --- a/torcrawl.py +++ b/torcrawl.py @@ -50,7 +50,7 @@ from modules.checker import folder from modules.checker import url_canon # TorCrawl Modules -from modules.crawler import crawler +from modules.crawler import Crawler from modules.extractor import extractor @@ -185,8 +185,9 @@ def main(): out_path = folder(extract_domain(website), args.verbose) if args.crawl: - lst = crawler(website, c_depth, c_pause, out_path, args.log, - args.verbose) + crawler = Crawler(website, c_depth, c_pause, out_path, args.log, + args.verbose) + lst = crawler.crawl() with open(out_path + '/links.txt', 'w+', encoding='UTF-8') as file: for item in lst: file.write(f"{item}\n") From 52bcf14988ea173fe29076c8355166da7f4ed7a2 Mon Sep 17 00:00:00 2001 From: Aaron Bishop <93538312+the-siegfried@users.noreply.github.com> Date: Tue, 15 Mar 2022 22:46:10 +0000 Subject: [PATCH 3/4] Delete .gitignore file --- .gitignore | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitignore diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 723ef36..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea \ No newline at end of file From 83763130662be6e5b6edd23ce65cfff84a1bcfcf Mon Sep 17 00:00:00 2001 From: Aaron Bishop Date: Wed, 16 Mar 2022 21:13:27 +0000 Subject: [PATCH 4/4] Resolve PEP8 violations - Implements Error handling for uncaught http exceptions. - Implements TypeError handling for uncaught exceptions from BeautifulSoup. resolve-pep8-violations --- modules/crawler.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/modules/crawler.py b/modules/crawler.py index 2f7f591..6594cf8 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -111,11 +111,22 @@ def crawl(self): html_page = urllib.request.urlopen(item) except HTTPError as error: print(error) + continue else: - html_page = urllib.request.urlopen(self.website) - ord_lst_ind += 1 + try: + html_page = urllib.request.urlopen(self.website) + ord_lst_ind += 1 + except HTTPError as error: + print(error) + ord_lst_ind += 1 + continue - soup = BeautifulSoup(html_page, features="html.parser") + try: + soup = BeautifulSoup(html_page, features="html.parser") + except TypeError as err: + print(f"## Soup Error Encountered:: could to parse " + f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}") + continue # For each tag. for link in soup.findAll('a'):