diff --git a/modules/checker.py b/modules/checker.py
index c4a2152..127fe68 100644
--- a/modules/checker.py
+++ b/modules/checker.py
@@ -1,67 +1,96 @@
#!/usr/bin/python
-import sys
+import os
import re
import subprocess
-import os
-from urllib.request import urlopen
+import sys
from json import load
+from urllib.error import HTTPError
from urllib.parse import urlparse
+from urllib.request import urlopen
-def urlcanon(website, verbose):
- if not website.startswith("http"):
- if not website.startswith("www."):
- website = "www." + website
- if verbose:
- print(("## URL fixed: " + website))
- website = "http://" + website
- if verbose:
- print(("## URL fixed: " + website))
- return website
+def url_canon(website, verbose):
+ """
+
+ :param website: String -
+ :param verbose: Boolean -
+ :return: String 'website' -
+ """
+ if not website.startswith("http"):
+ if not website.startswith("www."):
+ website = "www." + website
+ if verbose:
+                print(f"## URL fixed: {website}")
+ website = "http://" + website
+ if verbose:
+            print(f"## URL fixed: {website}")
+ return website
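+
+
+# Illustrative behaviour of url_canon (a sketch, not part of the module; the
+# address below is made up): a bare domain gains both missing prefixes.
+#   url_canon("example.onion", verbose=False)  ->  "http://www.example.onion"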
def extract_domain(url, remove_http=True):
- uri = urlparse(url)
- if remove_http:
- domain_name = f"{uri.netloc}"
- else:
- domain_name = f"{uri.netloc}://{uri.netloc}"
- return domain_name
+ """
+
+ :param url: String -
+ :param remove_http: Boolean -
+ :return: String 'domain_name' -
+ """
+ uri = urlparse(url)
+ if remove_http:
+ domain_name = f"{uri.netloc}"
+ else:
+        domain_name = f"{uri.scheme}://{uri.netloc}"
+ return domain_name
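+
+
+# Illustrative behaviour of extract_domain (a sketch; the address is made up):
+#   extract_domain("http://example.onion/page.html")  ->  "example.onion"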
# Create output path
def folder(website, verbose):
- outpath = website
- if not os.path.exists(outpath):
- os.makedirs(outpath)
- if verbose:
- print(("## Folder created: " + outpath))
- return outpath
-
-
-# Check if TOR service is running
-def checktor(verbose):
- checkfortor = subprocess.check_output(['ps', '-e'])
-
- def findwholeword(w):
- return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
-
- if findwholeword('tor')(str(checkfortor)):
- if verbose:
- print("## TOR is ready!")
- else:
- print("## TOR is NOT running!")
- print('## Enable tor with \'service tor start\' or add -w argument')
- sys.exit(2)
-
-
-# Check your IP from external website
-def checkip():
- try:
- webipcheck = 'https://api.ipify.org/?format=json'
- my_ip = load(urlopen(webipcheck))['ip']
- print(('## Your IP: ' + my_ip))
- except:
- e = sys.exc_info()[0]
- print(("Error: %s" % e + "\n## IP can't obtain \n## Is " + webipcheck + "up?"))
+ """ Creates an output path for the findings.
+
+ :param website: String - URL of website to crawl.
+ :param verbose: Boolean - Logging level.
+ :return: String 'out_path' - Path of the output folder.
+ """
+ out_path = website
+ if not os.path.exists(out_path):
+ os.makedirs(out_path)
+ if verbose:
+ print(f"## Folder created: {out_path}")
+ return out_path
+
+
+def check_tor(verbose):
+ """Checks to see if TOR service is running on device.
+ Will exit if (-w) with argument is provided on application startup and TOR
+ service is not found to be running on the device.
+
+ :param verbose: Boolean -'verbose' logging argument.
+ :return: None
+ """
+ check_for_tor = subprocess.check_output(['ps', '-e'])
+
+ def find_whole_word(word):
+ return re.compile(r'\b({0})\b'.format(word),
+ flags=re.IGNORECASE).search
+
+ if find_whole_word('tor')(str(check_for_tor)):
+ if verbose:
+ print("## TOR is ready!")
+ else:
+ print("## TOR is NOT running!")
+ print('## Enable tor with \'service tor start\' or add -w argument')
+ sys.exit(2)
+
+
+def check_ip():
+ """ Checks users IP from external resource.
+ :return: None or HTTPError
+ """
+ addr = 'https://api.ipify.org/?format=json'
+ try:
+ my_ip = load(urlopen(addr))['ip']
+ print(f'## Your IP: {my_ip}')
+ except HTTPError as err:
+ error = sys.exc_info()[0]
+ print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? "
+ f"\n## HTTPError: {err}")
diff --git a/modules/crawler.py b/modules/crawler.py
index aaf8d38..6594cf8 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -1,145 +1,176 @@
#!/usr/bin/python
-
-import sys
+import http.client
+import os
import re
-import urllib.request
+import sys
import time
+import urllib.request
+from urllib.error import HTTPError
+
from bs4 import BeautifulSoup
-# Exclude links that we dont need
-def excludes(link, website, outpath):
- # BUG: For NoneType Exceptions, got to find a solution here
- if link is None:
- return True
- # Links
- elif '#' in link:
- return True
- # External links
- elif link.startswith('http') and not link.startswith(website):
- lstfile = open(outpath + '/extlinks.txt', 'w+')
- lstfile.write(str(link) + '\n')
- lstfile.close()
- return True
- # Telephone Number
- elif link.startswith('tel:'):
- lstfile = open(outpath + '/telephones.txt', 'w+')
- lstfile.write(str(link) + '\n')
- lstfile.close()
- return True
- # Mails
- elif link.startswith('mailto:'):
- lstfile = open(outpath + '/mails.txt', 'w+')
- lstfile.write(str(link) + '\n')
- lstfile.close()
- return True
- # Type of files
- elif re.search('^.*\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE):
- return True
-
-
-# Canonization of the link
-def canonical(link, website):
- # Already formatted
- if link.startswith(website):
- return link
- # For relative paths with / in front
- elif link.startswith('/'):
- if website[-1] == '/':
- finalLink = website[:-1] + link
- else:
- finalLink = website + link
- return finalLink
- # For relative paths without /
- elif re.search('^.*\.(html|htm|aspx|php|doc|css|js|less)$', link, re.IGNORECASE):
- # Pass to
- if website[-1] == '/':
- finalLink = website + link
- else:
- finalLink = website + "/" + link
- return finalLink
- # Clean links from '?page=' arguments
-
-
-# Core of crawler
-def crawler(website, cdepth, cpause, outpath, logs, verbose):
- lst = set()
- ordlst = []
- ordlst.insert(0, website)
- ordlstind = 0
-
- if logs:
- global logfile
- logfile = open(outpath + '/log.txt', 'w+')
-
- print((
- "## Crawler started from " + website +
- " with " + str(cdepth) + " depth crawl and " + str(cpause) + " second(s) delay:"
- ))
-
- # Depth
- for x in range(0, int(cdepth)):
-
- # For every element of list
- for item in ordlst:
-
- # Check if is the first element
- if ordlstind > 0:
- try:
- if item is not None:
- global html_page
- html_page = urllib.request.urlopen(item)
- except urllib.error.HTTPError as e:
- print(e)
- else:
- html_page = urllib.request.urlopen(website)
- ordlstind += 1
-
- soup = BeautifulSoup(html_page, features="html.parser")
-
- # For each tag
- for link in soup.findAll('a'):
- link = link.get('href')
-
- if excludes(link, website, outpath):
- continue
-
- verlink = canonical(link, website)
- lst.add(verlink)
-
- # For each tag
- for link in soup.findAll('area'):
- link = link.get('href')
-
- if excludes(link, website, outpath):
- continue
-
- verlink = canonical(link, website)
- lst.add(verlink)
-
- # TODO: For images
- # TODO: For scripts
-
- # Pass new on list and re-set it to delete duplicates
- ordlst = ordlst + list(set(lst))
- ordlst = list(set(ordlst))
-
- if verbose:
- sys.stdout.write("-- Results: " + str(len(ordlst)) + "\r")
- sys.stdout.flush()
-
- # Pause time
- if (ordlst.index(item) != len(ordlst) - 1) and float(cpause) > 0:
- time.sleep(float(cpause))
-
- # Keeps logs for every webpage visited
- if logs:
- itcode = html_page.getcode()
- logfile.write("[" + str(itcode) + "] " + str(item) + "\n")
-
- print(("## Step " + str(x + 1) + " completed with: " + str(len(ordlst)) + " result(s)"))
-
- if logs:
- logfile.close()
-
- return ordlst
+class Crawler:
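+    """ Crawls the target website up to the requested depth, collecting the
+    discovered links, and optionally logs every visited page.
+    """
+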
+ def __init__(self, website, c_depth, c_pause, out_path, logs, verbose):
+ self.website = website
+ self.c_depth = c_depth
+ self.c_pause = c_pause
+ self.out_path = out_path
+ self.logs = logs
+ self.verbose = verbose
+
+ def excludes(self, link):
+ """ Excludes links that are not required.
+
+        :param link: String - Link to check against the exclusion rules.
+        :return: Boolean - True if the link is excluded, otherwise None.
+ """
+ # BUG: For NoneType Exceptions, got to find a solution here
+ if link is None:
+ return True
+ # Links
+ elif '#' in link:
+ return True
+ # External links
+ elif link.startswith('http') and not link.startswith(self.website):
+ file_path = self.out_path + '/extlinks.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+ lst_file.write(str(link) + '\n')
+ return True
+ # Telephone Number
+ elif link.startswith('tel:'):
+ file_path = self.out_path + '/telephones.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+ lst_file.write(str(link) + '\n')
+ return True
+ # Mails
+ elif link.startswith('mailto:'):
+ file_path = self.out_path + '/mails.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+ lst_file.write(str(link) + '\n')
+ return True
+ # Type of files
+ elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link,
+ re.IGNORECASE):
+ return True
+
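+    # Illustrative exclusions (a sketch; the links are made up), assuming
+    # self.website == "http://example.onion":
+    #   None, "#top", "mailto:admin@example.onion", "tel:+301234567890" and
+    #   "http://other.onion/" are excluded; "/about.html" is not.
+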
+ def canonical(self, link):
+ """ Canonization of the link.
+
+        :param link: String - Link to canonicalize against self.website.
+        :return: String 'final_link' - The canonicalized link, or None if the
+            link cannot be resolved.
+ """
+ # Already formatted
+ if link.startswith(self.website):
+ return link
+ # For relative paths with / in front
+ elif link.startswith('/'):
+ if self.website[-1] == '/':
+ final_link = self.website[:-1] + link
+ else:
+ final_link = self.website + link
+ return final_link
+ # For relative paths without /
+ elif re.search('^.*\\.(html|htm|aspx|php|doc|css|js|less)$', link,
+ re.IGNORECASE):
+            # Join the relative path onto the website root.
+ if self.website[-1] == '/':
+ final_link = self.website + link
+ else:
+ final_link = self.website + "/" + link
+ return final_link
+
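+    # Illustrative results of canonical() (a sketch; the host is made up),
+    # assuming self.website == "http://example.onion":
+    #   "/about.html"  ->  "http://example.onion/about.html"
+    #   "index.php"    ->  "http://example.onion/index.php"
+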
+ def crawl(self):
+ """ Core of the crawler.
+ :return: List (ord_lst) - List of crawled links.
+ """
+ lst = set()
+ ord_lst = []
+ ord_lst.insert(0, self.website)
+ ord_lst_ind = 0
+ log_path = self.out_path + '/log.txt'
+
+        # The log file may not exist yet, so check that the output folder
+        # itself is writable.
+        if self.logs is True and not os.access(self.out_path, os.W_OK):
+            print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
+            sys.exit(2)
+
+ print(f"## Crawler started from {self.website} with "
+ f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} "
+ f"second(s) delay.")
+
+ # Depth
+ for index in range(0, int(self.c_depth)):
+
+ # For every element of list.
+ for item in ord_lst:
+                # Placeholder; if no page is fetched below, BeautifulSoup
+                # raises a TypeError on it and the item is skipped.
+                html_page = http.client.HTTPResponse
+                # Check whether this is the first element to be fetched.
+ if ord_lst_ind > 0:
+ try:
+ if item is not None:
+ html_page = urllib.request.urlopen(item)
+ except HTTPError as error:
+ print(error)
+ continue
+ else:
+ try:
+ html_page = urllib.request.urlopen(self.website)
+ ord_lst_ind += 1
+ except HTTPError as error:
+ print(error)
+ ord_lst_ind += 1
+ continue
+
+ try:
+ soup = BeautifulSoup(html_page, features="html.parser")
+            except TypeError as err:
+                print(f"## Soup Error Encountered: could not parse "
+                      f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}: "
+                      f"{err}")
+                continue
+
+            # For each <a> tag.
+ for link in soup.findAll('a'):
+ link = link.get('href')
+
+ if self.excludes(link):
+ continue
+
+ ver_link = self.canonical(link)
+ lst.add(ver_link)
+
+            # For each <area> tag.
+ for link in soup.findAll('area'):
+ link = link.get('href')
+
+ if self.excludes(link):
+ continue
+
+ ver_link = self.canonical(link)
+ lst.add(ver_link)
+
+ # TODO: For images
+ # TODO: For scripts
+
+            # Merge the new links into the list and remove duplicates.
+ ord_lst = ord_lst + list(set(lst))
+ ord_lst = list(set(ord_lst))
+
+ if self.verbose:
+ sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r")
+ sys.stdout.flush()
+
+ # Pause time.
+ if (ord_lst.index(item) != len(ord_lst) - 1) and \
+ float(self.c_pause) > 0:
+ time.sleep(float(self.c_pause))
+
+ # Keeps logs for every webpage visited.
+ if self.logs:
+ it_code = html_page.getcode()
+                    with open(log_path, 'a+', encoding='UTF-8') as log_file:
+ log_file.write(f"[{str(it_code)}] {str(item)} \n")
+
+ print(f"## Step {str(index + 1)} completed \n\t "
+ f"with: {str(len(ord_lst))} result(s)")
+
+ return ord_lst
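+
+
+# Illustrative use of the Crawler class (a sketch; the URL and settings are
+# made up and mirror the arguments torcrawl.py passes in):
+#   crawler = Crawler("http://example.onion/", c_depth=1, c_pause=1,
+#                     out_path="example.onion", logs=False, verbose=True)
+#   links = crawler.crawl()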
diff --git a/modules/extractor.py b/modules/extractor.py
index 8c446dc..5c2e422 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -1,92 +1,130 @@
#!/usr/bin/python
-
+import io
import os
import sys
-import urllib.request, urllib.parse, urllib.error
-
-
-# Input links from file and extract them into path/files
-def cinex(inputFile, outpath):
- try:
- global f
- f = open(inputFile, 'r')
- # print f
- except IOError:
- e = sys.exc_info()[0]
- print(("Error: %s" % e + "\n## Can't open " + inputFile))
-
- for line in f:
-
- # Generate name for every file
- try:
- pagename = line.rsplit('/', 1)
- clpagename = str(pagename[1])
- clpagename = clpagename[:-1]
- if len(clpagename) == 0:
- outputFile = "index.htm"
- else:
- outputFile = clpagename
- except IndexError as e:
- print("Error: %s" % e)
- continue
-
- # Extract page to file
- try:
- f = open(outpath + "/" + outputFile, 'wb')
- f.write(urllib.request.urlopen(line).read())
- f.close()
- print(("## File created on " + os.getcwd() + "/" + outpath + "/" + outputFile))
- except:
- e = sys.exc_info()[0]
- print(("Error: %s" % e + "\n Can't write on file " + outputFile))
-
-
-# Input links from file and extract them into terminal
-def intermex(inputFile):
- try:
- f = open(inputFile, 'r')
- for line in f:
- print((urllib.request.urlopen(line).read()))
- except:
- e = sys.exc_info()[0]
- print(("Error: %s" % e + "\n## Not valid file"))
-
-
-# Output webpage into a file
-def outex(website, outputFile, outpath):
- # Extract page to file
- try:
- outputFile = outpath + "/" + outputFile
- f = open(outputFile, 'wb')
- f.write(urllib.request.urlopen(website).read())
- f.close()
- print(("## File created on " + os.getcwd() + "/" + outputFile))
- except:
- e = sys.exc_info()[0]
- print(("Error: %s" % e + "\n Can't write on file " + outputFile))
-
-
-# Output to terminal
+import urllib.error
+import urllib.parse
+import urllib.request
+from urllib.error import HTTPError
+from urllib.error import URLError
+
+
+def cinex(input_file, out_path):
+ """ Ingests the crawled links from the input_file,
+ scrapes the contents of the resulting web pages and writes the contents to
+ the into out_path/{url_address}.
+
+ :param input_file: String: Filename of the crawled Urls.
+ :param out_path: String: Pathname of results.
+ :return: None
+ """
+ file = io.TextIOWrapper
+ try:
+ file = open(input_file, 'r')
+    except IOError as err:
+        print(f"Error: {err}\n## Can't open: {input_file}")
+        return
+
+ for line in file:
+
+ # Generate the name for every file.
+ try:
+ page_name = line.rsplit('/', 1)
+ cl_page_name = str(page_name[1])
+ cl_page_name = cl_page_name[:-1]
+ if len(cl_page_name) == 0:
+ output_file = "index.htm"
+ else:
+ output_file = cl_page_name
+ except IndexError as error:
+ print(f"Error: {error}")
+ continue
+
+ # Extract page to file
+ try:
+ with open(out_path + "/" + output_file, 'wb') as results:
+ results.write(urllib.request.urlopen(line).read())
+ print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
+        except IOError as err:
+            print(f"Error: {err}\nCan't write on file: {output_file}")
+ file.close()
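+
+
+# Illustrative file naming in cinex (a sketch; the addresses are made up):
+#   "http://example.onion/about.html\n"  ->  out_path/about.html
+#   "http://example.onion/\n"            ->  out_path/index.htm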
+
+
+def intermex(input_file):
+ """ Input links from file and extract them into terminal.
+
+ :param input_file: String: File name of links file.
+ :return: None
+ """
+ try:
+ with open(input_file, 'r') as file:
+ for line in file:
+                print(urllib.request.urlopen(line).read())
+ except (HTTPError, URLError) as err:
+ print(f"HTTPError: {err}")
+    except IOError as err:
+        print(f"Error: {err}\n## Not a valid file")
+
+
+def outex(website, output_file, out_path):
+ """ Scrapes the contents of the provided web address and outputs the
+ contents to file.
+
+ :param website: String: Url of web address to scrape.
+ :param output_file: String: Filename of the results.
+ :param out_path: String: Folder name of the output findings.
+ :return: None
+ """
+ # Extract page to file
+ try:
+ output_file = out_path + "/" + output_file
+ with open(output_file, 'wb') as file:
+ file.write(urllib.request.urlopen(website).read())
+ print(f"## File created on: {os.getcwd()}/{output_file}")
+ except (HTTPError, URLError) as err:
+ print(f"HTTPError: {err}")
+    except IOError as err:
+        print(f"Error: {err}\nCan't write on file: {output_file}")
+
+
def termex(website):
- try:
- print((urllib.request.urlopen(website).read()))
- except (urllib.error.HTTPError, urllib.error.URLError) as e:
- print(("Error: (%s) %s" % (e, website)))
- return None
-
-
-def extractor(website, crawl, outputFile, inputFile, outpath):
- # TODO: Return output to torcrawl.py
- if len(inputFile) > 0:
- if crawl:
- cinex(inputFile, outpath)
- # TODO: Extract from list into a folder
- # elif len(outputFile) > 0:
- # inoutex(website, inputFile, outputFile)
- else:
- intermex(inputFile)
- else:
- if len(outputFile) > 0:
- outex(website, outputFile, outpath)
- else:
- termex(website)
+ """ Scrapes provided web address and prints the results to the terminal.
+
+ :param website: String: URL of website to scrape.
+ :return: None
+ """
+ try:
+        print(urllib.request.urlopen(website).read())
+ except (urllib.error.HTTPError, urllib.error.URLError) as err:
+ print(f"Error: ({err}) {website}")
+ return
+
+
+def extractor(website, crawl, output_file, input_file, out_path):
+ """ Extractor - scrapes the resulting website or discovered links.
+
+ :param website: String: URL of website to scrape.
+    :param crawl: Boolean: Cinex trigger.
+        If True, the URLs from input_file are scraped iteratively.
+ :param output_file: String: Filename of resulting output from scrape.
+ :param input_file: String: Filename of crawled/discovered URLs
+ :param out_path: String: Dir path for output files.
+ :return: None
+ """
+ # TODO: Return output to torcrawl.py
+ if len(input_file) > 0:
+ if crawl:
+ cinex(input_file, out_path)
+ # TODO: Extract from list into a folder
+ # elif len(output_file) > 0:
+    #     inoutex(website, input_file, output_file)
+ else:
+ intermex(input_file)
+ else:
+ if len(output_file) > 0:
+ outex(website, output_file, out_path)
+ else:
+ termex(website)
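+
+
+# Dispatch summary for extractor() (descriptive note only):
+#   input_file given, crawl True   -> cinex(input_file, out_path)
+#   input_file given, crawl False  -> intermex(input_file)
+#   no input_file, output_file set -> outex(website, output_file, out_path)
+#   neither given                  -> termex(website)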
diff --git a/torcrawl.py b/torcrawl.py
index 3fd401c..a7a0157 100755
--- a/torcrawl.py
+++ b/torcrawl.py
@@ -1,22 +1,11 @@
#!/usr/bin/python
-
-import socket
-import socks
-import argparse
-
-# TorCrawl Modules
-from modules.crawler import crawler
-from modules.extractor import extractor
-from modules.checker import *
-
-help = '''
-
-TorCrawl.py is a python script to crawl and extract (regular or onion)
-webpages through TOR network.
+"""
+TorCrawl.py is a python script to crawl and extract (regular or onion)
+webpages through TOR network.
usage: python torcrawl.py [options]
-python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion
-python torcrawl.py -v -w -u http://www.github.com -o github.htm
+python torcrawl.py -u l0r3m1p5umD0lorS1t4m3t.onion
+python torcrawl.py -v -w -u http://www.github.com -o github.htm
python torcrawl.py -v -u l0r3m1p5umD0lorS1t4m3t.onion -c -d 2 -p 5
python torcrawl.py -v -w -u http://www.github.com -c -d 2 -p 5 -e -f GitHub
@@ -24,7 +13,7 @@
-h, --help : Help
--v, --verbose : Show more informations about the progress
+-v, --verbose : Show more information about the progress
-u, --url *.onion : URL of Webpage to crawl or extract
--w, --without : Without the use of Relay TOR
+-w, --without : Without the use of Relay TOR
Extract:
-e, --extract : Extract page's code to terminal or file.
@@ -37,156 +26,179 @@
-d, --cdepth : Set depth of crawl's travel (Default: 1)
-z, --exclusions : Paths that you don't want to include (TODO)
-s, --simultaneous: How many pages to visit at the same time (TODO)
--p, --pause : The length of time the crawler will pause
+-p, --pause : The length of time the crawler will pause
(Default: 0)
--f, --folder : The root directory which will contain the
+-f, --folder : The root directory which will contain the
generated files
-l, --log : Log file with visited URLs and their response code.
GitHub: github.com/MikeMeliz/TorCrawl.py
License: GNU General Public License v3.0
-'''
-
+"""
-# Set socket and connection with TOR network
-def connecttor():
- try:
- port = 9050
- # Set socks proxy and wrap the urllib module
- socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port)
- socket.socket = socks.socksocket
+import argparse
+import os
+import socket
+import sys
- # Perform DNS resolution through the socket
- def getaddrinfo(*args):
- return [(socket.AF_INET, socket.SOCK_STREAM, 6, '', (args[0], args[1]))]
+import socks # noqa - pysocks
- socket.getaddrinfo = getaddrinfo
- except:
- e = sys.exc_info()[0]
- print(("Error: %s" % e + "\n## Can't establish connection with TOR"))
+from modules.checker import check_ip
+from modules.checker import check_tor
+from modules.checker import extract_domain
+from modules.checker import folder
+from modules.checker import url_canon
+# TorCrawl Modules
+from modules.crawler import Crawler
+from modules.extractor import extractor
-def main():
- # Initialize necessary variables
- inputfile = outputfile = ''
- cpause = 0
- cdepth = 1
-
- # Get arguments with argparse
- parser = argparse.ArgumentParser(
- description="TorCrawl.py is a python script to crawl and extract (regular or onion) webpages through TOR network.")
-
- # General
- parser.add_argument(
- '-v',
- '--verbose',
- action='store_true',
- help='Show more information about the progress'
- )
- parser.add_argument(
- '-u',
- '--url',
- help='URL of webpage to crawl or extract'
- )
- parser.add_argument(
- '-w',
- '--without',
- action='store_true',
- help='Without the use of Relay TOR'
- )
-
- # Extract
- parser.add_argument(
- '-e',
- '--extract',
- action='store_true',
- help='Extract page\'s code to terminal or file.'
- )
- parser.add_argument(
- '-i',
- '--input',
- help='Input file with URL(s) (seperated by line)'
- )
- parser.add_argument(
- '-o',
- '--output',
- help='Output page(s) to file(s) (for one page)'
- )
-
- # Crawl
- parser.add_argument(
- '-c',
- '--crawl',
- action='store_true',
- help='Crawl website (Default output on /links.txt)'
- )
- parser.add_argument(
- '-d',
- '--cdepth',
- help='Set depth of crawl\'s travel (Default: 1)'
- )
- parser.add_argument(
- '-p',
- '--cpause',
- help='The length of time the crawler will pause'
- )
- parser.add_argument(
- '-l',
- '--log',
- action='store_true',
- help='A save log will let you see which URLs were visited and their response code'
- )
- parser.add_argument(
- '-f',
- '--folder',
- help='The root directory which will contain the generated files'
- )
-
- args = parser.parse_args()
-
- # Parse arguments to variables
- if args.input:
- inputfile = args.input
- if args.output:
- outputfile = args.output
- if args.cdepth:
- cdepth = args.cdepth
- if args.cpause:
- cpause = args.cpause
-
- # Connect to TOR
- if args.without is False:
- checktor(args.verbose)
- connecttor()
-
- if args.verbose:
- checkip()
- print(('## URL: ' + args.url))
-
- # Canon/ion of website and create path for output
- if len(args.url) > 0:
- global website
- global outpath
- website = urlcanon(args.url, args.verbose)
- if args.folder is not None:
- outpath = folder(args.folder, args.verbose)
- else:
- outpath = folder(extract_domain(website), args.verbose)
-
- if args.crawl:
- lst = crawler(website, cdepth, cpause, outpath, args.log, args.verbose)
- lstfile = open(outpath + '/links.txt', 'w+')
- for item in lst:
- lstfile.write("%s\n" % item)
- lstfile.close()
- print(("## File created on " + os.getcwd() + "/" + outpath + "/links.txt"))
- if args.extract:
- inputfile = outpath + "/links.txt"
- extractor(website, args.crawl, outputfile, inputfile, outpath)
- else:
- extractor(website, args.crawl, outputfile, inputfile, outpath)
+# Set socket and connection with TOR network
+def connect_tor():
+ """ Connect to TOR via DNS resolution through a socket.
+ :return: None or HTTPError.
+ """
+ try:
+ port = 9050
+ # Set socks proxy and wrap the urllib module
+ socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', port)
+ socket.socket = socks.socksocket
+
+ # Perform DNS resolution through the socket
+ def getaddrinfo(*args): # noqa
+ return [(socket.AF_INET, socket.SOCK_STREAM, 6, '',
+ (args[0], args[1]))]
+
+ socket.getaddrinfo = getaddrinfo # noqa
+ except socks.HTTPError as err:
+ error = sys.exc_info()[0]
+ print(f"Error: {error} \n## Cannot establish connection with TOR\n"
+ f"HTTPError: {err}")
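+
+
+# Note (illustrative; assumes TOR listens on its default SOCKS port 9050):
+# once connect_tor() has patched socket.socket, plain urllib calls such as
+#   urllib.request.urlopen("http://example.onion").read()
+# are tunnelled through the local SOCKS5 proxy at 127.0.0.1:9050.
+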
+def main():
+ """ Main method of TorCrawl application. Collects and parses arguments and
+ instructs the rest of the application on how to run.
+
+ :return: None
+ """
+
+ # Get arguments with argparse.
+ parser = argparse.ArgumentParser(
+ description="TorCrawl.py is a python script to crawl and extract "
+ "(regular or onion) webpages through TOR network.")
+
+ # General
+ parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Show more information about the progress'
+ )
+ parser.add_argument(
+ '-u',
+ '--url',
+ help='URL of webpage to crawl or extract'
+ )
+ parser.add_argument(
+ '-w',
+ '--without',
+ action='store_true',
+ help='Without the use of Relay TOR'
+ )
+
+ # Extract
+ parser.add_argument(
+ '-e',
+ '--extract',
+ action='store_true',
+ help='Extract page\'s code to terminal or file.'
+ )
+ parser.add_argument(
+ '-i',
+ '--input',
+        help='Input file with URL(s) (separated by line)'
+ )
+ parser.add_argument(
+ '-o',
+ '--output',
+ help='Output page(s) to file(s) (for one page)'
+ )
+
+ # Crawl
+ parser.add_argument(
+ '-c',
+ '--crawl',
+ action='store_true',
+ help='Crawl website (Default output on /links.txt)'
+ )
+ parser.add_argument(
+ '-d',
+ '--cdepth',
+ help='Set depth of crawl\'s travel (Default: 1)'
+ )
+ parser.add_argument(
+ '-p',
+ '--cpause',
+ help='The length of time the crawler will pause'
+ )
+ parser.add_argument(
+ '-l',
+ '--log',
+ action='store_true',
+        help='A saved log will let you see which URLs were visited and their '
+             'response code'
+ )
+ parser.add_argument(
+ '-f',
+ '--folder',
+ help='The root directory which will contain the generated files'
+ )
+
+ args = parser.parse_args()
+
+    # Parse arguments to variables, or fall back to defaults.
+    input_file = args.input if args.input else ''
+    output_file = args.output if args.output else ''
+    c_depth = args.cdepth if args.cdepth else 1
+    c_pause = args.cpause if args.cpause else 0
+
+ # Connect to TOR
+ if args.without is False:
+ check_tor(args.verbose)
+ connect_tor()
+
+ if args.verbose:
+ check_ip()
+        print(f'## URL: {args.url}')
+
+ website = ''
+ out_path = ''
+
+    # Canonicalize the website URL and create a path for the output.
+ if len(args.url) > 0:
+ website = url_canon(args.url, args.verbose)
+ if args.folder is not None:
+ out_path = folder(args.folder, args.verbose)
+ else:
+ out_path = folder(extract_domain(website), args.verbose)
+
+ if args.crawl:
+ crawler = Crawler(website, c_depth, c_pause, out_path, args.log,
+ args.verbose)
+ lst = crawler.crawl()
+ with open(out_path + '/links.txt', 'w+', encoding='UTF-8') as file:
+ for item in lst:
+ file.write(f"{item}\n")
+ print(f"## File created on {os.getcwd()}/{out_path}/links.txt")
+ if args.extract:
+ input_file = out_path + "/links.txt"
+ extractor(website, args.crawl, output_file, input_file, out_path)
+ else:
+ extractor(website, args.crawl, output_file, input_file, out_path)
+
+
+# Stub to call main method.
if __name__ == "__main__":
- main()
+ main()