diff --git a/.gitignore b/.gitignore
index b0b6f3a..a3b6bc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,12 @@
+# Project Specific
+output/*
+!output/.gitkeep
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 
-# C extensions
-*.so
-
 # Distribution / packaging
 .Python
 build/
@@ -26,12 +27,6 @@ share/python-wheels/
 *.egg
 MANIFEST
 
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
@@ -51,74 +46,6 @@ coverage.xml
 .pytest_cache/
 cover/
 
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
 # Environments
 .env
 .venv
@@ -128,33 +55,5 @@ ENV/
 env.bak/
 venv.bak/
 
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
 # PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
\ No newline at end of file
diff --git a/README.md b/README.md
index 21cf061..dc2a1a2 100644
--- a/README.md
+++ b/README.md
@@ -51,12 +51,12 @@ $ torcrawl -v -u http://www.github.com/ -c -d 2 -p 2
 ## Installation
 
 ### Easy Installation with pip:
-*Comming soon..*
+*Coming soon...*
 
 ### Manual Installation:
 1. **Clone this repository**:<br>
 `git clone https://github.com/MikeMeliz/TorCrawl.py.git`
-2. **Install dependecies**:<br>
+2. **Install dependencies**:<br>
 `pip install -r requirements.txt`
 3. **Install and Start TOR Service**:
     1. **Debian/Ubuntu**:<br>
@@ -82,12 +82,12 @@ $ torcrawl -v -u http://www.github.com/ -c -d 2 -p 2
 -f |--folder| The directory which will contain the generated files
 **Extract**: | |
 -e |--extract| Extract page's code to terminal or file (Default: Terminal)
--i |--input filename| Input file with URL(s) (seperated by line)
+-i |--input filename| Input file with URL(s) (separated by line)
 -o |--output [filename]| Output page(s) to file(s) (for one page)
 -y |--yara | Perform yara keyword search:<br>h = search entire html object,<br>t = search only text
 **Crawl**: | |
 -c |--crawl| Crawl website (Default output on website/links.txt)
--d |--cdepth| Set depth of crawler's travel (Default: 1)
+-d |--depth| Set depth of crawler's travel (Default: 1)
 -p |--pause| Seconds of pause between requests (Default: 0)
 -l |--log| Log file with visited URLs and their response code
 
@@ -134,8 +134,7 @@ $ python torcrawl.py -i links.txt
 
 ### As Crawler:
-Crawl the links of the webpage without the use of TOR,
-also show verbose output (really helpfull):
+Crawl the links of the webpage without the use of TOR, also show verbose output (really helpful):
 
 ```shell
 $ python torcrawl.py -v -w -u http://www.github.com/ -c
 ```
@@ -216,6 +215,10 @@ Feel free to contribute on this project! Just fork it, make any change on your f
 
 ## Changelog
 ```shell
+v1.32:
+    * Removed 1 second default pause between requests
+    * Several improvements on results
+    * Improved logs
 v1.31:
     * Fixed Input Link NoneType Error
     * Fixed name mismatch
diff --git a/modules/checker.py b/modules/checker.py
index 3099543..59b1566 100644
--- a/modules/checker.py
+++ b/modules/checker.py
@@ -50,14 +50,22 @@ def folder(website, verbose):
 
     :param website: String - URL of website to crawl.
    :param verbose: Boolean - Logging level.
-    :return: String 'out_path' - Path of the output folder.
+    :return: String 'output_folder' - Path of the output folder.
     """
-    out_path = website
-    if not os.path.exists(out_path):
-        os.makedirs(out_path)
+    parsed = urlparse(website)
+    if parsed.scheme != '':
+        output_folder = "output/" + urlparse(website).netloc
+    else:
+        output_folder = "output/" + website
+    if not os.path.exists(output_folder):
+        try:
+            os.makedirs(output_folder)
+        except FileExistsError:
+            if verbose:
+                print(f"## Folder exists already: {website}")
     if verbose:
-        print(f"## Folder created: {out_path}")
-    return out_path
+        print(f"## Folder created: {website}")
+    return output_folder
 
 
 def check_tor(verbose):
@@ -87,11 +95,11 @@ def check_ip():
     """ Checks users IP from external resource.
     :return: None or HTTPError
     """
-    addr = 'https://api.ipify.org/?format=json'
+    api_address = 'https://api.ipify.org/?format=json'
     try:
-        my_ip = load(urlopen(addr))['ip']
+        my_ip = load(urlopen(api_address))['ip']
         print(f'## Your IP: {my_ip}')
     except HTTPError as err:
         error = sys.exc_info()[0]
-        print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? "
+        print(f"Error: {error} \n## IP cannot be obtained. \n## Is {api_address} up? "
               f"\n## HTTPError: {err}")
diff --git a/modules/crawler.py b/modules/crawler.py
index f84b2fe..5430f61 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -26,7 +26,7 @@ def excludes(self, link):
         :param link: String
         :return: Boolean
         """
-        now = datetime.datetime.now().strftime("%Y%m%d")
+        now = datetime.datetime.now().strftime("%y%m%d")
 
         # BUG: For NoneType Exceptions, got to find a solution here
         if link is None:
@@ -36,31 +36,33 @@
             return True
         # External links
         elif link.startswith('http') and not link.startswith(self.website):
-            file_path = self.out_path + '/' + now + '_extlinks.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            file_path = self.out_path + '/' + now + '_ext-links.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Telephone Number
         elif link.startswith('tel:'):
             file_path = self.out_path + '/' + now + '_telephones.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Mails
         elif link.startswith('mailto:'):
             file_path = self.out_path + '/' + now + '_mails.txt'
-            with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
                 lst_file.write(str(link) + '\n')
             return True
         # Type of files
-        elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link,
-                       re.IGNORECASE):
+        elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE):
+            file_path = self.out_path + '/' + now + '_files.txt'
+            with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+                lst_file.write(str(link) + '\n')
             return True
 
     def canonical(self, link):
         """ Canonicalization of the link.
 
-        :param link: String
+        :param link: String: URL(s)
         :return: String 'final_link': parsed canonical url.
         """
         # Already formatted
@@ -83,6 +85,20 @@
             final_link = self.website + "/" + link
         return final_link
 
+    def write_log(self, log):
+        log_path = self.out_path + '/crawler.log'
+        now = datetime.datetime.now()
+
+        if self.logs is True:
+            open(log_path, 'a+')
+        if self.logs is True and os.access(log_path, os.W_OK) is False:
+            print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
+            sys.exit(2)
+        with open(log_path, 'a+', encoding='UTF-8') as log_file:
+            log_file.write(str(now) + " [crawler.py] " + log)
+            log_file.close()
+
+
     def crawl(self):
         """ Core of the crawler.
         :return: List (ord_lst) - List of crawled links.
@@ -91,11 +107,6 @@
         ord_lst = []
         ord_lst.insert(0, self.website)
         ord_lst_ind = 0
-        log_path = self.out_path + '/log.txt'
-
-        if self.logs is True and os.access(log_path, os.W_OK) is False:
-            print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
-            sys.exit(2)
 
         print(f"## Crawler started from {self.website} with "
               f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} "
@@ -113,27 +124,23 @@
                         if item is not None:
                             html_page = urllib.request.urlopen(item)
                     except (HTTPError, URLError) as error:
-                        print('## ERROR: Domain or link seems to be '
-                              'unreachable. Add -v to see the verbose error.'
-                              'Or write the full URL at -u argument!')
-                        if self.verbose: print(error)
+                        self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+                                       f"Message: {error}\n")
                         continue
                 else:
                     try:
                         html_page = urllib.request.urlopen(self.website)
                         ord_lst_ind += 1
                     except (HTTPError, URLError) as error:
-                        print('## ERROR: Domain or link seems to be '
-                              'unreachable. Add -v to see the verbose error.'
-                              'Or write the full URL at -u argument!')
-                        if self.verbose: print(error)
+                        self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+                                       f"Message: {error}\n")
                         ord_lst_ind += 1
                         continue
 
                 try:
                     soup = BeautifulSoup(html_page, features="html.parser")
-                except TypeError as err:
-                    print(f"## Soup Error Encountered:: could to parse "
+                except TypeError:
+                    print(f"## Soup Error Encountered:: couldn't parse "
                           f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}")
                     continue
 
@@ -159,7 +166,7 @@
                         if ver_link is not None:
                             lst.add(ver_link)
 
-                # TODO: For non-formal links, using RegEx
+                # TODO: For non-formal links, using RegEx, should be an additional parameter, and all patterns to be stored in a file
                 # url_pattern = r'/(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])/igm'
                 # html_content = urllib.request.urlopen(self.website).read().decode('utf-8')
 
@@ -180,21 +187,19 @@
                 ord_lst = ord_lst + list(set(lst))
                 ord_lst = list(set(ord_lst))
 
+                # Keeps logs for every webpage visited.
+                page_code = html_page.status
+                url_visited = f"[{str(page_code)}] {str(item)} \n"
+                self.write_log("[INFO] Parsed: " + url_visited)
+
                 if self.verbose:
-                    sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r")
+                    sys.stdout.write(" -- Results: " + str(len(ord_lst)) + "\r")
                     sys.stdout.flush()
 
-                # Pause time.
-                if (ord_lst.index(item) != len(ord_lst) - 1) and \
-                        float(self.c_pause) > 0:
+                # Add Pause time between each iteration
+                if (ord_lst.index(item) != len(ord_lst) - 1) and float(self.c_pause) > 0:
                     time.sleep(float(self.c_pause))
 
-                # Keeps logs for every webpage visited.
-                if self.logs:
-                    it_code = html_page.getcode()
-                    with open(log_path, 'w+', encoding='UTF-8') as log_file:
-                        log_file.write(f"[{str(it_code)}] {str(item)} \n")
-
             print(f"## Step {str(index + 1)} completed "
                   f"with: {str(len(ord_lst))} result(s)")
 
diff --git a/modules/extractor.py b/modules/extractor.py
index 89a6bd9..a4beacd 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -8,8 +8,8 @@ from urllib.error import URLError
 from http.client import InvalidURL
 from http.client import IncompleteRead
-
 from bs4 import BeautifulSoup
+from pathlib import Path
 
 from modules.checker import url_canon
 
 
@@ -55,16 +55,17 @@ def check_yara(raw=None, yara=0):
     return matches
 
 
-def cinex(input_file, out_path, yara=None):
+def input_file_to_folder(input_file, output_path, yara=None):
     """ Ingests the crawled links from the input_file,
     scrapes the contents of the resulting web pages and writes the contents to
     the into out_path/{url_address}.
 
     :param input_file: String: Filename of the crawled Urls.
-    :param out_path: String: Pathname of results.
+    :param output_path: String: Pathname of results.
     :param yara: Integer: Keyword search argument.
     :return: None
     """
+    i = 0
     file = io.TextIOWrapper
     try:
         file = open(input_file, 'r')
@@ -92,21 +93,25 @@
 
             if yara is not None:
                 full_match_keywords = check_yara(content, yara)
-
                 if len(full_match_keywords) == 0:
                     print('No matches found.')
                     continue
 
-            with open(out_path + "/" + output_file, 'wb') as results:
+            # Add an incremental in case of existing filename (eg. index.htm)
+            filename = Path(output_path + "/" + output_file)
+            if filename.is_file():
+                i += 1
+                filename = output_path + "/" + output_file + "(" + str(i) + ")"
+            with open(filename, 'wb') as results:
                 results.write(content)
-            print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
+            print(f"# File created on: {os.getcwd()}/{filename}")
         except HTTPError as e:
-            print(f"Cinex Error: {e.code}, cannot access: {e.url}")
+            print(f"Error: {e.code}, cannot access: {e.url}")
             continue
-        except InvalidURL as e:
-            print(f"Invalid URL: {line} \n Skipping...")
+        except InvalidURL:
+            print(f"Invalid URL: {line}, \n Skipping...")
             continue
-        except IncompleteRead as e:
+        except IncompleteRead:
             print(f"IncompleteRead on {line}")
             continue
         except IOError as err:
@@ -114,7 +119,7 @@
     file.close()
 
 
-def intermex(input_file, yara):
+def input_file_to_terminal(input_file, yara):
     """ Input links from file and extract them into terminal.
 
     :param input_file: String: File name of links file.
@@ -140,19 +145,19 @@
         print(f"ERROR: {err}\n## Not valid file. File tried: " + input_file)
 
 
-def outex(website, output_file, out_path, yara):
+def url_to_folder(website, output_file, output_path, yara):
     """ Scrapes the contents of the provided web address and outputs the
     contents to file.
 
     :param website: String: Url of web address to scrape.
     :param output_file: String: Filename of the results.
-    :param out_path: String: Folder name of the output findings.
+    :param output_path: String: Folder name of the output findings.
     :param yara: Integer: Keyword search argument.
     :return: None
     """
     # Extract page to file
     try:
-        output_file = out_path + "/" + output_file
+        output_file = output_path + "/" + output_file
         content = urllib.request.urlopen(website).read()
 
         if yara is not None:
@@ -170,7 +175,7 @@
         print(f"Error: {err}\n Can't write on file: {output_file}")
 
 
-def termex(website, yara):
+def url_to_terminal(website, yara):
     """ Scrapes provided web address and prints the results to the terminal.
 
     :param website: String: URL of website to scrape.
@@ -193,29 +198,28 @@
         return
 
 
-def extractor(website, crawl, output_file, input_file, out_path, selection_yara):
+def extractor(website, crawl, output_file, input_file, output_path, selection_yara):
     """ Extractor - scrapes the resulting website or discovered links.
 
     :param website: String: URL of website to scrape.
-    :param crawl: Boolean: Cinex trigger.
+    :param crawl: Boolean: input_file_to_folder trigger.
         If used iteratively scrape the urls from input_file.
     :param output_file: String: Filename of resulting output from scrape.
     :param input_file: String: Filename of crawled/discovered URLs
-    :param out_path: String: Dir path for output files.
+    :param output_path: String: Dir path for output files.
     :param selection_yara: String: Selected option of HTML or Text.
     :return: None
     """
-    # TODO: Return output to torcrawl.py
     if len(input_file) > 0:
         if crawl:
-            cinex(input_file, out_path, selection_yara)
+            input_file_to_folder(input_file, output_path, selection_yara)
         # TODO: Extract from list into a folder
         # elif len(output_file) > 0:
-        #     inoutex(website, input_ile, output_file)
+        #     input_list_to_folder(website, input_ile, output_file)
         else:
-            intermex(input_file, selection_yara)
+            input_file_to_terminal(input_file, selection_yara)
     else:
         if len(output_file) > 0:
-            outex(website, output_file, out_path, selection_yara)
+            url_to_folder(website, output_file, output_path, selection_yara)
         else:
-            termex(website, selection_yara)
+            url_to_terminal(website, selection_yara)
diff --git a/modules/tests/__init__.py b/modules/tests/__init__.py
index c976c13..e69de29 100644
--- a/modules/tests/__init__.py
+++ b/modules/tests/__init__.py
@@ -1,3 +0,0 @@
-from modules import checker
-from modules import crawler
-from modules import extractor
diff --git a/modules/tests/test_checker.py b/modules/tests/test_checker.py
index b3c911b..35c0b96 100644
--- a/modules/tests/test_checker.py
+++ b/modules/tests/test_checker.py
@@ -22,7 +22,7 @@ def test_url_canon_001(self):
         Returns true if the function successfully performs URL normalisation.
         """
         url = 'torcrawl.com'
-        expected = 'http://www.torcrawl.com'
+        expected = 'https://torcrawl.com'
         result = url_canon(url, False)
         self.assertEqual(expected, result,
                          f'Test Fail:: expected = {expected}, got {result}')
@@ -32,7 +32,7 @@ def test_url_canon_002(self):
         Returns true if the function successfully performs URL normalisation.
         """
         url = 'www.torcrawl.com'
-        expected = 'http://www.torcrawl.com'
+        expected = 'https://www.torcrawl.com'
         result = url_canon(url, False)
         self.assertEqual(expected, result,
                          f'Test Fail:: expected = {expected}, got {result}')
diff --git a/modules/tests/test_crawler.py b/modules/tests/test_crawler.py
index 6dbb3d2..1beb771 100644
--- a/modules/tests/test_crawler.py
+++ b/modules/tests/test_crawler.py
@@ -20,8 +20,7 @@ def tearDown(self):
 
     def test_excludes(self):
         """ Test crawler.excludes function.
-        Return True if the function successfully excludes the the provided
-        failing links.
+        Return True if the function successfully excludes the provided failing links.
         """
         _uri = 'http://www.torcrawl.com'
         failing_links = ['#', 'tel:012-013-104-5',
@@ -38,7+37,7 @@ def test_canonical(self):
         Return True if the function successfully normalizes the provided
         failing links.
         """
-        _uri = 'http://www.torcrawl.com/'
+        _uri = 'https://torcrawl.com/'
         links = [[f'{_uri}sundance', f'{_uri}sundance'],
                  ['/sundance', f'{_uri}sundance'],
                  [f'{_uri}bob.html', f'{_uri}bob.html'],
diff --git a/output/.gitkeep b/output/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/torcrawl.py b/torcrawl.py
index dba3a23..13e2c39 100755
--- a/torcrawl.py
+++ b/torcrawl.py
@@ -11,14 +11,14 @@
 General:
 -h, --help : Help
--v, --verbose : Show more informations about the progress
+-v, --verbose : Show more information about the progress
 -u, --url *.onion : URL of Webpage to crawl or extract
 -w, --without : Without the use of Relay TOR
 
 Extract:
 -e, --extract : Extract page's code to terminal or file.
-                (Defualt: terminal)
--i, --input filename : Input file with URL(s) (seperated by line)
+                (Default: terminal)
+-i, --input filename : Input file with URL(s) (separated by line)
 -o, --output [filename] : Output page(s) to file(s) (for one page)
 -y, --yara : Yara keyword search page categorisation
              read in from /res folder.
 
@@ -27,7 +27,7 @@
 Crawl:
 -c, --crawl : Crawl website (Default output on /links.txt)
--d, --cdepth : Set depth of crawl's travel (Default: 1)
+-d, --depth : Set depth of crawl's travel (Default: 1)
 -z, --exclusions : Paths that you don't want to include (TODO)
 -s, --simultaneous: How many pages to visit at the same time (TODO)
 -p, --pause : The length of time the crawler will pause
@@ -123,7 +123,7 @@ def main():
     parser.add_argument(
         '-i',
         '--input',
-        help='Input file with URL(s) (seperated by line)'
+        help='Input file with URL(s) (separated by line)'
     )
     parser.add_argument(
         '-o',
@@ -140,12 +140,12 @@
     )
     parser.add_argument(
         '-d',
-        '--cdepth',
+        '--depth',
         help='Set depth of crawl\'s travel (Default: 1)'
     )
     parser.add_argument(
         '-p',
-        '--cpause',
+        '--pause',
         help='The length of time the crawler will pause'
     )
     parser.add_argument(
@@ -169,25 +169,25 @@
     args = parser.parse_args()
 
-    now = datetime.datetime.now().strftime("%Y%m%d")
+    now = datetime.datetime.now().strftime("%y%m%d")
 
     # Canonicalization of web url and create path for output.
     website = ''
-    out_path = ''
+    output_folder = ''
 
     if args.input:
         pass
     elif len(args.url) > 0:
         website = url_canon(args.url, args.verbose)
         if args.folder is not None:
-            out_path = folder(args.folder, args.verbose)
+            output_folder = folder(args.folder, args.verbose)
         else:
-            out_path = folder(extract_domain(website), args.verbose)
+            output_folder = folder(extract_domain(website), args.verbose)
 
     # Parse arguments to variables else initiate variables.
     input_file = args.input if args.input else ''
     output_file = args.output if args.output else ''
-    c_depth = args.cdepth if args.cdepth else 0
-    c_pause = args.cpause if args.cpause else 1
+    depth = args.depth if args.depth else 0
+    pause = args.pause if args.pause else 0
     selection_yara = args.yara if args.yara else None
 
     # Connect to TOR
@@ -200,12 +200,12 @@
     if args.url:
         print(('## URL: ' + args.url))
 
     if args.crawl:
-        crawler = Crawler(website, c_depth, c_pause, out_path, args.log,
+        crawler = Crawler(website, depth, pause, output_folder, args.log,
                           args.verbose)
         lst = crawler.crawl()
-        if args.input == None:
-            input_file = out_path + '/' + now + '_links.txt'
+        if args.input is None:
+            input_file = output_folder + '/' + now + '_links.txt'
 
         with open(input_file, 'w+', encoding='UTF-8') as file:
             for item in lst:
@@ -213,10 +213,10 @@
         print(f"## File created on {os.getcwd()}/{input_file}")
 
     if args.extract:
-        extractor(website, args.crawl, output_file, input_file, out_path,
+        extractor(website, args.crawl, output_file, input_file, output_folder,
                   selection_yara)
     else:
-        extractor(website, args.crawl, output_file, input_file, out_path,
+        extractor(website, args.crawl, output_file, input_file, output_folder,
                   selection_yara)