From 174d769140a6b808f48ec4755e16e380c762d708 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 11:27:34 -0400 Subject: [PATCH 01/25] Add httpx and tabulate to poetry --- poetry.lock | 138 +++++++++++++++++++++++++++++++++++++++++-------- pyproject.toml | 5 +- 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/poetry.lock b/poetry.lock index 50897902..782962e8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "altgraph" @@ -11,6 +11,27 @@ files = [ {file = "altgraph-0.17.2.tar.gz", hash = "sha256:ebf2269361b47d97b3b88e696439f6e4cbc607c17c51feb1754f90fb79839158"}, ] +[[package]] +name = "anyio" +version = "4.0.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.0.0-py3-none-any.whl", hash = "sha256:cfdb2b588b9fc25ede96d8db56ed50848b0b649dca3dd1df0b11f683bb9e0b5f"}, + {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.22)"] + [[package]] name = "beautifulsoup4" version = "4.11.1" @@ -65,6 +86,75 @@ files = [ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, ] +[[package]] +name = "exceptiongroup" +version = "1.1.3" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "0.18.0" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-0.18.0-py3-none-any.whl", hash = "sha256:adc5398ee0a476567bf87467063ee63584a8bce86078bf748e48754f60202ced"}, + {file = "httpcore-0.18.0.tar.gz", hash = "sha256:13b5e5cd1dca1a6636a6aaea212b19f4f85cd88c366a2b82304181b769aab3c9"}, +] + +[package.dependencies] +anyio = ">=3.0,<5.0" +certifi = "*" +h11 = ">=0.13,<0.15" +sniffio = "==1.*" + +[package.extras] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "httpx" +version = "0.25.0" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.25.0-py3-none-any.whl", hash = "sha256:181ea7f8ba3a82578be86ef4171554dd45fec26a02556a744db029a0a27b7100"}, + {file = "httpx-0.25.0.tar.gz", hash = "sha256:47ecda285389cb32bb2691cc6e069e3ab0205956f681c5b2ad2325719751d875"}, +] + +[package.dependencies] +certifi = "*" +httpcore = ">=0.18.0,<0.19.0" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + [[package]] name = "idna" version = "3.3" @@ -332,25 +422,6 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] -[[package]] -name = "requests-mock" -version = "1.9.3" -description = "Mock out responses from the requests package" -optional = false -python-versions = "*" -files = [ - {file = "requests-mock-1.9.3.tar.gz", hash = "sha256:8d72abe54546c1fc9696fa1516672f1031d72a55a1d66c85184f972a24ba0eba"}, - {file = "requests_mock-1.9.3-py2.py3-none-any.whl", hash = "sha256:0a2d38a117c08bb78939ec163522976ad59a6b7fdd82b709e23bb98004a44970"}, -] - -[package.dependencies] -requests = ">=2.3,<3" -six = "*" - -[package.extras] -fixture = ["fixtures"] -test = ["fixtures", "mock", "purl", "pytest", "sphinx", "testrepository (>=0.0.18)", "testtools"] - [[package]] name = "scikit-learn" version = "1.3.0" @@ -471,6 +542,17 @@ files = [ [package.dependencies] scikit-learn = "*" +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + [[package]] name = "soupsieve" version = "2.3.2.post1" @@ -482,6 +564,20 @@ files = [ {file = "soupsieve-2.3.2.post1.tar.gz", hash = "sha256:fc53893b3da2c33de295667a0e19f078c14bf86544af307354de5fcf12a3f30d"}, ] +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, + {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, +] + +[package.extras] +widechars = ["wcwidth"] + [[package]] name = "termcolor" version = "1.1.0" @@ -595,4 +691,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<=3.11.4" -content-hash = "b8d1390ad998dd46bc5dd5ae402ac9a50158c2a4873a5a2ef1d7a3791870ced7" +content-hash = "ebad665d65bb7d8a6b22362b2ada5cca42961b41f25ff95dbd6b25a65ab803f1" diff --git a/pyproject.toml b/pyproject.toml index 1ef6bef4..bf865fb8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,8 +22,6 @@ pyinstaller = "5.13.0" pyinstaller-hooks-contrib = "2022.7" PySocks = "1.7.1" python-dotenv = "0.20.0" -requests = "2.31.0" -requests-mock = "1.9.3" scikit-learn = "1.3.0" scipy = "1.10.0" six = "1.16.0" @@ -39,6 +37,9 @@ yattag = "1.14.0" treelib = "^1.6.1" numpy = "1.24.4" unipath = "^1.1" +httpx = "^0.25.0" +requests = "^2.31.0" +tabulate = "^0.9.0" [tool.poetry.dev-dependencies] From 1f2a293fb29bcc10b50d97ce4df5c918141ad8f5 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 11:28:28 -0400 
Subject: [PATCH 02/25] Update main.py to handle new changes --- torbot/main.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/torbot/main.py b/torbot/main.py index a49b6f49..a4b1552e 100644 --- a/torbot/main.py +++ b/torbot/main.py @@ -5,7 +5,9 @@ import sys from .modules import link_io -from .modules.linktree import LinkTree + +from .modules.link_io import pprint_tree, print_tor_ip_address +from .modules.api import get_node from .modules.color import color from .modules.updater import check_version from .modules.savefile import saveJson @@ -21,6 +23,7 @@ class TorBot: def __init__(self, args): self.args = args + self.__version__ = version def get_header(self): license_msg = color("LICENSE: GNU Public License v3", "red") @@ -70,7 +73,7 @@ def handle_tree_args(self, args): """ Outputs tree visual for data """ - tree = LinkTree(args.url, args.depth) + ''' # -v/--visualize if args.visualize: tree.show() @@ -79,28 +82,36 @@ def handle_tree_args(self, args): if args.download: file_name = str(input("File Name (.txt): ")) tree.save(file_name) + ''' def perform_action(self): args = self.args + + # If url flag is set then check for accompanying flag set. Only one + # additional flag can be set with -u/--url flag + if not args.url: + print("usage: See run.py -h for possible arguments.") + sys.exit() + if args.gather: collect_data(args.url) - return + sys.exit() # If flag is -v, --update, -q/--quiet then user only runs that operation # because these are single flags only if args.version: - print("TorBot Version:" + self.__version__) + print(f"TorBot Version: {self.__version__}") sys.exit() if args.update: check_version() sys.exit() if not args.quiet: self.get_header() - # If url flag is set then check for accompanying flag set. 
Only one - # additional flag can be set with -u/--url flag - if not args.url: - print("usage: See run.py -h for possible arguments.") - link_io.print_tor_ip_address() + + print_tor_ip_address() + + tree = get_node(args.url, args.depth) + if args.classify: result = main.classify(args.url) print("Website Classification: " + result[0], "| Accuracy: " + str(result[1])) @@ -114,7 +125,7 @@ def perform_action(self): execute_all(args.url) else: if args.url: - link_io.print_tree(args.url, args.depth, args.classifyAll) + pprint_tree(tree) print("\n\n") @@ -130,7 +141,7 @@ def get_args(): parser.add_argument("-s", "--save", action="store_true", help="Save results in a file") parser.add_argument("-m", "--mail", action="store_true", help="Get e-mail addresses from the crawled sites") parser.add_argument("-p", "--phone", action="store_true", help="Get phone numbers from the crawled sites") - parser.add_argument("--depth", help="Specifiy max depth of crawler (default 1)", default=1) + parser.add_argument("--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1) parser.add_argument("--gather", action="store_true", help="Gather data for analysis") parser.add_argument("-v", "--visualize", action="store_true", help="Visualizes tree of data gathered.") parser.add_argument("-d", "--download", action="store_true", help="Downloads tree of data gathered.") From e06fa5000775e3ece5883c5db4479ab2508835af Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 11:28:52 -0400 Subject: [PATCH 03/25] Remove the use of gotor for building trees and retrieving IP address --- torbot/modules/api.py | 65 +++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/torbot/modules/api.py b/torbot/modules/api.py index b3557ba2..56e89181 100644 --- a/torbot/modules/api.py +++ b/torbot/modules/api.py @@ -3,37 +3,58 @@ Provides access to external services using API wrappers """ -import requests +import httpx +import logging + +from treelib import Tree +from bs4 import BeautifulSoup, Tag -from .log import debug from .config import host, port +from .linktree import append_node, build_tree base_url: str = f'http://{host}:{port}' +logging.getLogger("httpx").setLevel(logging.WARNING) -def get_node(link: str, depth: int): +def get_node(url: str, depth: int): """ Returns the LinkTree for the given link at the specified depth. """ - endpoint = f'/tree?link={link}&depth={depth}' - url = base_url + endpoint - debug(f'requesting {url}') - resp = requests.get(url) - data = resp.json() - debug(f'retrieved {data}') - return data + tree = Tree() + append_node(tree, id=url, parent_id=None) + build_tree(tree, url, depth) + return tree -def get_ip(): +def get_ip() -> dict: """ Returns the IP address of the current Tor client the service is using. 
""" - endpoint = '/ip' - url = base_url + endpoint - debug(f'requesting {url}') - resp = requests.get(url) - debug(f'retrieved {resp.text}') - return resp.text + resp = httpx.get("https://check.torproject.org/") + soup = BeautifulSoup(resp.text, features='html.parser') + + # Get the content of check tor project, this contains the header and body + content = soup.find("div", {"class": "content"}) + if not content: + raise Exception("unable to find content to parse IP.") + + # parse the header + header_tag = content.find("h1") + if not header_tag: + raise Exception("unable to find header") + if not isinstance(header_tag, Tag): + raise Exception("invalid header found") + header = header_tag.get_text().strip() + + # parse the main content containing the IP address + body_tag = content.find("p") + if not body_tag: + raise Exception("unable to find body") + if not isinstance(body_tag, Tag): + raise Exception("invalid body found") + body = body_tag.get_text().strip() + + return {"header": header, "body": body} def get_emails(link: str): @@ -42,10 +63,8 @@ def get_emails(link: str): """ endpoint = f'/emails?link={link}' url = base_url + endpoint - debug(f'requesting {url}') - resp = requests.get(url) + resp = httpx.get(url) data = resp.json() - debug(f'retrieved {data}') return data @@ -55,10 +74,8 @@ def get_phone(link: str): """ endpoint = f'/phone?link={link}' url = base_url + endpoint - debug(f'requesting {url}') - resp = requests.get(url) + resp = httpx.get(url) data = resp.json() - debug(f'retrieved {data}') return data @@ -69,6 +86,6 @@ def get_web_content(link: str): endpoint = f'/content?link={link}' url = base_url + endpoint debug(f'requesting {url}') - resp = requests.get(url) + resp = httpx.get(url) debug(f'retrieved {resp.text}') return resp.text From 71f8cc887853d50d45fd0eec938ecfe9f3177a3f Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 11:29:07 -0400 Subject: [PATCH 04/25] Utilize new api in IO module --- torbot/modules/link_io.py | 78 ++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/torbot/modules/link_io.py b/torbot/modules/link_io.py index b4ea0d10..fa0ba921 100644 --- a/torbot/modules/link_io.py +++ b/torbot/modules/link_io.py @@ -2,63 +2,56 @@ This module is used for reading HTML pages using either bs4.BeautifulSoup objects or url strings """ +import http.client +import tabulate from pprint import pprint -from typing import Any +from treelib import Tree -from .linktree import LinkTree -from .api import get_web_content, get_node, get_emails, get_phone, get_ip +from .api import get_node, get_emails, get_phone, get_ip from .color import color -from .nlp.main import classify -def print_tor_ip_address(): +def print_tor_ip_address() -> None: """ https://check.torproject.org/ tells you if you are using tor and it displays your IP address which we scape and display """ - print('Attempting to connect to https://check.torproject.org/') - ip_string = color(get_ip(), 'yellow') - print(f'Tor IP Address: {ip_string}') + resp = get_ip() + print(resp["header"]) + print(color(resp["body"], "yellow")) -def print_node(node: LinkTree, classify_page: bool): +def pprint_tree(tree: Tree) -> None: """ Prints the status of a link based on it's connection status """ - try: - title = node['url'] - status_text = f"{node['status_code']} {node['status']}" - if classify_page: - classification = classify(get_web_content(node['url'])) - status_text += f" {classification}" - if node['status_code'] >= 200 and node['status_code'] < 300: - 
status = color(status_text, 'green')
-        elif node['status_code'] >= 300 and node['status_code'] < 400:
-            status = color(status_text, 'yellow')
+    nodes = tree.all_nodes_itr()
+    table_data = []
+
+    def insert(node, color_code):
+        status = str(node.data.status)
+        code = http.client.responses[node.data.status]
+        status_message = f'{status} {code}'
+        table_data.append([
+            node.tag,
+            node.identifier,
+            color(status_message, color_code),
+            node.data.classification,
+        ])
+
+    for node in nodes:
+        status_code = node.data.status
+        if status_code >= 200 and status_code < 300:
+            insert(node, 'green')
+        elif status_code >= 300 and status_code < 400:
+            insert(node, 'yellow')
         else:
-            status = color(status_text, 'red')
-    except Exception:
-        title = "NOT FOUND"
-        status = color('Unable to reach destination.', 'red')
-
-    status_msg = "%-60s %-20s" % (title, status)
-    print(status_msg)
-
-
-def cascade(node: LinkTree, work: Any, classify_page: bool):
-    work(node, classify_page)
-    if node['children']:
-        for child in node['children']:
-            cascade(child, work, classify_page)
-
-
-def print_tree(url: str, depth: int = 1, classify_page: bool = False):
-    """
-    Prints the entire tree in a user friendly fashion
-    """
-    root = get_node(url, depth)
-    cascade(root, print_node, classify_page)
+            insert(node, 'red')
+
+    headers = ["Title", "URL", "Status", "Category"]
+    table = tabulate.tabulate(table_data, headers=headers)
+    print(table)
 
 
 def print_json(url: str, depth: int = 1):
@@ -69,8 +62,7 @@ def print_json(url: str, depth: int = 1):
     root (dict): Dictionary containing the root node and its children
     """
     root = get_node(url, depth)
-    pprint(root)
-    return root
+    print(root.to_json())
 
 
 def print_emails(url: str):

From d727061f73859219b9b5da7c1f8feeb7a4cc98b4 Mon Sep 17 00:00:00 2001
From: Akeem King 
Date: Sun, 8 Oct 2023 11:29:32 -0400
Subject: [PATCH 05/25] Remove LinkTree class and use treelib structure for hosting nodes

---
 torbot/modules/linktree.py | 127 ++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 71 deletions(-)

diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index 22e31f8d..eb1c9327 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -2,87 +2,72 @@
 Module is used for analyzing link relationships
 """
 import os
+import re
+import httpx
+import validators
+import logging
 
-from treelib import Tree, exceptions
+from treelib import Tree, exceptions, Node
+from bs4 import BeautifulSoup
 
-from .api import get_node
-from .config import get_data_directory
-from .log import debug
+from .nlp.main import classify
 
+class Link(Node):
+    def __init__(self, title: str, url: str, status: int, classification: str, accuracy: float):
+        self.identifier = url
+        self.tag = title
+        self.status = status
+        self.classification = classification
+        self.accuracy = accuracy
 
-def formatNode(n):
-    return f"{n['url']} {n['status_code']} {n['status']}"
-
-def build_tree_recursive(t, n):
-
-    # this will only be ran on the root node since others will exist before being passed
-    parent_id = n["url"]
-    if not t.contains(parent_id):
-        debug(f"adding id {parent_id}")
-        t.create_node(formatNode(n), parent_id)
-
-    # if there are no children, there's nothing to process
-    children = n["children"]
-    if not children:
-        return
-
-    for child in children:
-        try:
-            child_id = child["url"]
-            debug(f"adding child_id {child_id} to parent_id {parent_id}")
-            t.create_node(formatNode(child), child_id, parent=parent_id)
-        except exceptions.DuplicatedNodeIdError:
-            debug(f"found a duplicate url 
{child_id}") - continue # this node has already been processed somewhere else - - build_tree_recursive(t, child) - - -class LinkTree: +def parse_links(html: str) -> list[str]: """ - This is a class that represents a tree of links within TorBot. This can - be used to build a tree, examine the number of nodes, check if a node - exists within a tree, displaying the tree, and downloading the tree. It - will be expanded in the future to meet further needs. + Finds all anchor tags and parses the href attribute. """ + soup = BeautifulSoup(html, 'html.parser') + tags = soup.find_all('a') + return [tag['href'] for tag in tags if tag.has_attr('href') and validators.url(tag['href'])] - def __init__(self, root: str, depth: int): - self.__build_tree(root, depth) - def __build_tree(self, url: str, depth: int = 1): - """ - Builds link tree by traversing through children nodes. +def append_node(tree: Tree, id: str, parent_id: str | None) -> None: + """ + Creates a node for a tree using the given ID which corresponds to a URL. + If the parent_id is None, this will be considered a root node. + """ + resp = httpx.get(id) + soup = BeautifulSoup(resp.text, 'html.parser') + title = soup.title.text.strip() if soup.title is not None else id + try: + [classification, accuracy] = classify(resp.text) + data = Link(title, id, resp.status_code, classification, accuracy) + tree.create_node(title, identifier=id, parent=parent_id, data=data) + except exceptions.DuplicatedNodeIdError: + logging.debug(f"found a duplicate URL {id}") + + +def build_tree(tree: Tree, url: str, depth: int) -> None: + """ + Builds a tree from the root to the given depth. + """ + if depth > 0: + depth -= 1 + resp = httpx.get(url) + children = parse_links(resp.text) + for child in children: + append_node(tree, id=child, parent_id=url) + build_tree(tree, child, depth) - Returns: - tree (ete3.Tree): Built tree. - """ - debug(f"building tree for {url} at {depth} depth") - n = get_node(url, depth) - t = Tree() - build_tree_recursive(t, n) - self._tree = t - debug("tree built successfully") - def save(self, file_name: str): - """ - Saves LinkTree to file with given file_name - Current file types supported are .txt - """ - print(f"saving link tree as {file_name}") - data_directory = get_data_directory() - file_path = os.path.join(data_directory, file_name) - try: - self._tree.save2file(file_path) - except Exception as e: - print(f"failed to save link tree to {file_path}") - debug(e) - raise e +def save(tree: Tree, file_name: str) -> None: + """ + Saves the tree to the current working directory under the given file name. 
+ """ + tree.save2file(os.path.join(os.getcwd(), file_name)) - print(f"file saved successfully to {file_path}") - def show(self): - """ - Displays image of LinkTree - """ - self._tree.show() +def show(tree: Tree) -> None: + """ + Prints the tree + """ + tree.show() From 760b961184669f4d72d9f3a805c9fe0a475c516c Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 11:29:46 -0400 Subject: [PATCH 06/25] Remove gotor submodule --- gotor | 1 - 1 file changed, 1 deletion(-) delete mode 160000 gotor diff --git a/gotor b/gotor deleted file mode 160000 index 544df786..00000000 --- a/gotor +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 544df786eda19f618085d13d37e5f179aac04bf7 From 356c96c1f5b4a628f1b721a279e0f2a2cfd2a0f9 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 11:30:28 -0400 Subject: [PATCH 07/25] Remove gotor from .gitmodules --- .gitmodules | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .gitmodules diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 68cdb9d9..00000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "gotor"] - path = gotor - url = https://github.com/KingAkeem/gotor From 8dd2b81c1480140c7e6ade2ec157dc59a3316217 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Sun, 8 Oct 2023 15:45:35 -0400 Subject: [PATCH 08/25] Update README, scripts and dependency managers to reflect gotor changes --- README.md | 46 +--------------------------------------------- poetry.lock | 10 +++++----- requirements.txt | 12 +++++++++--- scripts/install.sh | 15 +-------------- 4 files changed, 16 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 8a07699b..7da8766f 100755 --- a/README.md +++ b/README.md @@ -40,42 +40,15 @@ ### Dependencies - Tor -- Python ^3.9 -- Golang 1.19 +- Python ^3.11 - Poetry ### Python Dependencies (see requirements.txt for more details) -### Golang Dependencies -- https://github.com/KingAkeem/gotor (This service needs to be ran in tandem with TorBot) - ## Installation -### Gotor -gotor is needed to run this module. -Note: If the `gotor` directory is empty, you may need to run `git submodule update --init --recursive` to initialize the submodule. - -#### Using local Tor service -* Run the tor service: -```sh -sudo service tor start -``` -* Make sure that your torrc is configured to SOCKS_PORT localhost:9050 - -* Open a new terminal and start `gotor`, this can be done using `docker` or `go` -- using go: -```sh -cd gotor && go run cmd/main/main.go -server -``` - -#### Using tor and gotor docker containers -- using docker (multi-stage image, builds tor and gotor container): -```sh -cd gotor && ./build.sh -``` - ### TorBot * TorBot dependencies are managed using `poetry`, you can find the installation commands below: ```sh @@ -84,23 +57,6 @@ poetry run python run.py -u https://www.example.com --depth 2 -v # example of ru poetry run python run.py -h # for help ``` -### Full Installation -There is a shell script that will attempt to install both `torbot` and `gotor` as global modules. -The script `install.sh` will first install the latest version of `torbot` found in `PyPI`, -then it will attempt to install `gotor` to the `GOBIN` path after making the path globally accessible. -```sh -source install.sh # execute script -``` - -You can now run -```sh -gotor -server -``` -and crawl using -```sh -python -m torbot -u https://www.example.com -``` - ### Options
 usage: Gather and analyze data from Tor sites.
diff --git a/poetry.lock b/poetry.lock
index 782962e8..7d99b61f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -504,19 +504,19 @@ test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeo
 
 [[package]]
 name = "setuptools"
-version = "68.1.2"
+version = "68.2.2"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-68.1.2-py3-none-any.whl", hash = "sha256:3d8083eed2d13afc9426f227b24fd1659489ec107c0e86cec2ffdde5c92e790b"},
-    {file = "setuptools-68.1.2.tar.gz", hash = "sha256:3d4dfa6d95f1b101d695a6160a7626e15583af71a5f52176efa5d39a054d475d"},
+    {file = "setuptools-68.2.2-py3-none-any.whl", hash = "sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a"},
+    {file = "setuptools-68.2.2.tar.gz", hash = "sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87"},
 ]
 
 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5,<=7.1.2)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
+docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
 testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
-testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
+testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
 name = "six"
diff --git a/requirements.txt b/requirements.txt
index 19e8fc25..3f361cde 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,13 @@
 altgraph==0.17.2 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+anyio==4.0.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 beautifulsoup4==4.11.1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 certifi==2023.7.22 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 charset-normalizer==2.0.12 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 decorator==5.1.1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+exceptiongroup==1.1.3 ; python_version >= "3.9" and python_version < "3.11"
+h11==0.14.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+httpcore==0.18.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+httpx==0.25.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 idna==3.3 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 igraph==0.10.6 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 joblib==1.2.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
@@ -15,20 +20,21 @@ pyinstaller==5.13.0 ; python_version >= "3.9" and python_full_version <= "3.11.4
 pysocks==1.7.1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 python-dotenv==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 pywin32-ctypes==0.2.2 ; python_version >= "3.9" and python_full_version <= "3.11.4" and sys_platform == "win32"
-requests-mock==1.9.3 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 requests==2.31.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 scikit-learn==1.3.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 scipy==1.10.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
-setuptools==68.1.2 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+setuptools==68.2.2 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 six==1.16.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 sklearn==0.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+sniffio==1.3.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 soupsieve==2.3.2.post1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+tabulate==0.9.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 termcolor==1.1.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 texttable==1.6.4 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 threadpoolctl==3.1.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 threadsafe==1.0.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 treelib==1.7.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 unipath==1.1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
-urllib3==1.26.9 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+urllib3==1.26.17 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 validators==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 yattag==1.14.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
diff --git a/scripts/install.sh b/scripts/install.sh
index d1283593..1c7d8f13 100755
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -5,17 +5,4 @@ echo
 python -m pip install torbot
 echo
 echo "TorBot installed. Run with 'python -m torbot'"
-echo
-
-echo "Setting GOPATH to access executable"
-export PATH=${PATH}:`go env GOPATH`/bin
-echo "New Path ${PATH}"
-echo
-
-echo "Installing gotor"
-echo
-cd gotor/cmd/main
-go install gotor.go
-echo "Gotor installed. Run with 'gotor'."
-
-cd ../../..
+echo
\ No newline at end of file

From 0ac5c326aa6b92877ead91d3dd7484485dfb83eb Mon Sep 17 00:00:00 2001
From: Akeem King 
Date: Sun, 8 Oct 2023 18:48:14 -0400
Subject: [PATCH 09/25] Test support for socks5 proxy using default values

---
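Note on the approach: requests are now routed through a local SOCKS5 proxy
instead of the old gotor HTTP service. A minimal sketch of the pattern this
patch adopts, assuming a Tor daemon on the default 127.0.0.1:9050 and httpx
installed with the "socks" extra (which the pyproject.toml change below
enables):

```python
# Sketch: route an httpx request through a local Tor SOCKS5 proxy.
# Assumes Tor listens on the default port and httpx[socks] is installed;
# httpx 0.25 accepts a proxy URL string via the `proxies` argument.
import httpx

resp = httpx.get(
    "https://check.torproject.org/",
    proxies="socks5://127.0.0.1:9050",  # same default hard-coded below
)
print(resp.status_code)
```

The proxy address is hard-coded in this patch rather than read from the
existing .env configuration.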
 poetry.lock                | 14 +++++++++++++-
 pyproject.toml             |  2 +-
 torbot/modules/api.py      |  2 +-
 torbot/modules/linktree.py |  6 +++---
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 7d99b61f..c573ccb6 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -148,6 +148,7 @@ certifi = "*"
 httpcore = ">=0.18.0,<0.19.0"
 idna = "*"
 sniffio = "*"
+socksio = {version = "==1.*", optional = true, markers = "extra == \"socks\""}
 
 [package.extras]
 brotli = ["brotli", "brotlicffi"]
@@ -553,6 +554,17 @@ files = [
     {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
 ]
 
+[[package]]
+name = "socksio"
+version = "1.0.0"
+description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"},
+    {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"},
+]
+
 [[package]]
 name = "soupsieve"
 version = "2.3.2.post1"
@@ -691,4 +703,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<=3.11.4"
-content-hash = "ebad665d65bb7d8a6b22362b2ada5cca42961b41f25ff95dbd6b25a65ab803f1"
+content-hash = "bc665d85d8bb2537f084f64260e0b84212b7917a530ff79d8c8c9dd896c015d5"
diff --git a/pyproject.toml b/pyproject.toml
index bf865fb8..4f5e71b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,7 +37,7 @@ yattag = "1.14.0"
 treelib = "^1.6.1"
 numpy = "1.24.4"
 unipath = "^1.1"
-httpx = "^0.25.0"
+httpx = {extras = ["socks"], version = "^0.25.0"}
 requests = "^2.31.0"
 tabulate = "^0.9.0"
 
diff --git a/torbot/modules/api.py b/torbot/modules/api.py
index 56e89181..fcba6d64 100644
--- a/torbot/modules/api.py
+++ b/torbot/modules/api.py
@@ -30,7 +30,7 @@ def get_ip() -> dict:
     """
     Returns the IP address of the current Tor client the service is using.
     """
-    resp = httpx.get("https://check.torproject.org/")
+    resp = httpx.get("https://check.torproject.org/", proxies='socks5://127.0.0.1:9050')
     soup = BeautifulSoup(resp.text, features='html.parser')
 
     # Get the content of check tor project, this contains the header and body
diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index eb1c9327..c41052bf 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -2,7 +2,6 @@
 Module is used for analyzing link relationships
 """
 import os
-import re
 import httpx
 import validators
 import logging
@@ -12,6 +11,7 @@
 
 from .nlp.main import classify
 
+
 class Link(Node):
     def __init__(self, title: str, url: str, status: int, classification: str, accuracy: float):
         self.identifier = url
@@ -35,7 +35,7 @@ def append_node(tree: Tree, id: str, parent_id: str | None) -> None:
     Creates a node for a tree using the given ID which corresponds to a URL.
     If the parent_id is None, this will be considered a root node.
     """
-    resp = httpx.get(id)
+    resp = httpx.get(id, proxies='socks5://127.0.0.1:9050')
     soup = BeautifulSoup(resp.text, 'html.parser')
     title = soup.title.text.strip() if soup.title is not None else id
     try:
@@ -52,7 +52,7 @@ def build_tree(tree: Tree, url: str, depth: int) -> None:
     """
     if depth > 0:
         depth -= 1
-        resp = httpx.get(url)
+        resp = httpx.get(url, proxies='socks5://127.0.0.1:9050')
         children = parse_links(resp.text) 
         for child in children:
             append_node(tree, id=child, parent_id=url)

From 461d67ea935cba683fa251b42f5f155a4113d0c5 Mon Sep 17 00:00:00 2001
From: Akeem King 
Date: Sun, 8 Oct 2023 21:19:47 -0400
Subject: [PATCH 10/25] flake8 fixes

---
 torbot/__init__.py         | 12 ------------
 torbot/__main__.py         | 10 ----------
 torbot/main.py             |  8 ++++----
 torbot/modules/api.py      |  3 +--
 torbot/modules/link_io.py  |  4 ++--
 torbot/modules/linktree.py |  2 +-
 torbot/version.py          |  1 -
 7 files changed, 8 insertions(+), 32 deletions(-)
 delete mode 100644 torbot/__init__.py
 delete mode 100644 torbot/__main__.py
 delete mode 100644 torbot/version.py

diff --git a/torbot/__init__.py b/torbot/__init__.py
deleted file mode 100644
index df6ddfaa..00000000
--- a/torbot/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-Torbot API.
-"""
-from .modules import link_io
-# from .modules.linktree import LinkTree
-from .modules.color import color
-from .modules.updater import check_version
-from .modules.savefile import saveJson
-from .modules.info import execute_all
-from .modules.collect_data import collect_data
-
-from . import version
diff --git a/torbot/__main__.py b/torbot/__main__.py
deleted file mode 100644
index 8414b399..00000000
--- a/torbot/__main__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env python3
-from torbot import main
-
-if __name__ == '__main__':
-    try:
-        args = main.get_args()
-        torbot = main.TorBot(args)
-        torbot.perform_action()
-    except KeyboardInterrupt:
-        print("Interrupt received! Exiting cleanly...")
diff --git a/torbot/main.py b/torbot/main.py
index a4b1552e..3ed43a96 100644
--- a/torbot/main.py
+++ b/torbot/main.py
@@ -15,7 +15,7 @@
 from .modules.collect_data import collect_data
 from .modules.nlp import main
 
-from . import version
+VERSION = '3.1.2'
 
 
 # TorBot CLI class
@@ -23,7 +23,7 @@ class TorBot:
 
     def __init__(self, args):
         self.args = args
-        self.__version__ = version
+        self.__version__ = VERSION
 
     def get_header(self):
         license_msg = color("LICENSE: GNU Public License v3", "red")
@@ -33,7 +33,7 @@ def get_header(self):
                             / __/ / / / /_/ / __ \/ __ \/ /
                            / /_/ /_/ / _, _/ /_/ / /_/ / /
                            \__/\____/_/ |_/_____/\____/_/  V{VERSION}
-                """.format(VERSION=version.__version__)
+                """.format(VERSION=self.__version__)
         banner = color(banner, "red")
 
         title = r"""
@@ -110,7 +110,7 @@ def perform_action(self):
 
         print_tor_ip_address()
 
-        tree = get_node(args.url, args.depth) 
+        tree = get_node(args.url, args.depth)
 
         if args.classify:
             result = main.classify(args.url)
diff --git a/torbot/modules/api.py b/torbot/modules/api.py
index fcba6d64..37774987 100644
--- a/torbot/modules/api.py
+++ b/torbot/modules/api.py
@@ -16,6 +16,7 @@
 
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
+
 def get_node(url: str, depth: int):
     """
     Returns the LinkTree for the given link at the specified depth.
@@ -85,7 +86,5 @@ def get_web_content(link: str):
     """
     endpoint = f'/content?link={link}'
     url = base_url + endpoint
-    debug(f'requesting {url}')
     resp = httpx.get(url)
-    debug(f'retrieved {resp.text}')
     return resp.text
diff --git a/torbot/modules/link_io.py b/torbot/modules/link_io.py
index fa0ba921..e7943084 100644
--- a/torbot/modules/link_io.py
+++ b/torbot/modules/link_io.py
@@ -48,9 +48,9 @@ def insert(node, color_code):
             insert(node, 'yellow')
         else:
             insert(node, 'red')
-        
+
     headers = ["Title", "URL", "Status", "Category"]
-    table = tabulate.tabulate(table_data, headers=headers) 
+    table = tabulate.tabulate(table_data, headers=headers)
     print(table)
 
 
diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index c41052bf..6dc95ef7 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -53,7 +53,7 @@ def build_tree(tree: Tree, url: str, depth: int) -> None:
     if depth > 0:
         depth -= 1
         resp = httpx.get(url, proxies='socks5://127.0.0.1:9050')
-        children = parse_links(resp.text) 
+        children = parse_links(resp.text)
         for child in children:
             append_node(tree, id=child, parent_id=url)
             build_tree(tree, child, depth)
diff --git a/torbot/version.py b/torbot/version.py
deleted file mode 100644
index f71b21a5..00000000
--- a/torbot/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = '3.1.2'

From 1fe7a7b0fc9b3d304705eb83636048cce5690000 Mon Sep 17 00:00:00 2001
From: Akeem King 
Date: Sun, 8 Oct 2023 23:28:55 -0400
Subject: [PATCH 11/25] Restructure CLI into run(), rename link_io to io, and drop unused modules

---
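Overview: the TorBot class is replaced by a flat run() function, link_io.py
becomes io.py, and crawling moves to the treelib-backed LinkTree class. A
rough sketch of the resulting flow, using the names from the diff below (the
URL is a placeholder):

```python
# Sketch of the crawl flow after this patch: build a LinkTree for a URL at
# a given depth, then print it as a table, mirroring run() in torbot/main.py.
from modules.io import pprint_tree
from modules.linktree import LinkTree

tree = LinkTree(url="https://www.example.com", depth=1)  # placeholder URL
tree.load()        # crawl and populate the tree
pprint_tree(tree)  # tabulated Title/URL/Status/Category output
```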
 poetry.lock                             |  13 +-
 pyproject.toml                          |   1 +
 requirements.txt                        |   4 +-
 run.py                                  |  10 -
 torbot/main.py                          | 241 +++++++++---------------
 torbot/modules/api.py                   |  49 +----
 torbot/modules/collect_data.py          |  76 --------
 torbot/modules/config.py                |  22 +--
 torbot/modules/info.py                  |  24 +--
 torbot/modules/{link_io.py => io.py}    |  37 +---
 torbot/modules/linktree.py              |  94 ++++-----
 torbot/modules/log.py                   |  19 --
 torbot/modules/tests/test_pagereader.py |   2 +-
 13 files changed, 188 insertions(+), 404 deletions(-)
 delete mode 100644 run.py
 delete mode 100644 torbot/modules/collect_data.py
 rename torbot/modules/{link_io.py => io.py} (68%)
 delete mode 100644 torbot/modules/log.py

diff --git a/poetry.lock b/poetry.lock
index c573ccb6..e2efb3f5 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -311,6 +311,17 @@ files = [
     {file = "pefile-2023.2.7.tar.gz", hash = "sha256:82e6114004b3d6911c77c3953e3838654b04511b8b66e8583db70c65998017dc"},
 ]
 
+[[package]]
+name = "phonenumbers"
+version = "8.13.22"
+description = "Python version of Google's common library for parsing, formatting, storing and validating international phone numbers."
+optional = false
+python-versions = "*"
+files = [
+    {file = "phonenumbers-8.13.22-py2.py3-none-any.whl", hash = "sha256:85ceeba9e67984ba98182c77e8e4c70093d38c0c6a0cb2bd392e0694ddaeb1f6"},
+    {file = "phonenumbers-8.13.22.tar.gz", hash = "sha256:001664c90f59b8954766c2db85adafc8dbc96177efeb49607ca4e64a7acaf569"},
+]
+
 [[package]]
 name = "progress"
 version = "1.6"
@@ -703,4 +714,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<=3.11.4"
-content-hash = "bc665d85d8bb2537f084f64260e0b84212b7917a530ff79d8c8c9dd896c015d5"
+content-hash = "7b3ae36389472ec97dd5aacc437381b5c7f13f3d08e4ab738ef699b46c85a17a"
diff --git a/pyproject.toml b/pyproject.toml
index 4f5e71b7..daf30849 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ unipath = "^1.1"
 httpx = {extras = ["socks"], version = "^0.25.0"}
 requests = "^2.31.0"
 tabulate = "^0.9.0"
+phonenumbers = "^8.13.22"
 
 [tool.poetry.dev-dependencies]
 
diff --git a/requirements.txt b/requirements.txt
index 3f361cde..31442efa 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,13 +7,14 @@ decorator==5.1.1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 exceptiongroup==1.1.3 ; python_version >= "3.9" and python_version < "3.11"
 h11==0.14.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 httpcore==0.18.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
-httpx==0.25.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+httpx[socks]==0.25.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 idna==3.3 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 igraph==0.10.6 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 joblib==1.2.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 macholib==1.16 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 numpy==1.24.4 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 pefile==2023.2.7 ; python_version >= "3.9" and python_full_version <= "3.11.4" and sys_platform == "win32"
+phonenumbers==8.13.22 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 progress==1.6 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 pyinstaller-hooks-contrib==2022.7 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 pyinstaller==5.13.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
@@ -27,6 +28,7 @@ setuptools==68.2.2 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 six==1.16.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 sklearn==0.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 sniffio==1.3.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
+socksio==1.0.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 soupsieve==2.3.2.post1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 tabulate==0.9.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 termcolor==1.1.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
diff --git a/run.py b/run.py
deleted file mode 100644
index 8414b399..00000000
--- a/run.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env python3
-from torbot import main
-
-if __name__ == '__main__':
-    try:
-        args = main.get_args()
-        torbot = main.TorBot(args)
-        torbot.perform_action()
-    except KeyboardInterrupt:
-        print("Interrupt received! Exiting cleanly...")
diff --git a/torbot/main.py b/torbot/main.py
index 3ed43a96..ba4d4af1 100644
--- a/torbot/main.py
+++ b/torbot/main.py
@@ -3,170 +3,117 @@
 """
 import argparse
 import sys
+import logging
 
-from .modules import link_io
-
-from .modules.link_io import pprint_tree, print_tor_ip_address
-from .modules.api import get_node
-from .modules.color import color
-from .modules.updater import check_version
-from .modules.savefile import saveJson
-from .modules.info import execute_all
-from .modules.collect_data import collect_data
-from .modules.nlp import main
+from modules.io import pprint_tree, print_tor_ip_address
+from modules.color import color
+from modules.updater import check_version
+from modules.info import execute_all
+from modules.linktree import LinkTree
 
 VERSION = '3.1.2'
 
 
-# TorBot CLI class
-class TorBot:
-
-    def __init__(self, args):
-        self.args = args
-        self.__version__ = VERSION
-
-    def get_header(self):
-        license_msg = color("LICENSE: GNU Public License v3", "red")
-        banner = r"""
-                              __  ____  ____  __        ______
-                             / /_/ __ \/ __ \/ /_  ____/_  __/
-                            / __/ / / / /_/ / __ \/ __ \/ /
-                           / /_/ /_/ / _, _/ /_/ / /_/ / /
-                           \__/\____/_/ |_/_____/\____/_/  V{VERSION}
-                """.format(VERSION=self.__version__)
-        banner = color(banner, "red")
-
-        title = r"""
-                                        {banner}
-                        #######################################################
-                        #  TorBot - Dark Web OSINT Tool                       #
-                        #  GitHub : https://github.com/DedsecInside/TorBot    #
-                        #  Help : use -h for help text                        #
-                        #######################################################
-                                    {license_msg}
-                """
-
-        title = title.format(license_msg=license_msg, banner=banner)
-        print(title)
-
-    def handle_json_args(self, args):
-        """
-        Outputs JSON file for data
-        """
-
-        # -m/--mail
-        if args.mail:
-            email_json = link_io.print_emails(args.url)
-            if args.save:
-                saveJson('Emails', email_json)
-        # -p/--phone
-        if args.phone:
-            phone_json = link_io.print_phones(args.url)
-            if args.save:
-                saveJson('Phones', phone_json)
-        # -s/--save
-        else:
-            node_json = link_io.print_json(args.url, args.depth)
-            saveJson("Links", node_json)
-
-    def handle_tree_args(self, args):
-        """
-        Outputs tree visual for data
-        """
-        '''
-        # -v/--visualize
-        if args.visualize:
-            tree.show()
-
-        # -d/--download
-        if args.download:
-            file_name = str(input("File Name (.txt): "))
-            tree.save(file_name)
-            '''
-
-    def perform_action(self):
-        args = self.args
-
-        # If url flag is set then check for accompanying flag set. Only one
-        # additional flag can be set with -u/--url flag
-        if not args.url:
-            print("usage: See run.py -h for possible arguments.")
-            sys.exit()
-
-        if args.gather:
-            collect_data(args.url)
-            sys.exit()
-
-        # If flag is -v, --update, -q/--quiet then user only runs that operation
-        # because these are single flags only
-        if args.version:
-            print(f"TorBot Version: {self.__version__}")
-            sys.exit()
-        if args.update:
-            check_version()
-            sys.exit()
-        if not args.quiet:
-            self.get_header()
-
+def print_header() -> None:
+    """
+    Prints the TorBot banner including version and license.
+    """
+    license_msg = color("LICENSE: GNU Public License v3", "red")
+    banner = r"""
+                             __  ____  ____  __        ______
+                            / /_/ __ \/ __ \/ /_  ____/_  __/
+                           / __/ / / / /_/ / __ \/ __ \/ /
+                          / /_/ /_/ / _, _/ /_/ / /_/ / /
+                          \__/\____/_/ |_/_____/\____/_/  V{VERSION}
+            """.format(VERSION=VERSION)
+    banner = color(banner, "red")
+
+    title = r"""
+                                    {banner}
+                    #######################################################
+                    #  TorBot - Dark Web OSINT Tool                       #
+                    #  GitHub : https://github.com/DedsecInside/TorBot    #
+                    #  Help : use -h for help text                        #
+                    #######################################################
+                                {license_msg}
+            """
+
+    title = title.format(license_msg=license_msg, banner=banner)
+    print(title)
+
+
+def run(arg_parser: argparse.ArgumentParser) -> None:
+    args = arg_parser.parse_args()
+
+    # setup logging
+    date_fmt = '%d-%b-%y %H:%M:%S'
+    logging_fmt = '%(asctime)s - %(levelname)s - %(message)s'
+    logging_lvl = logging.DEBUG if args.v else logging.INFO
+    logging.basicConfig(level=logging_lvl, format=logging_fmt, datefmt=date_fmt)
+
+    # URL is a required argument
+    if not args.url:
+        arg_parser.print_help()
+        sys.exit()
+
+    # Print version then exit
+    if args.version:
+        print(f"TorBot Version: {VERSION}")
+        sys.exit()
+
+    # check version and update if necessary
+    if args.update:
+        check_version()
+        sys.exit()
+
+    # print header and IP address if not set to quiet
+    if not args.quiet:
+        print_header()
         print_tor_ip_address()
 
-        tree = get_node(args.url, args.depth)
-
-        if args.classify:
-            result = main.classify(args.url)
-            print("Website Classification: " + result[0], "| Accuracy: " + str(result[1]))
-        if args.visualize or args.download:
-            self.handle_tree_args(args)
-            # raise NotImplementedError("Tree visualization and download is not available yet.")
-        elif args.save or args.mail or args.phone:
-            self.handle_json_args(args)
-        # -i/--info
-        elif args.info:
-            execute_all(args.url)
-        else:
-            if args.url:
-                pprint_tree(tree)
-        print("\n\n")
-
-
-def get_args():
+    if args.info:
+        execute_all(args.url)
+
+    tree = LinkTree(url=args.url, depth=args.depth)
+    tree.load()
+    # save tree and continue
+    if args.save:
+        tree.save()
+
+    if args.visualize:
+        tree.show()
+
+    pprint_tree(tree)
+    '''
+    elif args.save or args.mail or args.phone:
+        self.handle_json_args(args)
+    '''
+    print("\n\n")
+
+
+def set_arguments() -> argparse.ArgumentParser:
     """
     Parses user flags passed to TorBot
     """
     parser = argparse.ArgumentParser(prog="TorBot", usage="Gather and analyze data from Tor sites.")
-    parser.add_argument("--version", action="store_true", help="Show current version of TorBot.")
-    parser.add_argument("--update", action="store_true", help="Update TorBot to the latest stable version")
+    parser.add_argument("-u", "--url", type=str, required=True, help="Specifiy a website link to crawl")
+    parser.add_argument("--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1)
     parser.add_argument("-q", "--quiet", action="store_true")
-    parser.add_argument("-u", "--url", help="Specifiy a website link to crawl")
-    parser.add_argument("-s", "--save", action="store_true", help="Save results in a file")
     parser.add_argument("-m", "--mail", action="store_true", help="Get e-mail addresses from the crawled sites")
     parser.add_argument("-p", "--phone", action="store_true", help="Get phone numbers from the crawled sites")
-    parser.add_argument("--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1)
-    parser.add_argument("--gather", action="store_true", help="Gather data for analysis")
-    parser.add_argument("-v", "--visualize", action="store_true", help="Visualizes tree of data gathered.")
-    parser.add_argument("-d", "--download", action="store_true", help="Downloads tree of data gathered.")
-    parser.add_argument(
-        "-e",
-        "--extension",
-        action='append',
-        dest='extension',
-        default=[],
-        help=' '.join(("Specifiy additional website", "extensions to the list(.com , .org, .etc)"))
-    )
-    parser.add_argument("-c", "--classify", action="store_true", help="Classify the webpage using NLP module")
-    parser.add_argument(
-        "-cAll", "--classifyAll", action="store_true", help="Classify all the obtained webpages using NLP module"
-    )
-    parser.add_argument(
-        "-i", "--info", action="store_true", help=' '.join(("Info displays basic info of the scanned site"))
-    )
-    return parser.parse_args()
+    parser.add_argument("--version", action="store_true", help="Show current version of TorBot.")
+    parser.add_argument("--update", action="store_true", help="Update TorBot to the latest stable version")
+    parser.add_argument("--save", action="store_true", help="Save results in a file")
+    parser.add_argument("--info", action="store_true", help="Info displays basic info of the scanned site. Only supports a single URL at a time.")
+    parser.add_argument("--visualize", action="store_true", help="Visualizes tree of data gathered.")
+    parser.add_argument("-v", action="store_true", help="verbose logging")
+
+    return parser
 
 
 if __name__ == '__main__':
     try:
-        args = get_args()
-        torbot = TorBot(args)
-        torbot.perform_action()
+        arg_parser = set_arguments()
+        run(arg_parser)
     except KeyboardInterrupt:
         print("Interrupt received! Exiting cleanly...")
diff --git a/torbot/modules/api.py b/torbot/modules/api.py
index 37774987..18cbf465 100644
--- a/torbot/modules/api.py
+++ b/torbot/modules/api.py
@@ -6,33 +6,18 @@
 import httpx
 import logging
 
-from treelib import Tree
 from bs4 import BeautifulSoup, Tag
 
-from .config import host, port
-from .linktree import append_node, build_tree
-
-base_url: str = f'http://{host}:{port}'
 
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
 
-def get_node(url: str, depth: int):
-    """
-    Returns the LinkTree for the given link at the specified depth.
-    """
-    tree = Tree()
-    append_node(tree, id=url, parent_id=None)
-    build_tree(tree, url, depth)
-    return tree
-
-
 def get_ip() -> dict:
     """
     Returns the IP address of the current Tor client the service is using.
     """
     resp = httpx.get("https://check.torproject.org/", proxies='socks5://127.0.0.1:9050')
-    soup = BeautifulSoup(resp.text, features='html.parser')
+    soup = BeautifulSoup(resp.text, 'html.parser')
 
     # Get the content of check tor project, this contains the header and body
     content = soup.find("div", {"class": "content"})
@@ -56,35 +41,3 @@ def get_ip() -> dict:
     body = body_tag.get_text().strip()
 
     return {"header": header, "body": body}
-
-
-def get_emails(link: str):
-    """
-    Returns the mailto links found on the page.
-    """
-    endpoint = f'/emails?link={link}'
-    url = base_url + endpoint
-    resp = httpx.get(url)
-    data = resp.json()
-    return data
-
-
-def get_phone(link: str):
-    """
-    Returns the tel links found on the page.
-    """
-    endpoint = f'/phone?link={link}'
-    url = base_url + endpoint
-    resp = httpx.get(url)
-    data = resp.json()
-    return data
-
-
-def get_web_content(link: str):
-    """
-    Returns the HTML content of the page.
-    """
-    endpoint = f'/content?link={link}'
-    url = base_url + endpoint
-    resp = httpx.get(url)
-    return resp.text
diff --git a/torbot/modules/collect_data.py b/torbot/modules/collect_data.py
deleted file mode 100644
index 5f64736d..00000000
--- a/torbot/modules/collect_data.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""
-This module is used to gather data for analysis using thehiddenwiki.org.
-"""
-import datetime
-import uuid
-import os
-import requests
-
-from bs4 import BeautifulSoup
-from progress.bar import Bar
-from threadsafe.safe_csv import SafeDictWriter
-
-from .config import get_data_directory
-from .validators import validate_link
-from .log import debug
-
-
-def parse_links(html: str) -> list[str]:
-    """
-    Finds all anchor tags and parses the href attribute.
-    """
-    soup = BeautifulSoup(html, 'html.parser')
-    tags = soup.find_all('a')
-    return [tag['href'] for tag in tags if validate_link(tag['href'])]
-
-
-def parse_meta_tags(soup: BeautifulSoup) -> list[object]:
-    """
-    Parses all meta tags.
-    """
-    meta_tags = soup.find_all('meta')
-    content_list = list()
-    for tag in meta_tags:
-        content_list.append(tag.attrs)
-    return content_list
-
-
-def get_links(url: str) -> list[str]:
-    """
-    Returns all valid links found on the URL.
-    """
-    resp = requests.get(url)
-    links = parse_links(resp.text)
-    return links
-
-
-def collect_data(url: str = 'https://thehiddenwiki.org'):
-    print(f"Gathering data for {url}")
-    links = get_links(url)
-    current_time = datetime.datetime.now().isoformat()
-    file_name = f'torbot_{current_time}.csv'
-    data_directory = get_data_directory()
-    local_file_path = os.path.join(data_directory, file_name)
-    with open(local_file_path, 'w+') as outcsv:
-        fieldnames = ['ID', 'Title', 'Metadata', 'Content']
-        writer = SafeDictWriter(outcsv, fieldnames=fieldnames)
-        bar = Bar('Processing...', max=len(links))
-        for link in links:
-            try:
-                resp = requests.get(link)
-                soup = BeautifulSoup(resp.text, 'html.parser')
-                meta_tags = parse_meta_tags(soup)
-                entry = {
-                    "ID": uuid.uuid4(),
-                    "Title": soup.title.string if soup.title else "",
-                    "Metadata": meta_tags,
-                    "Content": soup.find('body')
-                }
-                writer.writerow(entry)
-            except requests.exceptions.RequestException as e:
-                print(f"Failed to connect to [{link}].")
-                debug(e)
-            bar.next()
-    bar.finish()
-
-    print(f'Data has been saved to {local_file_path}.')
diff --git a/torbot/modules/config.py b/torbot/modules/config.py
index 86b6ba12..05e43564 100644
--- a/torbot/modules/config.py
+++ b/torbot/modules/config.py
@@ -5,28 +5,20 @@
 from inspect import getsourcefile
 from unipath import Path
 
+source_file = getsourcefile(lambda: 0)
+config_file_path = None
+if isinstance(source_file, str):
+    config_file_path = (os.path.abspath(source_file))
+
+if not config_file_path:
+    raise Exception('Unable to load environment.')
 
-config_file_path = (os.path.abspath(getsourcefile(lambda: 0)))
 modules_directory = Path(config_file_path).parent
 torbot_directory = modules_directory.parent
 project_root_directory = torbot_directory.parent
 dotenv_path = os.path.join(project_root_directory, '.env')
 load_dotenv(dotenv_path=dotenv_path, verbose=True)
 
-port = os.getenv("PORT")
-host = os.getenv("HOST")
-
-
-def get_log_level() -> int:
-    log_level_str = os.getenv('LOG_LEVEL')
-    if log_level_str:
-        log_level_str = log_level_str.lower()
-        mapping = logging.getLevelNamesMapping()
-        if log_level_str in mapping:
-            return mapping[log_level_str]
-    return logging.INFO
-
-
 def get_data_directory():
     data_directory = os.getenv('TORBOT_DATA_DIR')
     # if a path is not set, write data to the config directory
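For reference, the path-resolution trick above can be exercised on its own. A small sketch with illustrative names (not part of the module), assuming unipath is installed:

```python
import os
from inspect import getsourcefile

from unipath import Path

# getsourcefile(lambda: 0) returns the path of the file the lambda was defined in,
# which is how config.py locates itself without relying on __file__.
source_file = getsourcefile(lambda: 0)
if not isinstance(source_file, str):
    raise Exception('Unable to load environment.')

here = Path(os.path.abspath(source_file))
print(here.parent)         # the modules directory, in config.py's terms
print(here.parent.parent)  # one level up, how torbot_directory is derived
```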
diff --git a/torbot/modules/info.py b/torbot/modules/info.py
index 2ae1ec65..6a849921 100644
--- a/torbot/modules/info.py
+++ b/torbot/modules/info.py
@@ -3,12 +3,12 @@
 and saving data to file.
 """
 import re
+import httpx
 
 from urllib.parse import urlsplit
 from bs4 import BeautifulSoup
 from termcolor import cprint
 from requests.exceptions import HTTPError
-from .api import get_web_content
 
 
 keys = set()  # high entropy strings, prolly secret keys
@@ -42,14 +42,14 @@ def execute_all(link, *, display_status=False):
             attempts to terminal.
     """
 
-    response = get_web_content(link)
-    soup = BeautifulSoup(response, 'html.parser')
+    resp = httpx.get(link, proxies='socks5://127.0.0.1:9050')
+    soup = BeautifulSoup(resp.text, 'html.parser')
     validation_functions = [
         get_robots_txt, get_dot_git, get_dot_svn, get_dot_git, get_intel, get_dot_htaccess, get_bitcoin_address
     ]
     for validate_func in validation_functions:
         try:
-            validate_func(link, response)
+            validate_func(link, resp)
         except (ConnectionError, HTTPError):
             cprint('Error', 'red')
 
@@ -81,7 +81,7 @@ def get_robots_txt(target, response):
     cprint("[*]Checking for Robots.txt", 'yellow')
     url = target
     target = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
-    get_web_content(target + "robots.txt")
+    httpx.get(target + "robots.txt", proxies='socks5://127.0.0.1:9050')
     print(target + "robots.txt")
     matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
     for match in matches:
@@ -119,8 +119,8 @@ def get_dot_git(target, response):
     cprint("[*]Checking for .git folder", 'yellow')
     url = target
     target = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
-    resp = get_web_content(target + "/.git/config")
-    if not resp.__contains__("404"):
+    resp = httpx.get(target + "/.git/config", proxies='socks5://127.0.0.1:9050')
+    if not resp.text.__contains__("404"):
         cprint("Alert!", 'red')
         cprint(".git folder exposed publicly", 'red')
     else:
@@ -150,8 +150,8 @@ def get_dot_svn(target, response):
     cprint("[*]Checking for .svn folder", 'yellow')
     url = target
     target = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
-    resp = get_web_content(target + "/.svn/entries")
-    if not resp.__contains__("404"):
+    resp = httpx.get(target + "/.svn/entries", proxies='socks5://127.0.0.1:9050')
+    if not resp.text.__contains__("404"):
         cprint("Alert!", 'red')
         cprint(".SVN folder exposed publicly", 'red')
     else:
@@ -168,10 +168,10 @@ def get_dot_htaccess(target, response):
     cprint("[*]Checking for .htaccess", 'yellow')
     url = target
     target = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
-    resp = get_web_content(target + "/.htaccess")
-    if resp.__contains__("403"):
+    resp = httpx.get(target + "/.htaccess", proxies='socks5://127.0.0.1:9050')
+    if resp.text.__contains__("403"):
         cprint("403 Forbidden", 'blue')
-    elif not resp.__contains__("404") or resp.__contains__("500"):
+    elif not resp.text.__contains__("404") or resp.text.__contains__("500"):
         cprint("Alert!!", 'blue')
         cprint(".htaccess file found!", 'blue')
     else:
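All of these checks rebuild the site root the same way before probing well-known paths; a short illustration with a placeholder URL:

```python
from urllib.parse import urlsplit

# urlsplit exposes scheme and netloc, so a deep link collapses to its root.
url = "http://example.onion/some/deep/page"
target = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
print(target + "robots.txt")  # -> http://example.onion/robots.txt
```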
diff --git a/torbot/modules/link_io.py b/torbot/modules/io.py
similarity index 68%
rename from torbot/modules/link_io.py
rename to torbot/modules/io.py
index e7943084..8bab0cf5 100644
--- a/torbot/modules/link_io.py
+++ b/torbot/modules/io.py
@@ -6,9 +6,9 @@
 import tabulate
 
 from pprint import pprint
-from treelib import Tree
+from .linktree import LinkTree
 
-from .api import get_node, get_emails, get_phone, get_ip
+from .api import get_ip
 from .color import color
 
 
@@ -22,7 +22,7 @@ def print_tor_ip_address() -> None:
     print(color(resp["body"], "yellow"))
 
 
-def pprint_tree(tree: Tree) -> None:
+def pprint_tree(tree: LinkTree) -> None:
     """
     Prints the status of a link based on it's connection status
     """
@@ -54,36 +54,13 @@ def insert(node, color_code):
     print(table)
 
 
-def print_json(url: str, depth: int = 1):
+def print_json(url: str, depth: int = 1) -> None:
     """
     Prints the JSON representation of a Link node.
 
     Returns:
         root (dict): Dictionary containing the root node and it's children
     """
-    root = get_node(url, depth)
-    print(root.to_json())
-
-
-def print_emails(url: str):
-    """
-    Prints any emails found within the HTML content of this url.
-
-    Returns:
-        emails (list): list of emails
-    """
-    email_list = get_emails(url)
-    pprint(email_list)
-    return email_list
-
-
-def print_phones(url: str):
-    """
-    Prints any phones found within the HTML content of this url.
-
-    Returns:
-        phones (list): list of phones
-    """
-    phone_list = get_phone(url)
-    pprint(phone_list)
-    return phone_list
+    tree = LinkTree(url=url, depth=depth)
+    tree.load()
+    print(tree.to_json())
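A hedged usage sketch for the rewritten print_json — assuming the repository root is on sys.path so the package imports shown in the diff resolve, and a URL reachable through a running Tor proxy:

```python
from torbot.modules.io import print_json

# Builds a LinkTree for the URL, loads one level of children, prints it as JSON.
print_json("http://example.onion", depth=1)
```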
diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index 6dc95ef7..3983cd42 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -9,17 +9,65 @@
 from treelib import Tree, exceptions, Node
 from bs4 import BeautifulSoup
 
+from .config import project_root_directory
 from .nlp.main import classify
 
 
-class Link(Node):
-    def __init__(self, title: str, url: str, status: int, classification: str, accuracy: float):
+class LinkNode(Node):
+    def __init__(self, title: str, url: str, status: int,
+                 classification: str, accuracy: float):
+        super().__init__()
         self.identifier = url
         self.tag = title
         self.status = status
         self.classification = classification
         self.accuracy = accuracy
 
+class LinkTree(Tree):
+    def __init__(self, url: str, depth: int) -> None:
+        super().__init__()
+        self._url = url
+        self._depth = depth
+    
+    def load(self) -> None:
+        self._append_node(id=self._url, parent_id=None)
+        self._build_tree(url=self._url, depth=self._depth)
+
+    def _append_node(self, id: str, parent_id: str | None) -> None:
+        """
+        Creates a node for a tree using the given ID which corresponds to a URL.
+        If the parent_id is None, this will be considered a root node.
+        """
+        resp = httpx.get(id, proxies='socks5://127.0.0.1:9050')
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        title = soup.title.text.strip() if soup.title is not None else id
+        try:
+            [classification, accuracy] = classify(resp.text)
+            data = LinkNode(title, id, resp.status_code, classification, accuracy)
+            self.create_node(title, identifier=id, parent=parent_id, data=data)
+        except exceptions.DuplicatedNodeIdError:
+            logging.debug(f"found a duplicate URL {id}")
+
+    def _build_tree(self,  url: str, depth: int) -> None:
+        """
+        Builds a tree from the root to the given depth.
+        """
+        if depth > 0:
+            depth -= 1
+            resp = httpx.get(url, proxies='socks5://127.0.0.1:9050')
+            children = parse_links(resp.text)
+            for child in children:
+                self._append_node(id=child, parent_id=url)
+                self._build_tree(url=child, depth=depth)
+
+    def save(self) -> None:
+        """
+        Saves the tree to the current working directory under the given file name.
+        """
+        root_id = self.root
+        root_node = self.get_node(root_id)
+        self.save2file(os.path.join(project_root_directory, root_node.tag))
+
 
 def parse_links(html: str) -> list[str]:
     """
@@ -29,45 +77,3 @@ def parse_links(html: str) -> list[str]:
     tags = soup.find_all('a')
     return [tag['href'] for tag in tags if tag.has_attr('href') and validators.url(tag['href'])]
 
-
-def append_node(tree: Tree, id: str, parent_id: str | None) -> None:
-    """
-    Creates a node for a tree using the given ID which corresponds to a URL.
-    If the parent_id is None, this will be considered a root node.
-    """
-    resp = httpx.get(id, proxies='socks5://127.0.0.1:9050')
-    soup = BeautifulSoup(resp.text, 'html.parser')
-    title = soup.title.text.strip() if soup.title is not None else id
-    try:
-        [classification, accuracy] = classify(resp.text)
-        data = Link(title, id, resp.status_code, classification, accuracy)
-        tree.create_node(title, identifier=id, parent=parent_id, data=data)
-    except exceptions.DuplicatedNodeIdError:
-        logging.debug(f"found a duplicate URL {id}")
-
-
-def build_tree(tree: Tree, url: str, depth: int) -> None:
-    """
-    Builds a tree from the root to the given depth.
-    """
-    if depth > 0:
-        depth -= 1
-        resp = httpx.get(url, proxies='socks5://127.0.0.1:9050')
-        children = parse_links(resp.text)
-        for child in children:
-            append_node(tree, id=child, parent_id=url)
-            build_tree(tree, child, depth)
-
-
-def save(tree: Tree, file_name: str) -> None:
-    """
-    Saves the tree to the current working directory under the given file name.
-    """
-    tree.save2file(os.path.join(os.getcwd(), file_name))
-
-
-def show(tree: Tree) -> None:
-    """
-    Prints the tree
-    """
-    tree.show()
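How the new LinkTree class is meant to be driven — a minimal sketch, assuming a Tor SOCKS5 proxy on 127.0.0.1:9050 (since _append_node fetches every URL through it), package imports that resolve from the repository root, and a placeholder URL:

```python
from torbot.modules.linktree import LinkTree

tree = LinkTree(url="http://example.onion", depth=1)
tree.load()   # fetches the root page, classifies it, then walks its valid child links
tree.show()   # treelib's built-in pretty-printer
tree.save()   # writes the rendered tree under the project root directory
```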
diff --git a/torbot/modules/log.py b/torbot/modules/log.py
deleted file mode 100644
index 9d98b1b5..00000000
--- a/torbot/modules/log.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import logging
-
-from .config import get_log_level
-
-
-logging.basicConfig(level=get_log_level(),
-                    format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
-
-
-def info(msg: str):
-    logging.info(msg)
-
-
-def fatal(msg: str):
-    logging.error(msg)
-
-
-def debug(msg: str):
-    logging.debug(msg)
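With the wrapper gone, callers use the stdlib logging module directly, configured once at startup (the next patch wires the level to the -v flag). A minimal equivalent of what log.py provided:

```python
import logging

# One-time setup, matching the format the deleted wrapper used.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)

logging.info("starting crawl")
logging.debug("only shown when the level is DEBUG")
```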
diff --git a/torbot/modules/tests/test_pagereader.py b/torbot/modules/tests/test_pagereader.py
index 684e8b43..6c7d872e 100644
--- a/torbot/modules/tests/test_pagereader.py
+++ b/torbot/modules/tests/test_pagereader.py
@@ -5,7 +5,7 @@
 import requests_mock
 
 from yattag import Doc
-from ..link_io import LinkIO
+from ..io import LinkIO
 
 
 @pytest.fixture

From 182f51abb5596410e8eb875885d61dc0b9b19096 Mon Sep 17 00:00:00 2001
From: Akeem King 
Date: Mon, 9 Oct 2023 09:36:54 -0400
Subject: [PATCH 12/25] more major changes

---
 .env                                    |   3 +-
 README.md                               |  25 ++---
 torbot/main.py                          |  63 ++++++++----
 torbot/modules/config.py                |   1 -
 torbot/modules/io.py                    |  66 ------------
 torbot/modules/linktree.py              | 131 ++++++++++++++++++++++--
 torbot/modules/savefile.py              |  35 -------
 torbot/modules/tests/test_savetofile.py |  39 -------
 8 files changed, 174 insertions(+), 189 deletions(-)
 delete mode 100644 torbot/modules/io.py
 delete mode 100644 torbot/modules/savefile.py
 delete mode 100644 torbot/modules/tests/test_savetofile.py

diff --git a/.env b/.env
index 45bfeeb9..aa0b014d 100644
--- a/.env
+++ b/.env
@@ -1,4 +1,3 @@
 export TORBOT_DATA_DIR=${PWD}/data
 export HOST='localhost'
-export PORT=8081
-export LOG_LEVEL="info" # OPTIONS - info, debug, fatal
\ No newline at end of file
+export PORT=8081
\ No newline at end of file
diff --git a/README.md b/README.md
index 7da8766f..7ecd43c0 100755
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@
 6. Crawl custom domains
 7. Check if the link is live
 8. Built-in Updater
-9. Build visual tree of link relationship that can be quickly viewed or saved to an image file
+9. Build visual tree of link relationship that can be quickly viewed or saved to a file
 
 ...(will be updated)
 
@@ -53,7 +53,7 @@
 * TorBot dependencies are managed using `poetry`, you can find the installation commands below:
 ```sh
 poetry install # to install dependencies
-poetry run python run.py -u https://www.example.com --depth 2 -v # example of running command with poetry
+poetry run python run.py -u https://www.example.com --depth 2 --visualize tree --save json # example of running command with poetry
 poetry run python run.py -h # for help
 ```
 
@@ -62,22 +62,15 @@ poetry run python run.py -h # for help
 usage: Gather and analyze data from Tor sites.
 
 optional arguments:
-  -h, --help            show this help message and exit
-  --version             Show current version of TorBot.
-  --update              Update TorBot to the latest stable version
-  -q, --quiet
   -u URL, --url URL     Specifiy a website link to crawl
-  -s, --save            Save results in a file
-  -m, --mail            Get e-mail addresses from the crawled sites
-  -p, --phone           Get phone numbers from the crawled sites
   --depth DEPTH         Specifiy max depth of crawler (default 1)
-  --gather              Gather data for analysis
-  -v, --visualize       Visualizes tree of data gathered.
-  -d, --download        Downloads tree of data gathered.
-  -e EXTENSION, --extension EXTENSION
-                        Specifiy additional website extensions to the list(.com , .org, .etc)
-  -c, --classify        Classify the webpage using NLP module
-  -cAll, --classifyAll  Classify all the obtained webpages using NLP module
+  -h, --help            Show this help message and exit
+  -v                    Displays DEBUG level logging, default is INFO
+  --version             Show current version of TorBot.
+  --update              Update TorBot to the latest stable version
+  -q, --quiet           Prevents display of header and IP address
+  --save                Save results in a file. (tree, json)
+  --visualize           Visualizes tree of data gathered. (tree, json, table)
   -i, --info            Info displays basic info of the scanned site 
 * NOTE: -u is a mandatory for crawling
diff --git a/torbot/main.py b/torbot/main.py
index ba4d4af1..f1997d99 100644
--- a/torbot/main.py
+++ b/torbot/main.py
@@ -1,20 +1,31 @@
 """
 Core
 """
+import os
 import argparse
 import sys
 import logging
+import tomllib
 
-from modules.io import pprint_tree, print_tor_ip_address
+from modules.api import get_ip
 from modules.color import color
 from modules.updater import check_version
 from modules.info import execute_all
 from modules.linktree import LinkTree
+from modules.config import project_root_directory
 
-VERSION = '3.1.2'
+def print_tor_ip_address() -> None:
+    """
+    https://check.torproject.org/ tells you if you are using tor and it
+    displays your IP address which we scrape and display
+    """
+    resp = get_ip()
+    print(resp["header"])
+    print(color(resp["body"], "yellow"))
 
-def print_header() -> None:
+
+def print_header(version: str) -> None:
     """
     Prints the TorBot banner including version and license.
     """
@@ -24,8 +35,8 @@ def print_header() -> None:
               / /_/ __ \/ __ \/ /_     ____/_  __/
              / __/ / / / /_/ / __ \/ __ \/ /
             / /_/ /_/ / _, _/ /_/ / /_/ / /
-            \__/\____/_/ |_/_____/\____/_/  V{VERSION}
-            """.format(VERSION=VERSION)
+            \__/\____/_/ |_/_____/\____/_/  v{VERSION}
+            """.format(VERSION=version)
     banner = color(banner, "red")
 
     title = r"""
@@ -42,7 +53,7 @@ def print_header() -> None:
     print(title)
 
 
-def run(arg_parser: argparse.ArgumentParser) -> None:
+def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
     args = arg_parser.parse_args()
 
     # setup logging
@@ -58,7 +69,7 @@ def run(arg_parser: argparse.ArgumentParser) -> None:
 
     # Print verison then exit
     if args.version:
-        print(f"TorBot Version: {VERSION}")
+        print(f"TorBot Version: {version}")
         sys.exit()
 
     # check version and update if necessary
@@ -68,7 +79,7 @@ def run(arg_parser: argparse.ArgumentParser) -> None:
 
     # print header and IP address if not set to quiet
     if not args.quiet:
-        print_header()
+        print_header(version)
         print_tor_ip_address()
 
     if args.info:
@@ -76,18 +87,21 @@ def run(arg_parser: argparse.ArgumentParser) -> None:
 
     tree = LinkTree(url=args.url, depth=args.depth)
     tree.load()
-    # save tree and continue
-    if args.save:
+
+    # save data if desired
+    if args.save == 'tree':
         tree.save()
+    elif args.save == 'json':
+        tree.saveJSON()
 
-    if args.visualize:
+    # always print something, table is the default
+    if args.visualize == 'table' or not args.visualize:
+        tree.showTable()
+    elif args.visualize == 'tree':
         tree.show()
+    elif args.visualize == 'json':
+        tree.showJSON()
 
-    pprint_tree(tree)
-    '''
-    elif args.save or args.mail or args.phone:
-        self.handle_json_args(args)
-    '''
     print("\n\n")
 
 
@@ -98,14 +112,12 @@ def set_arguments() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(prog="TorBot", usage="Gather and analayze data from Tor sites.")
     parser.add_argument("-u", "--url", type=str, required=True, help="Specifiy a website link to crawl")
    parser.add_argument("--depth", type=int, help="Specifiy max depth of crawler (default 1)", default=1)
+    parser.add_argument("--save", type=str, choices=['tree', 'json'], help="Save results in a file")
+    parser.add_argument("--visualize", type=str, choices=['table', 'tree', 'json'], help="Visualizes data collection.")
     parser.add_argument("-q", "--quiet", action="store_true")
-    parser.add_argument("-m", "--mail", action="store_true", help="Get e-mail addresses from the crawled sites")
-    parser.add_argument("-p", "--phone", action="store_true", help="Get phone numbers from the crawled sites")
     parser.add_argument("--version", action="store_true", help="Show current version of TorBot.")
     parser.add_argument("--update", action="store_true", help="Update TorBot to the latest stable version")
-    parser.add_argument("--save", action="store_true", help="Save results in a file")
     parser.add_argument("--info", action="store_true", help="Info displays basic info of the scanned site. Only supports a single URL at a time.")
-    parser.add_argument("--visualize", action="store_true", help="Visualizes tree of data gathered.")
     parser.add_argument("-v", action="store_true", help="verbose logging")
 
     return parser
@@ -114,6 +126,15 @@
 if __name__ == '__main__':
     try:
         arg_parser = set_arguments()
-        run(arg_parser)
+        config_file_path = os.path.join(project_root_directory, "pyproject.toml")
+        try:
+            version = None
+            with open(config_file_path, "rb") as f:
+                data = tomllib.load(f)
+                version = data['tool']['poetry']['version']
+        except Exception as e:
+            raise Exception("unable to find version from pyprojec.toml.\n", e)
+
+        run(arg_parser, version)
     except KeyboardInterrupt:
         print("Interrupt received! Exiting cleanly...")
diff --git a/torbot/modules/config.py b/torbot/modules/config.py
index 05e43564..836dd1e3 100644
--- a/torbot/modules/config.py
+++ b/torbot/modules/config.py
@@ -1,5 +1,4 @@
 import os
-import logging
 
 from dotenv import load_dotenv
 from inspect import getsourcefile
diff --git a/torbot/modules/io.py b/torbot/modules/io.py
deleted file mode 100644
index 8bab0cf5..00000000
--- a/torbot/modules/io.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""
-This module is used for reading HTML pages using either bs4.BeautifulSoup
-objects or url strings
-"""
-import http.client
-import tabulate
-
-from pprint import pprint
-from .linktree import LinkTree
-
-from .api import get_ip
-from .color import color
-
-
-def print_tor_ip_address() -> None:
-    """
-    https://check.torproject.org/ tells you if you are using tor and it
-    displays your IP address which we scape and display
-    """
-    resp = get_ip()
-    print(resp["header"])
-    print(color(resp["body"], "yellow"))
-
-
-def pprint_tree(tree: LinkTree) -> None:
-    """
-    Prints the status of a link based on it's connection status
-    """
-    nodes = tree.all_nodes_itr()
-    table_data = []
-
-    def insert(node, color_code):
-        status = str(node.data.status)
-        code = http.client.responses[node.data.status]
-        status_message = f'{status} {code}'
-        table_data.append([
-            node.tag,
-            node.identifier,
-            color(status_message, color_code),
-            node.data.classification,
-        ])
-
-    for node in nodes:
-        status_code = node.data.status
-        if status_code >= 200 and status_code < 300:
-            insert(node, 'green')
-        elif status_code >= 300 and status_code < 400:
-            insert(node, 'yellow')
-        else:
-            insert(node, 'red')
-
-    headers = ["Title", "URL", "Status", "Category"]
-    table = tabulate.tabulate(table_data, headers=headers)
-    print(table)
-
-
-def print_json(url: str, depth: int = 1) -> None:
-    """
-    Prints the JSON representation of a Link node.
-
-    Returns:
-        root (dict): Dictionary containing the root node and it's children
-    """
-    tree = LinkTree(url=url, depth=depth)
-    tree.load()
-    print(tree.to_json())
diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index 3983cd42..9ceda13a 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -1,27 +1,35 @@
 """
 Module is used for analyzing link relationships
 """
+import http.client
 import os
 import httpx
 import validators
 import logging
+import phonenumbers
 
+from urllib import parse
+from tabulate import tabulate
 from treelib import Tree, exceptions, Node
 from bs4 import BeautifulSoup
 
+from .color import color
 from .config import project_root_directory
 from .nlp.main import classify
 
 
 class LinkNode(Node):
-    def __init__(self, title: str, url: str, status: int,
-                 classification: str, accuracy: float):
+    def __init__(self, title: str, url: str, status: int, classification: str, accuracy: float,
+                 numbers: list[str], emails: list[str]):
         super().__init__()
         self.identifier = url
         self.tag = title
         self.status = status
         self.classification = classification
         self.accuracy = accuracy
+        self.numbers = numbers
+        self.emails = emails
+
 
 class LinkTree(Tree):
     def __init__(self, url: str, depth: int) -> None:
@@ -38,12 +46,14 @@ def _append_node(self, id: str, parent_id: str | None) -> None:
         Creates a node for a tree using the given ID which corresponds to a URL.
         If the parent_id is None, this will be considered a root node.
         """
-        resp = httpx.get(id, proxies='socks5://127.0.0.1:9050')
+        resp = httpx.get(id, timeout=60, proxies='socks5://127.0.0.1:9050')
         soup = BeautifulSoup(resp.text, 'html.parser')
-        title = soup.title.text.strip() if soup.title is not None else id
+        title = soup.title.text.strip() if soup.title is not None else parse_hostname(id)
         try:
             [classification, accuracy] = classify(resp.text)
-            data = LinkNode(title, id, resp.status_code, classification, accuracy)
+            numbers = parse_phone_numbers(soup)
+            emails = parse_emails(soup)
+            data = LinkNode(title, id, resp.status_code, classification, accuracy, numbers, emails)
             self.create_node(title, identifier=id, parent=parent_id, data=data)
         except exceptions.DuplicatedNodeIdError:
             logging.debug(f"found a duplicate URL {id}")
@@ -54,19 +64,82 @@ def _build_tree(self,  url: str, depth: int) -> None:
         """
         if depth > 0:
            depth -= 1
-            resp = httpx.get(url, proxies='socks5://127.0.0.1:9050')
+            resp = httpx.get(url, timeout=60, proxies='socks5://127.0.0.1:9050')
             children = parse_links(resp.text)
             for child in children:
                 self._append_node(id=child, parent_id=url)
                 self._build_tree(url=child, depth=depth)
+
+    def _get_tree_file_name(self) -> str:
+        root_id = self.root
+        root_node = self.get_node(root_id)
+        if root_node is None:
+            raise Exception('no root node can be found.')
+        return os.path.join(project_root_directory, f'{root_node.tag} - Depth {self._depth}')
+    
     def save(self) -> None:
         """
         Saves the tree to the current working directory under the given file name.
         """
-        root_id = self.root
-        root_node = self.get_node(root_id)
-        self.save2file(os.path.join(project_root_directory, root_node.tag))
+        file_name = self._get_tree_file_name()
+        self.save2file(file_name)
+
+    def saveJSON(self) -> None:
+        """
+        Saves the tree to the current working directory under the given file name in JSON.
+ """ + json_data = self.to_json() + with open(self._get_tree_file_name(), 'w+') as f: + f.write(json_data) + + def showJSON(self) -> None: + """ + Prints tree to console as JSON + """ + json_data = self.to_json() + print(json_data) + + def showTable(self) -> None: + """ + Prints the status of a link based on it's connection status + """ + nodes = self.all_nodes_itr() + table_data = [] + + def insert(node, color_code): + status = str(node.data.status) + code = http.client.responses[node.data.status] + status_message = f'{status} {code}' + table_data.append([ + node.tag, + node.identifier, + color(status_message, color_code), + node.data.numbers, + node.data.emails, + node.data.classification, + ]) + + for node in nodes: + status_code = node.data.status + if status_code >= 200 and status_code < 300: + insert(node, 'green') + elif status_code >= 300 and status_code < 400: + insert(node, 'yellow') + else: + insert(node, 'red') + + headers = ["Title", "URL", "Status", "Phone Numbers", "Emails", "Category"] + table = tabulate(table_data, headers=headers) + print(table) + + +def parse_hostname(url: str) -> str: + hostname = parse.urlsplit(url).hostname + if hostname is not None: + return hostname + + raise Exception('unable to parse hostname from URL') def parse_links(html: str) -> list[str]: @@ -77,3 +150,43 @@ def parse_links(html: str) -> list[str]: tags = soup.find_all('a') return [tag['href'] for tag in tags if tag.has_attr('href') and validators.url(tag['href'])] + +def parse_emails(soup: BeautifulSoup) -> list[str]: + """ + Finds all anchor tags and parses the email href attributes. + example attribute: `mailto:example@example.com` + """ + tags = soup.find_all('a') + + emails = set() + for tag in tags: + if tag.has_attr('href') and 'mailto:' in tag['href']: + email = tag['href'].split('mailto:', 1)[1] + if validators.email(email): + emails.add(set) + + return list(emails) + + +def parse_phone_numbers(soup: BeautifulSoup) -> list[str]: + """ + Finds all anchor tags and parses the href attribute. + """ + tags = soup.find_all('a') + numbers = set() + for tag in tags: + if tag.has_attr('href') and 'tel:' in tag['href']: + number = tag['href'].split('tel:', 1)[1] + try: + if phonenumbers.is_valid_number(number): + numbers.add(number) + except: + pass + + try: + if phonenumbers.is_valid_number(tag['href']): + numbers.add(tag['href']) + except: + pass + + return list(numbers) diff --git a/torbot/modules/savefile.py b/torbot/modules/savefile.py deleted file mode 100644 index cf93b70f..00000000 --- a/torbot/modules/savefile.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Module that facilitates the saving of data to JSON file. -""" -import json -import time -import os - -from .config import get_data_directory - - -def saveJson(datatype: str, data: list): - """ - Creates json file and stores data as JSON. - - Args: - datatype (str): Type of the object being passed. - data (list): List of data elements of type 'datatype' to be saved. - - Returns: - (str): Name of file data was saved to. 
- """ - timestr = time.strftime("%Y%m%d-%H%M%S") - file_name = "TorBot-Export-" + datatype + timestr + ".json" - data_directory = get_data_directory() - file_path = os.path.join(data_directory, file_name) - - # Json File Creation - with open(file_path, 'w+') as f: - # Store data in Json format - output = {datatype: data} - # Dump output to file - json.dump(output, f, indent=2) - - print("\nData will be saved with a File Name :", file_name) - return file_name diff --git a/torbot/modules/tests/test_savetofile.py b/torbot/modules/tests/test_savetofile.py deleted file mode 100644 index 1cebc80d..00000000 --- a/torbot/modules/tests/test_savetofile.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Test module for saving data to file -""" -import json -import os -from ..savefile import saveJson - - -def test_save_json_successful(): - """ - Sucessfully create and dump JSON object of links - """ - mock_data = [ - 'http://aff.ironsocket.com/SH7L', 'http://aff.ironsocket.com/SH7L', 'http://wsrs.net/', 'http://cmsgear.com/' - ] - try: - file_name = saveJson('Links', mock_data) - mock_output = {'Links': mock_data} - - with open('test_file.json', 'w+') as test_file: - json.dump(mock_output, test_file, indent=2) - - os.chdir(os.getcwd()) - assert os.path.isfile(file_name) is True - mock_file = open(file_name, 'r') - test_file = open('test_file.json', 'r') - - mock_data = mock_file.read() - test_data = test_file.read() - - finally: - os.remove(file_name) - os.remove('test_file.json') - - assert mock_data == test_data - - -if __name__ == '__main__': - test_save_json_successful() From f702d5b61c07df89e06a97ca868ea18bb84089dd Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 10:20:52 -0400 Subject: [PATCH 13/25] fix tree printing --- torbot/main.py | 4 ++-- torbot/modules/linktree.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/torbot/main.py b/torbot/main.py index f1997d99..fdc1a3a9 100644 --- a/torbot/main.py +++ b/torbot/main.py @@ -98,7 +98,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None: if args.visualize == 'table' or not args.visualize: tree.showTable() elif args.visualize == 'tree': - tree.show() + print(tree) elif args.visualize == 'json': tree.showJSON() @@ -133,7 +133,7 @@ def set_arguments() -> argparse.ArgumentParser: data = tomllib.load(f) version = data['tool']['poetry']['version'] except Exception as e: - raise Exception("unable to find version from pyprojec.toml.\n", e) + raise Exception("unable to find version from pyproject.toml.\n", e) run(arg_parser, version) except KeyboardInterrupt: diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py index 9ceda13a..bb1056f7 100644 --- a/torbot/modules/linktree.py +++ b/torbot/modules/linktree.py @@ -83,14 +83,15 @@ def save(self) -> None: Saves the tree to the current working directory under the given file name. """ file_name = self._get_tree_file_name() - self.save2file(file_name) + self.save2file(f'{file_name}.txt') def saveJSON(self) -> None: """ Saves the tree to the current working directory under the given file name in JSON. """ json_data = self.to_json() - with open(self._get_tree_file_name(), 'w+') as f: + file_name = self._get_tree_file_name() + with open(f'{file_name}.json', 'w+') as f: f.write(json_data) def showJSON(self) -> None: @@ -171,6 +172,7 @@ def parse_emails(soup: BeautifulSoup) -> list[str]: def parse_phone_numbers(soup: BeautifulSoup) -> list[str]: """ Finds all anchor tags and parses the href attribute. 
+    example attribute: `tel:+45651112331` or possibly the href attribute itself.
     """
     tags = soup.find_all('a')
     numbers = set()

From 5cfcde398d0d0a143096e6301053955d01b9ac70 Mon Sep 17 00:00:00 2001
From: Akeem King
Date: Mon, 9 Oct 2023 10:34:59 -0400
Subject: [PATCH 14/25] more major changes

---
 .env                                    |  4 +-
 poetry.lock                             | 23 +---------
 pyproject.toml                          |  1 -
 requirements.txt                        |  1 -
 torbot/main.py                          | 58 +++++++++++++------------
 torbot/modules/api.py                   |  4 +-
 torbot/modules/config.py                |  3 ++
 torbot/modules/info.py                  | 27 ++++++------
 torbot/modules/linktree.py              |  7 +--
 torbot/modules/tests/test_pagereader.py | 41 -----------------
 10 files changed, 56 insertions(+), 113 deletions(-)
 delete mode 100644 torbot/modules/tests/test_pagereader.py

diff --git a/.env b/.env
index aa0b014d..eb047ad1 100644
--- a/.env
+++ b/.env
@@ -1,3 +1,3 @@
 export TORBOT_DATA_DIR=${PWD}/data
-export HOST='localhost'
-export PORT=8081
\ No newline at end of file
+export SOCKS5_HOST='127.0.0.1'
+export SOCKS5_PORT=9050
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index e2efb3f5..20fcc0fb 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -413,27 +413,6 @@ files = [
     {file = "pywin32_ctypes-0.2.2-py3-none-any.whl", hash = "sha256:bf490a1a709baf35d688fe0ecf980ed4de11d2b3e37b51e5442587a75d9957e7"},
 ]
 
-[[package]]
-name = "requests"
-version = "2.31.0"
-description = "Python HTTP for Humans."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
-    {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
-]
-
-[package.dependencies]
-certifi = ">=2017.4.17"
-charset-normalizer = ">=2,<4"
-idna = ">=2.5,<4"
-urllib3 = ">=1.21.1,<3"
-
-[package.extras]
-socks = ["PySocks (>=1.5.6,!=1.5.7)"]
-use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
-
 [[package]]
 name = "scikit-learn"
 version = "1.3.0"
@@ -714,4 +693,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<=3.11.4"
-content-hash = "7b3ae36389472ec97dd5aacc437381b5c7f13f3d08e4ab738ef699b46c85a17a"
+content-hash = "fa048130f884a71b33d42a8dd2940a2c17365309afe56ae1c6abc2dfc6ee5a40"
diff --git a/pyproject.toml b/pyproject.toml
index daf30849..cfdffc80 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,6 @@ treelib = "^1.6.1"
 numpy = "1.24.4"
 unipath = "^1.1"
 httpx = {extras = ["socks"], version = "^0.25.0"}
-requests = "^2.31.0"
 tabulate = "^0.9.0"
 phonenumbers = "^8.13.22"
 
diff --git a/requirements.txt b/requirements.txt
index 31442efa..a383a60a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,7 +21,6 @@ pyinstaller==5.13.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 pysocks==1.7.1 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 python-dotenv==0.20.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 pywin32-ctypes==0.2.2 ; python_version >= "3.9" and python_full_version <= "3.11.4" and sys_platform == "win32"
-requests==2.31.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 scikit-learn==1.3.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 scipy==1.10.0 ; python_version >= "3.9" and python_full_version <= "3.11.4"
 setuptools==68.2.2 ; python_version >= "3.9" and python_full_version <= "3.11.4"
diff --git a/torbot/main.py b/torbot/main.py
index fdc1a3a9..0d8b27c2 100644
--- a/torbot/main.py
+++ b/torbot/main.py
@@ -6,21 +6,22 @@
 import sys
 import logging
 import tomllib
+import httpx
 
 from modules.api import get_ip
 from modules.color import color
 from modules.updater import check_version
 from modules.info import execute_all
 from modules.linktree import LinkTree
-from modules.config import project_root_directory
+from modules.config import project_root_directory, socks5_host, socks5_port
 
 
-def print_tor_ip_address() -> None:
+def print_tor_ip_address(client: httpx.Client) -> None:
     """
     https://check.torproject.org/ tells you if you are using tor and it
     displays your IP address which we scrape and display
     """
-    resp = get_ip()
+    resp = get_ip(client)
     print(resp["header"])
     print(color(resp["body"], "yellow"))
 
@@ -77,30 +78,33 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
         check_version()
         sys.exit()
 
-    # print header and IP address if not set to quiet
-    if not args.quiet:
-        print_header(version)
-        print_tor_ip_address()
-
-    if args.info:
-        execute_all(args.url)
-
-    tree = LinkTree(url=args.url, depth=args.depth)
-    tree.load()
-
-    # save data if desired
-    if args.save == 'tree':
-        tree.save()
-    elif args.save == 'json':
-        tree.saveJSON()
-
-    # always print something, table is the default
-    if args.visualize == 'table' or not args.visualize:
-        tree.showTable()
-    elif args.visualize == 'tree':
-        print(tree)
-    elif args.visualize == 'json':
-        tree.showJSON()
+
+    socks5_proxy = f'socks5://{socks5_host}:{socks5_port}'
+    with httpx.Client(timeout=60, proxies=socks5_proxy) as client:
+        # print header and IP address if not set to quiet
+        if not args.quiet:
+            print_header(version)
+            print_tor_ip_address(client)
+
+        if args.info:
+            execute_all(client, args.url)
+
+        tree = LinkTree(url=args.url, depth=args.depth, client=client)
+        tree.load()
+
+        # save data if desired
+        if args.save == 'tree':
+            tree.save()
+        elif args.save == 'json':
+            tree.saveJSON()
+    
+        # always print something, table is the default
+        if args.visualize == 'table' or not args.visualize:
+            tree.showTable()
+        elif args.visualize == 'tree':
+            print(tree)
+        elif args.visualize == 'json':
+            tree.showJSON()
 
     print("\n\n")
 
diff --git a/torbot/modules/api.py b/torbot/modules/api.py
index 18cbf465..39f9a092 100644
--- a/torbot/modules/api.py
+++ b/torbot/modules/api.py
@@ -12,11 +12,11 @@
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
 
-def get_ip() -> dict:
+def get_ip(client: httpx.Client) -> dict:
     """
     Returns the IP address of the current Tor client the service is using.
""" - resp = httpx.get("https://check.torproject.org/", proxies='socks5://127.0.0.1:9050') + resp = client.get("https://check.torproject.org/") soup = BeautifulSoup(resp.text, 'html.parser') # Get the content of check tor project, this contains the header and body diff --git a/torbot/modules/config.py b/torbot/modules/config.py index 836dd1e3..0ebd33e3 100644 --- a/torbot/modules/config.py +++ b/torbot/modules/config.py @@ -18,6 +18,9 @@ dotenv_path = os.path.join(project_root_directory, '.env') load_dotenv(dotenv_path=dotenv_path, verbose=True) +socks5_host = os.getenv('SOCKS5_HOST') +socks5_port = os.getenv('SOCKS5_PORT') + def get_data_directory(): data_directory = os.getenv('TORBOT_DATA_DIR') # if a path is not set, write data to the config directory diff --git a/torbot/modules/info.py b/torbot/modules/info.py index 6a849921..92491052 100644 --- a/torbot/modules/info.py +++ b/torbot/modules/info.py @@ -8,7 +8,6 @@ from urllib.parse import urlsplit from bs4 import BeautifulSoup from termcolor import cprint -from requests.exceptions import HTTPError keys = set() # high entropy strings, prolly secret keys @@ -32,7 +31,7 @@ ] -def execute_all(link, *, display_status=False): +def execute_all(client: httpx.Client, link: str, *, display_status: bool = False) -> None: """Initialise datasets and functions to retrieve data, and execute each for a given link. @@ -42,15 +41,15 @@ def execute_all(link, *, display_status=False): attempts to terminal. """ - resp = httpx.get(link, proxies='socks5://127.0.0.1:9050') + resp = client.get(link) soup = BeautifulSoup(resp.text, 'html.parser') validation_functions = [ get_robots_txt, get_dot_git, get_dot_svn, get_dot_git, get_intel, get_dot_htaccess, get_bitcoin_address ] for validate_func in validation_functions: try: - validate_func(link, resp) - except (ConnectionError, HTTPError): + validate_func(client, link, resp) + except: cprint('Error', 'red') display_webpage_description(soup) @@ -71,7 +70,7 @@ def display_headers(response): print('*', key, ':', val) -def get_robots_txt(target, response): +def get_robots_txt(client: httpx.Client, target: str, response: str) -> None: """ Check link for Robot.txt, and if found, add link to robots dataset. Args: @@ -81,7 +80,7 @@ def get_robots_txt(target, response): cprint("[*]Checking for Robots.txt", 'yellow') url = target target = "{0.scheme}://{0.netloc}/".format(urlsplit(url)) - httpx.get(target + "robots.txt", proxies='socks5://127.0.0.1:9050') + client.get(target + "robots.txt") print(target + "robots.txt") matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response) for match in matches: @@ -93,7 +92,7 @@ def get_robots_txt(target, response): print(robots) -def get_intel(link, response): +def get_intel(client: httpx.Client, url: str, response: str) -> None: """ Check link for intel, and if found, add link to intel dataset, including but not limited to website accounts and AWS buckets. @@ -109,7 +108,7 @@ def get_intel(link, response): intel.add(match) -def get_dot_git(target, response): +def get_dot_git(client: httpx.Client, target: str, response: str) -> None: """ Check link for .git folders exposed on public domain. 
     Args:
@@ -119,7 +118,7 @@ def get_dot_git(target, response):
     cprint("[*]Checking for .git folder", 'yellow')
     url = target
     target = "{0.scheme}://{0.netloc}/".format(urlsplit(url))
-    resp = httpx.get(target + "/.git/config", proxies='socks5://127.0.0.1:9050')
+    resp = client.get(target + "/.git/config")
     if not resp.text.__contains__("404"):
         cprint("Alert!", 'red')
         cprint(".git folder exposed publicly", 'red')
@@ -127,7 +126,7 @@ def get_dot_git(target, response):
         cprint("NO .git folder found", 'blue')
 
 
-def get_bitcoin_address(target, response):
+def get_bitcoin_address(client: httpx.Client, target: str, response: str) -> None:
     """
     Check link for Bitcoin addresses, and if found, print.
     Args:
@@ -140,7 +139,7 @@ def get_bitcoin_address(target, response):
     print("BTC: ", bitcoin)
 
 
-def get_dot_svn(target, response):
+def get_dot_svn(client: httpx.Client, target: str, response: str) -> None:
     """
     Check link for .svn folders exposed on public domain=.
     Args:
@@ -158,7 +157,7 @@ def get_dot_svn(target, response):
         cprint("NO .SVN folder found", 'blue')
 
 
-def get_dot_htaccess(target, response):
+def get_dot_htaccess(client: httpx.Client, target: str, response: str) -> None:
     """
     Check link for .htaccess files on public domain.
     Args:
@@ -179,7 +178,7 @@ def get_dot_htaccess(target, response):
         cprint(resp, 'blue')
 
 
-def display_webpage_description(soup):
+def display_webpage_description(soup: BeautifulSoup) -> None:
     """Print all meta tags found in page.
 
     Args:
diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index bb1056f7..30531b90 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -32,10 +32,11 @@ class LinkNode(Node):
 
 
 class LinkTree(Tree):
-    def __init__(self, url: str, depth: int) -> None:
+    def __init__(self, url: str, depth: int, client: httpx.Client) -> None:
         super().__init__()
         self._url = url
         self._depth = depth
+        self._client = client
    
     def load(self) -> None:
         self._append_node(id=self._url, parent_id=None)
@@ -46,7 +47,7 @@ def _append_node(self, id: str, parent_id: str | None) -> None:
         Creates a node for a tree using the given ID which corresponds to a URL.
         If the parent_id is None, this will be considered a root node.
""" - resp = httpx.get(id, timeout=60, proxies='socks5://127.0.0.1:9050') + resp = self._client.get(id) soup = BeautifulSoup(resp.text, 'html.parser') title = soup.title.text.strip() if soup.title is not None else parse_hostname(id) try: @@ -64,7 +65,7 @@ def _build_tree(self, url: str, depth: int) -> None: """ if depth > 0: depth -= 1 - resp = httpx.get(url, timeout=60, proxies='socks5://127.0.0.1:9050') + resp = self._client.get(url) children = parse_links(resp.text) for child in children: self._append_node(id=child, parent_id=url) diff --git a/torbot/modules/tests/test_pagereader.py b/torbot/modules/tests/test_pagereader.py deleted file mode 100644 index 6c7d872e..00000000 --- a/torbot/modules/tests/test_pagereader.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Test module for reading pages -""" -import pytest -import requests_mock - -from yattag import Doc -from ..io import LinkIO - - -@pytest.fixture -def read_func(): - """ - Tests if read is returning the expected html - """ - websites = [] - test_data = [ - ('https://www.test.com', 'This is a dot com site.'), ('https://www.test.org', 'This is a dot org site.'), - ('https://www.test.net', 'This is a dot net site.'), ('https://www.test.onion', 'This is a dot onion site.') - ] - - doc, tag, text = Doc().tagtext() - - for data in test_data: - doc.asis('') - with tag('html'): - with tag('body'): - text(data[1]) - - websites.append(doc.getvalue()) - - with requests_mock.Mocker() as mock_connection: - for i in range(len(websites)): - mock_connection.register_uri('GET', test_data[i][0], text=test_data[i][1]) - result = LinkIO.read(test_data[i][0]) - return result, test_data[i][1] - - -def test_read(read_func): - result, test_data = read_func - assert result == test_data From d9de8d3dd8ad27a4da739725331f71537b0f2144 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 10:38:16 -0400 Subject: [PATCH 15/25] flake8 fixes --- torbot/main.py | 6 +++--- torbot/modules/config.py | 1 + torbot/modules/info.py | 6 ++++-- torbot/modules/linktree.py | 18 ++++++++++-------- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/torbot/main.py b/torbot/main.py index 0d8b27c2..e3916d45 100644 --- a/torbot/main.py +++ b/torbot/main.py @@ -78,7 +78,6 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None: check_version() sys.exit() - socks5_proxy = f'socks5://{socks5_host}:{socks5_port}' with httpx.Client(timeout=60, proxies=socks5_proxy) as client: # print header and IP address if not set to quiet @@ -97,7 +96,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None: tree.save() elif args.save == 'json': tree.saveJSON() - + # always print something, table is the default if args.visualize == 'table' or not args.visualize: tree.showTable() @@ -121,7 +120,8 @@ def set_arguments() -> argparse.ArgumentParser: parser.add_argument("-q", "--quiet", action="store_true") parser.add_argument("--version", action="store_true", help="Show current version of TorBot.") parser.add_argument("--update", action="store_true", help="Update TorBot to the latest stable version") - parser.add_argument("--info", action="store_true", help="Info displays basic info of the scanned site. Only supports a single URL at a time.") + parser.add_argument("--info", action="store_true", + help="Info displays basic info of the scanned site. 
Only supports a single URL at a time.") parser.add_argument("-v", action="store_true", help="verbose logging") return parser diff --git a/torbot/modules/config.py b/torbot/modules/config.py index 0ebd33e3..dafceb48 100644 --- a/torbot/modules/config.py +++ b/torbot/modules/config.py @@ -21,6 +21,7 @@ socks5_host = os.getenv('SOCKS5_HOST') socks5_port = os.getenv('SOCKS5_PORT') + def get_data_directory(): data_directory = os.getenv('TORBOT_DATA_DIR') # if a path is not set, write data to the config directory diff --git a/torbot/modules/info.py b/torbot/modules/info.py index 92491052..0072fd54 100644 --- a/torbot/modules/info.py +++ b/torbot/modules/info.py @@ -4,6 +4,7 @@ """ import re import httpx +import logging from urllib.parse import urlsplit from bs4 import BeautifulSoup @@ -41,7 +42,7 @@ def execute_all(client: httpx.Client, link: str, *, display_status: bool = False attempts to terminal. """ - resp = client.get(link) + resp = client.get(url=link) soup = BeautifulSoup(resp.text, 'html.parser') validation_functions = [ get_robots_txt, get_dot_git, get_dot_svn, get_dot_git, get_intel, get_dot_htaccess, get_bitcoin_address @@ -49,7 +50,8 @@ def execute_all(client: httpx.Client, link: str, *, display_status: bool = False for validate_func in validation_functions: try: validate_func(client, link, resp) - except: + except Exception as e: + logging.debug(e) cprint('Error', 'red') display_webpage_description(soup) diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py index 30531b90..79b44bb3 100644 --- a/torbot/modules/linktree.py +++ b/torbot/modules/linktree.py @@ -37,7 +37,7 @@ def __init__(self, url: str, depth: int, client: httpx.Client) -> None: self._url = url self._depth = depth self._client = client - + def load(self) -> None: self._append_node(id=self._url, parent_id=None) self._build_tree(url=self._url, depth=self._depth) @@ -70,7 +70,7 @@ def _build_tree(self, url: str, depth: int) -> None: for child in children: self._append_node(id=child, parent_id=url) self._build_tree(url=child, depth=depth) - + def _get_tree_file_name(self) -> str: root_id = self.root root_node = self.get_node(root_id) @@ -78,7 +78,7 @@ def _get_tree_file_name(self) -> str: raise Exception('no root node can be found.') return os.path.join(project_root_directory, f'{root_node.tag} - Depth {self._depth}') - + def save(self) -> None: """ Saves the tree to the current working directory under the given file name. 
@@ -94,14 +94,14 @@ def saveJSON(self) -> None:
         file_name = self._get_tree_file_name()
         with open(f'{file_name}.json', 'w+') as f:
             f.write(json_data)
-    
+
     def showJSON(self) -> None:
         """
         Prints tree to console as JSON
         """
         json_data = self.to_json()
         print(json_data)
-    
+
     def showTable(self) -> None:
         """
         Prints the status of a link based on it's connection status
@@ -183,13 +183,15 @@ def parse_phone_numbers(soup: BeautifulSoup) -> list[str]:
             try:
                 if phonenumbers.is_valid_number(number):
                     numbers.add(number)
-            except:
+            except Exception as e:
+                logging.debug(e)
                 pass
 
         try:
             if phonenumbers.is_valid_number(tag['href']):
                 numbers.add(tag['href'])
-        except:
+        except Exception as e:
+            logging.debug(e)
             pass
-    
+
     return list(numbers)

From 6f3f4dbf2de2ac4ea34e82a7436a0cf3af8722af Mon Sep 17 00:00:00 2001
From: Akeem King
Date: Mon, 9 Oct 2023 10:43:43 -0400
Subject: [PATCH 16/25] Add option to disable socks5

---
 torbot/main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torbot/main.py b/torbot/main.py
index e3916d45..4d3aaf11 100644
--- a/torbot/main.py
+++ b/torbot/main.py
@@ -79,7 +79,7 @@ def run(arg_parser: argparse.ArgumentParser, version: str) -> None:
         sys.exit()
 
     socks5_proxy = f'socks5://{socks5_host}:{socks5_port}'
-    with httpx.Client(timeout=60, proxies=socks5_proxy) as client:
+    with httpx.Client(timeout=60, proxies=socks5_proxy if not args.disable_socks5 else None) as client:
         # print header and IP address if not set to quiet
         if not args.quiet:
             print_header(version)
@@ -123,6 +123,7 @@ def set_arguments() -> argparse.ArgumentParser:
                         help="Info displays basic info of the scanned site. Only supports a single URL at a time.")
     parser.add_argument("-v", action="store_true", help="verbose logging")
+    parser.add_argument("--disable-socks5", action="store_true", help="Executes HTTP requests without using SOCKS5 proxy")
 
     return parser

From e97fca59a71b5f7161fec377345d22a52ce00bae Mon Sep 17 00:00:00 2001
From: Akeem King
Date: Mon, 9 Oct 2023 10:44:58 -0400
Subject: [PATCH 17/25] Update README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7ecd43c0..ac704cab 100755
--- a/README.md
+++ b/README.md
@@ -72,6 +72,7 @@ optional arguments:
   --save                Save results in a file. (tree, json)
   --visualize           Visualizes tree of data gathered. (tree, json, table)
   -i, --info            Info displays basic info of the scanned site 
+  --disable-socks5      Executes HTTP requests without using SOCKS5 proxy
 
 * NOTE: -u is a mandatory for crawling

From c380052e59e9896a7f4f6d6ce640c34b51ffca4e Mon Sep 17 00:00:00 2001
From: Akeem King
Date: Mon, 9 Oct 2023 10:46:00 -0400
Subject: [PATCH 18/25] Update README

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index ac704cab..18e97005 100755
--- a/README.md
+++ b/README.md
@@ -39,13 +39,13 @@
 ...(will be updated)
 
 ### Dependencies
-- Tor
-- Python ^3.11
+- Tor (Optional)
+- Python ^3.9
 - Poetry
 
 ### Python Dependencies
 
-(see requirements.txt for more details)
+(see pyproject.toml or requirements.txt for more details)
 
 ## Installation

From 3a967b960a0ff987bfd69cdb7decc5a4d3092565 Mon Sep 17 00:00:00 2001
From: Akeem King
Date: Mon, 9 Oct 2023 10:46:49 -0400
Subject: [PATCH 19/25] flake8

---
 torbot/main.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torbot/main.py b/torbot/main.py
index 4d3aaf11..3c14e154 100644
--- a/torbot/main.py
+++ b/torbot/main.py
@@ -123,7 +123,8 @@ def set_arguments() -> argparse.ArgumentParser:
                         help="Info displays basic info of the scanned site. Only supports a single URL at a time.")
     parser.add_argument("-v", action="store_true", help="verbose logging")
-    parser.add_argument("--disable-socks5", action="store_true", help="Executes HTTP requests without using SOCKS5 proxy")
+    parser.add_argument("--disable-socks5", action="store_true",
+                        help="Executes HTTP requests without using SOCKS5 proxy")
 
     return parser

From 36d3480c5afff06f6d58c5b42004a6a26a7c617e Mon Sep 17 00:00:00 2001
From: Akeem King
Date: Mon, 9 Oct 2023 11:25:37 -0400
Subject: [PATCH 20/25] better formatted JSON for tree

---
 torbot/modules/linktree.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py
index 79b44bb3..2835c34e 100644
--- a/torbot/modules/linktree.py
+++ b/torbot/modules/linktree.py
@@ -3,6 +3,7 @@
 """
 import http.client
 import os
+import json
 import httpx
 import validators
 import logging
@@ -90,17 +91,20 @@ def saveJSON(self) -> None:
         """
         Saves the tree to the current working directory under the given file name in JSON.
""" - json_data = self.to_json() + json_data = self._to_json() file_name = self._get_tree_file_name() with open(f'{file_name}.json', 'w+') as f: f.write(json_data) + def _to_json(self) -> str: + json_data = self.to_json() + return json.dumps(json.loads(json_data), indent=2) + def showJSON(self) -> None: """ Prints tree to console as JSON """ - json_data = self.to_json() - print(json_data) + print(self._to_json()) def showTable(self) -> None: """ From ddabe8a0dc676ffe1663e479464c764517331a8a Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 12:33:13 -0400 Subject: [PATCH 21/25] syntax fix and removing threadsafe --- poetry.lock | 13 +------------ pyproject.toml | 1 - requirements.txt | 1 - torbot/modules/linktree.py | 2 +- 4 files changed, 2 insertions(+), 15 deletions(-) diff --git a/poetry.lock b/poetry.lock index 20fcc0fb..b43c6c18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -612,17 +612,6 @@ files = [ {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, ] -[[package]] -name = "threadsafe" -version = "1.0.0" -description = "Thread-safe data structures" -optional = false -python-versions = "*" -files = [ - {file = "threadsafe-1.0.0-py3-none-any.whl", hash = "sha256:acbd59278ca8221dc3a8051443fe24c647ee9ac81808058e280ef6f75dd4387b"}, - {file = "threadsafe-1.0.0.tar.gz", hash = "sha256:7c61f9fdd0b3cd6c07b427de355dafcd337578d30871634cb1e8985ee4955edc"}, -] - [[package]] name = "treelib" version = "1.7.0" @@ -693,4 +682,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.9,<=3.11.4" -content-hash = "fa048130f884a71b33d42a8dd2940a2c17365309afe56ae1c6abc2dfc6ee5a40" +content-hash = "1e6d83812ac5be9a550b998795ee28f76bb788e972c933e497e68b20f548a0ea" diff --git a/pyproject.toml b/pyproject.toml index cfdffc80..a440ee67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,6 @@ soupsieve = "2.3.2.post1" termcolor = "1.1.0" texttable = "1.6.4" threadpoolctl = "3.1.0" -threadsafe = "1.0.0" urllib3 = "1.26.17" validators = "0.20.0" yattag = "1.14.0" diff --git a/requirements.txt b/requirements.txt index a383a60a..91064493 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,6 @@ tabulate==0.9.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" termcolor==1.1.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" texttable==1.6.4 ; python_version >= "3.9" and python_full_version <= "3.11.4" threadpoolctl==3.1.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" -threadsafe==1.0.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" treelib==1.7.0 ; python_version >= "3.9" and python_full_version <= "3.11.4" unipath==1.1 ; python_version >= "3.9" and python_full_version <= "3.11.4" urllib3==1.26.17 ; python_version >= "3.9" and python_full_version <= "3.11.4" diff --git a/torbot/modules/linktree.py b/torbot/modules/linktree.py index 2835c34e..c10810a2 100644 --- a/torbot/modules/linktree.py +++ b/torbot/modules/linktree.py @@ -60,7 +60,7 @@ def _append_node(self, id: str, parent_id: str | None) -> None: except exceptions.DuplicatedNodeIdError: logging.debug(f"found a duplicate URL {id}") - def _build_tree(self, url: str, depth: int) -> None: + def _build_tree(self, url: str, depth: int) -> None: """ Builds a tree from the root to the given depth. 
""" From 0706fa052e1b8080d78b99bf83f0a7c2de9785bf Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 12:34:38 -0400 Subject: [PATCH 22/25] Fix README formatting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 18e97005..0f4d7215 100755 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ optional arguments: -q, --quiet Prevents display of header and IP address --save Save results in a file. (tree, json) --visualize Visualizes tree of data gathered. (tree, json, table) - -i, --info Info displays basic info of the scanned site + -i, --info Info displays basic info of the scanned site --disable-socks5 Executes HTTP requests without using SOCKS5 proxy * NOTE: -u is a mandatory for crawling From 9663610f6c384ba0d2e693f803542c9229aca859 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 12:35:33 -0400 Subject: [PATCH 23/25] Add details to argument usage --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f4d7215..831bd0b1 100755 --- a/README.md +++ b/README.md @@ -69,8 +69,8 @@ optional arguments: --version Show current version of TorBot. --update Update TorBot to the latest stable version -q, --quiet Prevents display of header and IP address - --save Save results in a file. (tree, json) - --visualize Visualizes tree of data gathered. (tree, json, table) + --save FORMAT Save results in a file. (tree, json) + --visualize FORMAT Visualizes tree of data gathered. (tree, json, table) -i, --info Info displays basic info of the scanned site --disable-socks5 Executes HTTP requests without using SOCKS5 proxy From ef6e06bc7785355b1701d5524eb4550441086ac4 Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 12:40:42 -0400 Subject: [PATCH 24/25] remove unused validators file --- torbot/modules/validators.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 torbot/modules/validators.py diff --git a/torbot/modules/validators.py b/torbot/modules/validators.py deleted file mode 100644 index a8e78c3f..00000000 --- a/torbot/modules/validators.py +++ /dev/null @@ -1,13 +0,0 @@ -import validators - - -def validate_email(email): - if not isinstance(email, str): - return False - return validators.email(email) - - -def validate_link(link): - if not isinstance(link, str): - return False - return validators.url(link) From ab336995d8a814cd7d47d4d1bc399e17a9323e5b Mon Sep 17 00:00:00 2001 From: Akeem King Date: Mon, 9 Oct 2023 13:01:41 -0400 Subject: [PATCH 25/25] Updating README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c86cfa79..9066a272 100755 --- a/README.md +++ b/README.md @@ -53,8 +53,8 @@ * TorBot dependencies are managed using `poetry`, you can find the installation commands below: ```sh poetry install # to install dependencies -poetry run python run.py -u https://www.example.com --depth 2 --visualize tree --save json # example of running command with poetry -poetry run python run.py -h # for help +poetry run python torbot/main.py -u https://www.example.com --depth 2 --visualize tree --save json # example of running command with poetry +poetry run python torbot/main.py -h # for help ``` ### Options