diff --git a/.gitignore b/.gitignore
index b0b6f3a..a3b6bc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,12 @@
+# Project Specific
+output/*
+!output/.gitkeep
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
-# C extensions
-*.so
-
# Distribution / packaging
.Python
build/
@@ -26,12 +27,6 @@ share/python-wheels/
*.egg
MANIFEST
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
@@ -51,74 +46,6 @@ coverage.xml
.pytest_cache/
cover/
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
# Environments
.env
.venv
@@ -128,33 +55,5 @@ ENV/
env.bak/
venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
# PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
\ No newline at end of file
diff --git a/README.md b/README.md
index 21cf061..dc2a1a2 100644
--- a/README.md
+++ b/README.md
@@ -51,12 +51,12 @@ $ torcrawl -v -u http://www.github.com/ -c -d 2 -p 2
## Installation
### Easy Installation with pip:
-*Comming soon..*
+*Coming soon...*
### Manual Installation:
1. **Clone this repository**:
`git clone https://github.com/MikeMeliz/TorCrawl.py.git`
-2. **Install dependecies**:
+2. **Install dependencies**:
`pip install -r requirements.txt`
3. **Install and Start TOR Service**:
1. **Debian/Ubuntu**:
@@ -82,12 +82,12 @@ $ torcrawl -v -u http://www.github.com/ -c -d 2 -p 2
-f |--folder| The directory which will contain the generated files
**Extract**: | |
-e |--extract| Extract page's code to terminal or file (Default: Terminal)
--i |--input filename| Input file with URL(s) (seperated by line)
+-i |--input filename| Input file with URL(s) (separated by line)
-o |--output [filename]| Output page(s) to file(s) (for one page)
-y |--yara | Perform yara keyword search:
h = search entire html object,
t = search only text
**Crawl**: | |
-c |--crawl| Crawl website (Default output on website/links.txt)
--d |--cdepth| Set depth of crawler's travel (Default: 1)
+-d |--depth| Set depth of crawler's travel (Default: 1)
-p |--pause| Seconds of pause between requests (Default: 0)
-l |--log| Log file with visited URLs and their response code
@@ -134,8 +134,7 @@ $ python torcrawl.py -i links.txt
### As Crawler:
-Crawl the links of the webpage without the use of TOR,
-also show verbose output (really helpfull):
+Crawl the links of the webpage without the use of TOR, also show verbose output (really helpful):
```shell
$ python torcrawl.py -v -w -u http://www.github.com/ -c
@@ -216,6 +215,10 @@ Feel free to contribute on this project! Just fork it, make any change on your f
## Changelog
```shell
+v1.32:
+ * Removed 1 second default pause between requests
+ * Several improvements to results
+ * Improved logs
v1.31:
* Fixed Input Link NoneType Error
* Fixed name mismatch
diff --git a/modules/checker.py b/modules/checker.py
index 3099543..59b1566 100644
--- a/modules/checker.py
+++ b/modules/checker.py
@@ -50,14 +50,22 @@ def folder(website, verbose):
:param website: String - URL of website to crawl.
:param verbose: Boolean - Logging level.
- :return: String 'out_path' - Path of the output folder.
+ :return: String 'output_folder' - Path of the output folder.
"""
- out_path = website
- if not os.path.exists(out_path):
- os.makedirs(out_path)
+ parsed = urlparse(website)
+ if parsed.scheme != '':
+ output_folder = "output/" + urlparse(website).netloc
+ else:
+ output_folder = "output/" + website
+ if not os.path.exists(output_folder):
+ try:
+ os.makedirs(output_folder)
+ except FileExistsError:
+ if verbose:
+ print(f"## Folder exists already: {website}")
if verbose:
- print(f"## Folder created: {out_path}")
- return out_path
+ print(f"## Folder created: {website}")
+ return output_folder
def check_tor(verbose):
@@ -87,11 +95,11 @@ def check_ip():
""" Checks users IP from external resource.
:return: None or HTTPError
"""
- addr = 'https://api.ipify.org/?format=json'
+ api_address = 'https://api.ipify.org/?format=json'
try:
- my_ip = load(urlopen(addr))['ip']
+ my_ip = load(urlopen(api_address))['ip']
print(f'## Your IP: {my_ip}')
except HTTPError as err:
error = sys.exc_info()[0]
- print(f"Error: {error} \n## IP cannot be obtained. \n## Is {addr} up? "
+ print(f"Error: {error} \n## IP cannot be obtained. \n## Is {api_address} up? "
f"\n## HTTPError: {err}")
diff --git a/modules/crawler.py b/modules/crawler.py
index f84b2fe..5430f61 100644
--- a/modules/crawler.py
+++ b/modules/crawler.py
@@ -26,7 +26,7 @@ def excludes(self, link):
:param link: String
:return: Boolean
"""
- now = datetime.datetime.now().strftime("%Y%m%d")
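+        # Short date (YYMMDD) used to prefix this run's output files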
+ now = datetime.datetime.now().strftime("%y%m%d")
# BUG: For NoneType Exceptions, got to find a solution here
if link is None:
@@ -36,31 +36,33 @@ def excludes(self, link):
return True
# External links
elif link.startswith('http') and not link.startswith(self.website):
- file_path = self.out_path + '/' + now + '_extlinks.txt'
- with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+ file_path = self.out_path + '/' + now + '_ext-links.txt'
+ with open(file_path, 'a+', encoding='UTF-8') as lst_file:
lst_file.write(str(link) + '\n')
return True
# Telephone Number
elif link.startswith('tel:'):
file_path = self.out_path + '/' + now + '_telephones.txt'
- with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+ with open(file_path, 'a+', encoding='UTF-8') as lst_file:
lst_file.write(str(link) + '\n')
return True
# Mails
elif link.startswith('mailto:'):
file_path = self.out_path + '/' + now + '_mails.txt'
- with open(file_path, 'w+', encoding='UTF-8') as lst_file:
+ with open(file_path, 'a+', encoding='UTF-8') as lst_file:
lst_file.write(str(link) + '\n')
return True
# Type of files
- elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link,
- re.IGNORECASE):
+ elif re.search('^.*\\.(pdf|jpg|jpeg|png|gif|doc)$', link, re.IGNORECASE):
+ file_path = self.out_path + '/' + now + '_files.txt'
+ with open(file_path, 'a+', encoding='UTF-8') as lst_file:
+ lst_file.write(str(link) + '\n')
return True
def canonical(self, link):
""" Canonicalization of the link.
- :param link: String
+ :param link: String: URL(s)
:return: String 'final_link': parsed canonical url.
"""
# Already formatted
@@ -83,6 +85,20 @@ def canonical(self, link):
final_link = self.website + "/" + link
return final_link
+    def write_log(self, log):
+        """ Appends a timestamped entry to the crawler's log file.
+
+        :param log: String - Message to write to the log.
+        :return: None
+        """
+        log_path = self.out_path + '/crawler.log'
+        now = datetime.datetime.now()
+
+        if self.logs is True:
+            # Touch the file first so the writability check below is meaningful.
+            open(log_path, 'a+', encoding='UTF-8').close()
+            if os.access(log_path, os.W_OK) is False:
+                print(f"## Unable to write to {log_path} - Exiting")
+                sys.exit(2)
+        with open(log_path, 'a+', encoding='UTF-8') as log_file:
+            log_file.write(str(now) + " [crawler.py] " + log)
+
+
def crawl(self):
""" Core of the crawler.
:return: List (ord_lst) - List of crawled links.
@@ -91,11 +107,6 @@ def crawl(self):
ord_lst = []
ord_lst.insert(0, self.website)
ord_lst_ind = 0
- log_path = self.out_path + '/log.txt'
-
- if self.logs is True and os.access(log_path, os.W_OK) is False:
- print(f"## Unable to write to {self.out_path}/log.txt - Exiting")
- sys.exit(2)
print(f"## Crawler started from {self.website} with "
f"{str(self.c_depth)} depth crawl, and {str(self.c_pause)} "
@@ -113,27 +124,23 @@ def crawl(self):
if item is not None:
html_page = urllib.request.urlopen(item)
except (HTTPError, URLError) as error:
- print('## ERROR: Domain or link seems to be '
- 'unreachable. Add -v to see the verbose error.'
- 'Or write the full URL at -u argument!')
- if self.verbose: print(error)
+ self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+ f"Message: {error}\n")
continue
else:
try:
html_page = urllib.request.urlopen(self.website)
ord_lst_ind += 1
except (HTTPError, URLError) as error:
- print('## ERROR: Domain or link seems to be '
- 'unreachable. Add -v to see the verbose error.'
- 'Or write the full URL at -u argument!')
- if self.verbose: print(error)
+ self.write_log(f"[INFO] ERROR: Domain or link seems to be unreachable: {str(item)} | "
+ f"Message: {error}\n")
ord_lst_ind += 1
continue
try:
soup = BeautifulSoup(html_page, features="html.parser")
- except TypeError as err:
- print(f"## Soup Error Encountered:: could to parse "
+ except TypeError:
+ print(f"## Soup Error Encountered:: couldn't parse "
f"ord_list # {ord_lst_ind}::{ord_lst[ord_lst_ind]}")
continue
@@ -159,7 +166,7 @@ def crawl(self):
if ver_link is not None:
lst.add(ver_link)
- # TODO: For non-formal links, using RegEx
+ # TODO: For non-formal links, using RegEx, should be an additional parameter, and all patterns to be stored in a file
# url_pattern = r'/(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])/igm'
# html_content = urllib.request.urlopen(self.website).read().decode('utf-8')
@@ -180,21 +187,19 @@ def crawl(self):
ord_lst = ord_lst + list(set(lst))
ord_lst = list(set(ord_lst))
+ # Keeps logs for every webpage visited.
+ page_code = html_page.status
+ url_visited = f"[{str(page_code)}] {str(item)} \n"
+ self.write_log("[INFO] Parsed: " + url_visited)
+
if self.verbose:
- sys.stdout.write("-- Results: " + str(len(ord_lst)) + "\r")
+ sys.stdout.write(" -- Results: " + str(len(ord_lst)) + "\r")
sys.stdout.flush()
- # Pause time.
- if (ord_lst.index(item) != len(ord_lst) - 1) and \
- float(self.c_pause) > 0:
+ # Add Pause time between each iteration
+ if (ord_lst.index(item) != len(ord_lst) - 1) and float(self.c_pause) > 0:
time.sleep(float(self.c_pause))
- # Keeps logs for every webpage visited.
- if self.logs:
- it_code = html_page.getcode()
- with open(log_path, 'w+', encoding='UTF-8') as log_file:
- log_file.write(f"[{str(it_code)}] {str(item)} \n")
-
print(f"## Step {str(index + 1)} completed "
f"with: {str(len(ord_lst))} result(s)")
diff --git a/modules/extractor.py b/modules/extractor.py
index 89a6bd9..a4beacd 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -8,8 +8,8 @@
from urllib.error import URLError
from http.client import InvalidURL
from http.client import IncompleteRead
-
from bs4 import BeautifulSoup
+from pathlib import Path
from modules.checker import url_canon
@@ -55,16 +55,17 @@ def check_yara(raw=None, yara=0):
return matches
-def cinex(input_file, out_path, yara=None):
+def input_file_to_folder(input_file, output_path, yara=None):
""" Ingests the crawled links from the input_file,
-    scrapes the contents of the resulting web pages and writes the contents to
-    the into out_path/{url_address}.
+    scrapes the contents of the resulting web pages and writes the contents
+    into output_path/{url_address}.
:param input_file: String: Filename of the crawled Urls.
- :param out_path: String: Pathname of results.
+ :param output_path: String: Pathname of results.
:param yara: Integer: Keyword search argument.
:return: None
"""
+ i = 0
file = io.TextIOWrapper
try:
file = open(input_file, 'r')
@@ -92,21 +93,25 @@ def cinex(input_file, out_path, yara=None):
if yara is not None:
full_match_keywords = check_yara(content, yara)
-
if len(full_match_keywords) == 0:
print('No matches found.')
continue
- with open(out_path + "/" + output_file, 'wb') as results:
+                # Add an incremental suffix if the filename already exists (e.g. index.htm(1))
+                filename = Path(output_path + "/" + output_file)
+                while filename.is_file():
+                    i += 1
+                    filename = Path(output_path + "/" + output_file + "(" + str(i) + ")")
+ with open(filename, 'wb') as results:
results.write(content)
- print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
+ print(f"# File created on: {os.getcwd()}/{filename}")
except HTTPError as e:
- print(f"Cinex Error: {e.code}, cannot access: {e.url}")
+ print(f"Error: {e.code}, cannot access: {e.url}")
continue
- except InvalidURL as e:
- print(f"Invalid URL: {line} \n Skipping...")
+ except InvalidURL:
+ print(f"Invalid URL: {line}, \n Skipping...")
continue
- except IncompleteRead as e:
+ except IncompleteRead:
print(f"IncompleteRead on {line}")
continue
except IOError as err:
@@ -114,7 +119,7 @@ def cinex(input_file, out_path, yara=None):
file.close()
-def intermex(input_file, yara):
+def input_file_to_terminal(input_file, yara):
""" Input links from file and extract them into terminal.
:param input_file: String: File name of links file.
@@ -140,19 +145,19 @@ def intermex(input_file, yara):
print(f"ERROR: {err}\n## Not valid file. File tried: " + input_file)
-def outex(website, output_file, out_path, yara):
+def url_to_folder(website, output_file, output_path, yara):
""" Scrapes the contents of the provided web address and outputs the
contents to file.
:param website: String: Url of web address to scrape.
:param output_file: String: Filename of the results.
- :param out_path: String: Folder name of the output findings.
+ :param output_path: String: Folder name of the output findings.
:param yara: Integer: Keyword search argument.
:return: None
"""
# Extract page to file
try:
- output_file = out_path + "/" + output_file
+ output_file = output_path + "/" + output_file
content = urllib.request.urlopen(website).read()
if yara is not None:
@@ -170,7 +175,7 @@ def outex(website, output_file, out_path, yara):
print(f"Error: {err}\n Can't write on file: {output_file}")
-def termex(website, yara):
+def url_to_terminal(website, yara):
""" Scrapes provided web address and prints the results to the terminal.
:param website: String: URL of website to scrape.
@@ -193,29 +198,28 @@ def termex(website, yara):
return
-def extractor(website, crawl, output_file, input_file, out_path, selection_yara):
+def extractor(website, crawl, output_file, input_file, output_path, selection_yara):
""" Extractor - scrapes the resulting website or discovered links.
:param website: String: URL of website to scrape.
- :param crawl: Boolean: Cinex trigger.
+ :param crawl: Boolean: input_file_to_folder trigger.
If used iteratively scrape the urls from input_file.
:param output_file: String: Filename of resulting output from scrape.
:param input_file: String: Filename of crawled/discovered URLs
- :param out_path: String: Dir path for output files.
+ :param output_path: String: Dir path for output files.
:param selection_yara: String: Selected option of HTML or Text.
:return: None
"""
- # TODO: Return output to torcrawl.py
if len(input_file) > 0:
if crawl:
- cinex(input_file, out_path, selection_yara)
+ input_file_to_folder(input_file, output_path, selection_yara)
# TODO: Extract from list into a folder
# elif len(output_file) > 0:
- # inoutex(website, input_ile, output_file)
+ # input_list_to_folder(website, input_ile, output_file)
else:
- intermex(input_file, selection_yara)
+ input_file_to_terminal(input_file, selection_yara)
else:
if len(output_file) > 0:
- outex(website, output_file, out_path, selection_yara)
+ url_to_folder(website, output_file, output_path, selection_yara)
else:
- termex(website, selection_yara)
+ url_to_terminal(website, selection_yara)
diff --git a/modules/tests/__init__.py b/modules/tests/__init__.py
index c976c13..e69de29 100644
--- a/modules/tests/__init__.py
+++ b/modules/tests/__init__.py
@@ -1,3 +0,0 @@
-from modules import checker
-from modules import crawler
-from modules import extractor
diff --git a/modules/tests/test_checker.py b/modules/tests/test_checker.py
index b3c911b..35c0b96 100644
--- a/modules/tests/test_checker.py
+++ b/modules/tests/test_checker.py
@@ -22,7 +22,7 @@ def test_url_canon_001(self):
Returns true if the function successfully performs URL normalisation.
"""
url = 'torcrawl.com'
- expected = 'http://www.torcrawl.com'
+ expected = 'https://torcrawl.com'
result = url_canon(url, False)
self.assertEqual(expected, result,
f'Test Fail:: expected = {expected}, got {result}')
@@ -32,7 +32,7 @@ def test_url_canon_002(self):
Returns true if the function successfully performs URL normalisation.
"""
url = 'www.torcrawl.com'
- expected = 'http://www.torcrawl.com'
+ expected = 'https://www.torcrawl.com'
result = url_canon(url, False)
self.assertEqual(expected, result,
f'Test Fail:: expected = {expected}, got {result}')
diff --git a/modules/tests/test_crawler.py b/modules/tests/test_crawler.py
index 6dbb3d2..1beb771 100644
--- a/modules/tests/test_crawler.py
+++ b/modules/tests/test_crawler.py
@@ -20,8 +20,7 @@ def tearDown(self):
def test_excludes(self):
""" Test crawler.excludes function.
- Return True if the function successfully excludes the the provided
- failing links.
+ Return True if the function successfully excludes the provided failing links.
"""
_uri = 'http://www.torcrawl.com'
failing_links = ['#', 'tel:012-013-104-5',
@@ -38,7 +37,7 @@ def test_canonical(self):
Return True if the function successfully normalizes the provided
failing links.
"""
- _uri = 'http://www.torcrawl.com/'
+ _uri = 'https://torcrawl.com/'
links = [[f'{_uri}sundance', f'{_uri}sundance'],
['/sundance', f'{_uri}sundance'],
[f'{_uri}bob.html', f'{_uri}bob.html'],
diff --git a/output/.gitkeep b/output/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/torcrawl.py b/torcrawl.py
index dba3a23..13e2c39 100755
--- a/torcrawl.py
+++ b/torcrawl.py
@@ -11,14 +11,14 @@
General:
-h, --help : Help
--v, --verbose : Show more informations about the progress
+-v, --verbose : Show more information about the progress
-u, --url *.onion : URL of Webpage to crawl or extract
-w, --without : Without the use of Relay TOR
Extract:
-e, --extract : Extract page's code to terminal or file.
- (Defualt: terminal)
--i, --input filename : Input file with URL(s) (seperated by line)
+ (Default: terminal)
+-i, --input filename : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara : Yara keyword search page categorisation
read in from /res folder.
@@ -27,7 +27,7 @@
Crawl:
-c, --crawl : Crawl website (Default output on /links.txt)
--d, --cdepth : Set depth of crawl's travel (Default: 1)
+-d, --depth : Set depth of crawler's travel (Default: 1)
-z, --exclusions : Paths that you don't want to include (TODO)
-s, --simultaneous: How many pages to visit at the same time (TODO)
-p, --pause : The length of time the crawler will pause
@@ -123,7 +123,7 @@ def main():
parser.add_argument(
'-i',
'--input',
- help='Input file with URL(s) (seperated by line)'
+ help='Input file with URL(s) (separated by line)'
)
parser.add_argument(
'-o',
@@ -140,12 +140,12 @@ def main():
)
parser.add_argument(
'-d',
- '--cdepth',
+ '--depth',
help='Set depth of crawl\'s travel (Default: 1)'
)
parser.add_argument(
'-p',
- '--cpause',
+ '--pause',
help='The length of time the crawler will pause'
)
parser.add_argument(
@@ -169,25 +169,25 @@ def main():
args = parser.parse_args()
- now = datetime.datetime.now().strftime("%Y%m%d")
+ now = datetime.datetime.now().strftime("%y%m%d")
# Canonicalization of web url and create path for output.
website = ''
- out_path = ''
+ output_folder = ''
if args.input: pass
elif len(args.url) > 0:
website = url_canon(args.url, args.verbose)
if args.folder is not None:
- out_path = folder(args.folder, args.verbose)
+ output_folder = folder(args.folder, args.verbose)
else:
- out_path = folder(extract_domain(website), args.verbose)
+ output_folder = folder(extract_domain(website), args.verbose)
# Parse arguments to variables else initiate variables.
input_file = args.input if args.input else ''
output_file = args.output if args.output else ''
- c_depth = args.cdepth if args.cdepth else 0
- c_pause = args.cpause if args.cpause else 1
+ depth = args.depth if args.depth else 0
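+    # Default pause between requests is now 0 seconds (was 1 second; see v1.32 changelog)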
+ pause = args.pause if args.pause else 0
selection_yara = args.yara if args.yara else None
# Connect to TOR
@@ -200,12 +200,12 @@ def main():
if args.url: print(('## URL: ' + args.url))
if args.crawl:
- crawler = Crawler(website, c_depth, c_pause, out_path, args.log,
+ crawler = Crawler(website, depth, pause, output_folder, args.log,
args.verbose)
lst = crawler.crawl()
- if args.input == None:
- input_file = out_path + '/' + now + '_links.txt'
+ if args.input is None:
+ input_file = output_folder + '/' + now + '_links.txt'
with open(input_file, 'w+', encoding='UTF-8') as file:
for item in lst:
@@ -213,10 +213,10 @@ def main():
print(f"## File created on {os.getcwd()}/{input_file}")
if args.extract:
- extractor(website, args.crawl, output_file, input_file, out_path,
+ extractor(website, args.crawl, output_file, input_file, output_folder,
selection_yara)
else:
- extractor(website, args.crawl, output_file, input_file, out_path,
+ extractor(website, args.crawl, output_file, input_file, output_folder,
selection_yara)