Merge pull request #126 from Security-Tools-Alliance/fix-99-unwanted-subdomains

fix(fetch_url): fix unwanted subdomain and rework fetch_url task
AnonymousWP authored Aug 3, 2024
2 parents 713b6e4 + b26bedf commit a2a9daa
Showing 6 changed files with 150 additions and 87 deletions.
1 change: 1 addition & 0 deletions default_yaml_config.yaml
@@ -100,6 +100,7 @@ fetch_url: {
'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'],
'remove_duplicate_endpoints': true,
'duplicate_fields': ['content_length', 'page_title'],
'follow_redirect': false,
'enable_http_crawl': true,
'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],
'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],
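
For reference, and not part of this commit, a minimal sketch of how the new 'follow_redirect' key is consumed: the fetch_url block is parsed from the engine YAML and the flag is read with a default of false, mirroring the config.get(FOLLOW_REDIRECT, False) call added in tasks.py below. The snippet is illustrative and assumes PyYAML is available.

import yaml

# Hypothetical engine snippet; only the key added in this commit matters here.
engine_yaml = """
fetch_url: {
  'uses_tools': ['katana', 'gau'],
  'follow_redirect': false,
  'enable_http_crawl': true
}
"""

config = yaml.safe_load(engine_yaml)['fetch_url']

# Mirrors config.get(FOLLOW_REDIRECT, False) in tasks.py: a missing key disables redirects.
follow_redirect = config.get('follow_redirect', False)
print(follow_redirect)  # False
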
2 changes: 1 addition & 1 deletion web/fixtures/default_scan_engines.yaml
@@ -23,7 +23,7 @@
5,\r\n 'threads': 30,\r\n 'wordlist_name': 'dicc'\r\n}\r\nfetch_url: {\r\n
\ 'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'],\r\n
\ 'remove_duplicate_endpoints': true,\r\n 'duplicate_fields': ['content_length',
'page_title'],\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic',
'page_title'],\r\n 'follow_redirect': false,\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic',
'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce',
'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],\r\n 'ignore_file_extensions': ['png',
'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],\r\n 'threads': 30\r\n}\r\nvulnerability_scan: {\r\n
7 changes: 7 additions & 0 deletions web/reNgine/common_func.py
@@ -309,19 +309,26 @@ def get_http_urls(

query = EndPoint.objects
if domain:
logger.debug(f'Searching URLs by domain {domain}')
query = query.filter(target_domain=domain)
if scan:
logger.debug(f'Searching URLs by scan {scan}')
query = query.filter(scan_history=scan)
if subdomain_id:
subdomain = Subdomain.objects.filter(pk=subdomain_id).first()
logger.debug(f'Searching URLs by subdomain {subdomain}')
query = query.filter(subdomain__id=subdomain_id)
elif exclude_subdomains and domain:
logger.debug(f'Excluding subdomains')
query = query.filter(http_url=domain.http_url)
if get_only_default_urls:
logger.debug(f'Searching only for default URL')
query = query.filter(is_default=True)

# If is_uncrawled is True, select only endpoints that have not been crawled
# yet (no status)
if is_uncrawled:
logger.debug(f'Searching for uncrawled endpoints only')
query = query.filter(http_status__isnull=True)

# If a path is passed, select only endpoints that contains it
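
A standalone sketch, with made-up data rather than reNgine objects, of the selection rules this hunk adds to get_http_urls: is_uncrawled=True maps to the http_status__isnull=True filter, and exclude_subdomains keeps only the endpoint matching the domain's own URL.

# Plain-dict stand-ins for EndPoint rows; field names mirror the ORM lookups above.
endpoints = [
    {'http_url': 'https://example.com/',          'http_status': 200},   # already crawled
    {'http_url': 'https://example.com/api',       'http_status': None},  # uncrawled
    {'http_url': 'https://blog.example.com/feed', 'http_status': None},  # uncrawled, other subdomain
]

# is_uncrawled=True -> http_status__isnull=True: endpoints with no HTTP status yet.
uncrawled = [e for e in endpoints if e['http_status'] is None]

# exclude_subdomains -> filter(http_url=domain.http_url): only the domain's own URL.
domain_http_url = 'https://example.com/'
own_url_only = [e for e in endpoints if e['http_url'] == domain_http_url]

print(len(uncrawled), len(own_url_only))  # 2 1
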
31 changes: 23 additions & 8 deletions web/reNgine/settings.py
@@ -265,11 +265,6 @@
'level': 'ERROR' if UI_DEBUG else 'CRITICAL',
'propagate': True,
},
'': {
'handlers': ['brief'],
'level': 'DEBUG' if UI_DEBUG else 'INFO',
'propagate': False
},
'celery.app.trace': {
'handlers': ['null'],
'propagate': False,
@@ -291,12 +286,32 @@
'level': 'INFO',
'propagate': False
},
'reNgine.tasks': {
'reNgine': {
'handlers': ['task'],
'level': 'DEBUG' if CELERY_DEBUG else 'INFO',
'propagate': False
}
'propagate': True # Allow log messages to propagate to root logger
},
'kombu.pidbox': {
'handlers': ['null'],
'propagate': False,
},
'celery.pool': {
'handlers': ['null'],
'propagate': False,
},
'celery.bootsteps': {
'handlers': ['null'],
'propagate': False,
},
'celery.utils.functional': {
'handlers': ['null'],
'propagate': False,
},
},
'root': {
'handlers': ['console'],
'level': 'DEBUG' if CELERY_DEBUG else 'INFO',
}
}

# debug
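
A self-contained sketch of the logging pattern introduced here, using only the standard library: noisy kombu and Celery loggers are routed to a null handler, while an application logger keeps its own handler and still propagates to a root console logger. Logger names and levels are illustrative, not the exact reNgine settings.

import logging
import logging.config

LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'handlers': {
        'console': {'class': 'logging.StreamHandler'},
        'null': {'class': 'logging.NullHandler'},
    },
    'loggers': {
        # Chatty internals go to the null handler and never reach the root logger.
        'kombu.pidbox': {'handlers': ['null'], 'propagate': False},
        # The application logger has its own handler but still propagates to root,
        # as the reNgine logger above now does.
        'myapp': {'handlers': ['console'], 'level': 'DEBUG', 'propagate': True},
    },
    'root': {'handlers': ['console'], 'level': 'INFO'},
}

logging.config.dictConfig(LOGGING)
logging.getLogger('myapp').debug('handled by the app logger, then again by root')
logging.getLogger('kombu.pidbox').info('silenced by the null handler')
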
195 changes: 117 additions & 78 deletions web/reNgine/tasks.py
@@ -1771,11 +1771,12 @@ def fetch_url(self, urls=[], ctx={}, description=None):
threads = config.get(THREADS) or self.yaml_configuration.get(THREADS, DEFAULT_THREADS)
domain_request_headers = self.domain.request_headers if self.domain else None
custom_header = config.get(CUSTOM_HEADER) or self.yaml_configuration.get(CUSTOM_HEADER)
follow_redirect = config.get(FOLLOW_REDIRECT, False) # Get follow redirect setting
if domain_request_headers or custom_header:
custom_header = domain_request_headers or custom_header
exclude_subdomains = config.get(EXCLUDED_SUBDOMAINS, False)

# Get URLs to scan and save to input file
# Initialize the URLs
if urls and is_iterable(urls):
with open(input_path, 'w') as f:
f.write('\n'.join(urls))
@@ -1788,16 +1789,15 @@ def fetch_url(self, urls=[], ctx={}, description=None):
ctx=ctx
)

# Domain regex
host = self.domain.name if self.domain else urlparse(urls[0]).netloc
host_regex = f"\'https?://([a-z0-9]+[.])*{host}.*\'"
# Log initial URLs
logger.debug(f'Initial URLs: {urls}')

# Tools cmds
# Initialize command map for tools
cmd_map = {
'gau': f'gau',
'hakrawler': 'hakrawler -subs -u',
'waybackurls': 'waybackurls',
'gospider': f'gospider --js -d 2 --sitemap --robots -w -r',
'gospider': f'gospider --js -d 2 --sitemap --robots -w -r -a',
'katana': f'katana -silent -jc -kf all -d 3 -fs rdn',
}
if proxy:
@@ -1808,26 +1808,47 @@
if threads > 0:
cmd_map['gau'] += f' --threads {threads}'
cmd_map['gospider'] += f' -t {threads}'
cmd_map['hakrawler'] += f' -t {threads}'
cmd_map['katana'] += f' -c {threads}'
if custom_header:
cmd_map['gospider'] += generate_header_param(custom_header, 'gospider')
cmd_map['hakrawler'] += generate_header_param(custom_header, 'hakrawler')
cmd_map['katana'] += generate_header_param(custom_header, 'common')
cat_input = f'cat {input_path}'
grep_output = f'grep -Eo {host_regex}'
cmd_map = {
tool: f'{cat_input} | {cmd} | {grep_output} > {self.results_dir}/urls_{tool}.txt'
for tool, cmd in cmd_map.items()
}
tasks = group(
run_command.si(
cmd,
shell=True,
scan_id=self.scan_id,
activity_id=self.activity_id)
for tool, cmd in cmd_map.items()
if tool in tools
)

# Add follow_redirect option to tools that support it
if follow_redirect is False:
cmd_map['gospider'] += f' --no-redirect'
cmd_map['hakrawler'] += f' -dr'
cmd_map['katana'] += f' -dr'

tasks = []

# Iterate over each URL and generate commands for each tool
for url in urls:
parsed_url = urlparse(url)
base_domain = parsed_url.netloc.split(':')[0] # Remove port if present
host_regex = f"'https?://{re.escape(base_domain)}(:[0-9]+)?(/.*)?$'"

# Log the generated regex for the current URL
logger.debug(f'Generated regex for domain {base_domain}: {host_regex}')

cat_input = f'echo "{url}"'

# Generate commands for each tool for the current URL
for tool in tools: # Only use tools specified in the config
if tool in cmd_map:
cmd = cmd_map[tool]
tool_cmd = f'{cat_input} | {cmd} | grep -Eo {host_regex} > {self.results_dir}/urls_{tool}_{base_domain}.txt'
tasks.append(run_command.si(
tool_cmd,
shell=True,
scan_id=self.scan_id,
activity_id=self.activity_id)
)
logger.debug(f'Generated command for tool {tool}: {tool_cmd}')

# Group the tasks
task_group = group(tasks)

# Cleanup task
sort_output = [
@@ -1852,41 +1873,51 @@ def fetch_url(self, urls=[], ctx={}, description=None):
)

# Run all commands
task = chord(tasks)(cleanup)
task = chord(task_group)(cleanup)
with allow_join_result():
task.get()

# Store all the endpoints and run httpx
with open(self.output_path) as f:
discovered_urls = f.readlines()
self.notify(fields={'Discovered URLs': len(discovered_urls)})

# Some tools can have an URL in the format <URL>] - <PATH> or <URL> - <PATH>, add them
# to the final URL list
all_urls = []
for url in discovered_urls:
url = url.strip()
urlpath = None
base_url = None
if '] ' in url: # found JS scraped endpoint e.g from gospider
split = tuple(url.split('] '))
if not len(split) == 2:
logger.warning(f'URL format not recognized for "{url}". Skipping.')
continue
base_url, urlpath = split
urlpath = urlpath.lstrip('- ')
elif ' - ' in url: # found JS scraped endpoint e.g from gospider
base_url, urlpath = tuple(url.split(' - '))

if base_url and urlpath:
subdomain = urlparse(base_url)
url = f'{subdomain.scheme}://{subdomain.netloc}{self.url_filter}'

if not validators.url(url):
logger.warning(f'Invalid URL "{url}". Skipping.')

if url not in all_urls:
all_urls.append(url)
tool_mapping = {} # New dictionary to map URLs to tools
for tool in tools:
for url in urls:
parsed_url = urlparse(url)
base_domain = parsed_url.netloc.split(':')[0] # Remove port if present
tool_output_file = f'{self.results_dir}/urls_{tool}_{base_domain}.txt'
if os.path.exists(tool_output_file):
with open(tool_output_file, 'r') as f:
discovered_urls = f.readlines()
for url in discovered_urls:
url = url.strip()
urlpath = None
base_url = None
if '] ' in url: # found JS scraped endpoint e.g from gospider
split = tuple(url.split('] '))
if not len(split) == 2:
logger.warning(f'URL format not recognized for "{url}". Skipping.')
continue
base_url, urlpath = split
urlpath = urlpath.lstrip('- ')
elif ' - ' in url: # found JS scraped endpoint e.g from gospider
base_url, urlpath = tuple(url.split(' - '))

if base_url and urlpath:
subdomain = urlparse(base_url)
url = f'{subdomain.scheme}://{subdomain.netloc}{urlpath}'

if not validators.url(url):
logger.warning(f'Invalid URL "{url}". Skipping.')
continue

if url not in tool_mapping:
tool_mapping[url] = set()
tool_mapping[url].add(tool) # Use a set to ensure uniqueness

all_urls = list(tool_mapping.keys())
for url, found_tools in tool_mapping.items():
unique_tools = ', '.join(found_tools)
logger.info(f'URL {url} found by tools: {unique_tools}')

# Filter out URLs if a path filter was passed
if self.url_filter:
Expand All @@ -1907,7 +1938,6 @@ def fetch_url(self, urls=[], ctx={}, description=None):
duplicate_removal_fields=duplicate_removal_fields
)


#-------------------#
# GF PATTERNS MATCH #
#-------------------#
@@ -1965,11 +1995,12 @@ def fetch_url(self, urls=[], ctx={}, description=None):
earlier_pattern = endpoint.matched_gf_patterns
pattern = f'{earlier_pattern},{gf_pattern}' if earlier_pattern else gf_pattern
endpoint.matched_gf_patterns = pattern
# TODO Add tool that found the URL to the db (need to update db model)
# endpoint.found_by_tools = ','.join(tool_mapping.get(url, [])) # Save tools in the endpoint
endpoint.save()

return all_urls


def parse_curl_output(response):
# TODO: Enrich from other cURL fields.
CURL_REGEX_HTTP_STATUS = f'HTTP\/(?:(?:\d\.?)+)\s(\d+)\s(?:\w+)'
Expand All @@ -1985,7 +2016,6 @@ def parse_curl_output(response):
'http_status': http_status,
}


@app.task(name='vulnerability_scan', queue='main_scan_queue', bind=True, base=RengineTask)
def vulnerability_scan(self, urls=[], ctx={}, description=None):
"""
@@ -4052,7 +4082,8 @@ def remove_duplicate_endpoints(
domain_id,
subdomain_id=None,
filter_ids=[],
filter_status=[200, 301, 404],
# TODO Check if the status code could be set as parameters of the scan engine instead of hardcoded values
filter_status=[200, 301, 302, 303, 307, 404, 410], # Extended status codes
duplicate_removal_fields=ENDPOINT_SCAN_DEFAULT_DUPLICATE_FIELDS
):
"""Remove duplicate endpoints.
@@ -4071,6 +4102,8 @@
duplicate_removal_fields (list): List of Endpoint model fields to check for duplicates
"""
logger.info(f'Removing duplicate endpoints based on {duplicate_removal_fields}')

# Filter endpoints based on scan history and domain
endpoints = (
EndPoint.objects
.filter(scan_history__id=scan_history_id)
@@ -4085,29 +4118,35 @@
if filter_ids:
endpoints = endpoints.filter(id__in=filter_ids)

for field_name in duplicate_removal_fields:
cl_query = (
endpoints
.values_list(field_name)
.annotate(mc=Count(field_name))
.order_by('-mc')
)
for (field_value, count) in cl_query:
if count > DELETE_DUPLICATES_THRESHOLD:
eps_to_delete = (
endpoints
.filter(**{field_name: field_value})
.order_by('discovered_date')
.all()[1:]
)
msg = f'Deleting {len(eps_to_delete)} endpoints [reason: same {field_name} {field_value}]'
for ep in eps_to_delete:
url = urlparse(ep.http_url)
if url.path in ['', '/', '/login']: # try do not delete the original page that other pages redirect to
continue
msg += f'\n\t {ep.http_url} [{ep.http_status}] [{field_name}={field_value}]'
ep.delete()
logger.warning(msg)
# Group by all duplicate removal fields combined
fields_combined = duplicate_removal_fields[:]
fields_combined.append('id') # Add ID to ensure unique identification

cl_query = (
endpoints
.values(*duplicate_removal_fields)
.annotate(mc=Count('id'))
.order_by('-mc')
)

for field_values in cl_query:
if field_values['mc'] > DELETE_DUPLICATES_THRESHOLD:
filter_criteria = {field: field_values[field] for field in duplicate_removal_fields}
eps_to_delete = (
endpoints
.filter(**filter_criteria)
.order_by('discovered_date')
.all()[1:]
)
msg = f'Deleting {len(eps_to_delete)} endpoints [reason: same {filter_criteria}]'
for ep in eps_to_delete:
url = urlparse(ep.http_url)
if url.path in ['', '/', '/login']: # Ensure not to delete the original page that other pages redirect to
continue
msg += f'\n\t {ep.http_url} [{ep.http_status}] {filter_criteria}'
ep.delete()
logger.warning(msg)


@app.task(name='run_command', bind=False, queue='run_command_queue')
def run_command(
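
The heart of the fix is the per-URL host regex built in the reworked fetch_url. A standalone sketch, with hypothetical hosts and Python re in place of the grep -Eo pipeline, of what that pattern keeps and drops: the host is escaped and the expression is anchored at the end, so sibling subdomains and lookalike domains no longer pass the filter that feeds the per-tool output files.

import re

# Hypothetical target host and crawler output; the pattern mirrors the one built
# per URL in fetch_url (re.escape(base_domain) plus optional port and path).
base_domain = 'app.example.com'
host_regex = rf"https?://{re.escape(base_domain)}(:[0-9]+)?(/.*)?$"

candidates = [
    'https://app.example.com/login',        # kept: exact host
    'https://app.example.com:8443/api',     # kept: same host, explicit port
    'https://other.example.com/admin',      # dropped: sibling subdomain
    'https://app.example.com.evil.tld/x',   # dropped: lookalike host
]

kept = [url for url in candidates if re.search(host_regex, url)]
print(kept)  # only the first two survive

The old pattern, 'https?://([a-z0-9]+[.])*{host}.*', accepted any subdomain prefix and left the dots unescaped, which is how endpoints from unwanted subdomains ended up in the results.
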
1 change: 1 addition & 0 deletions web/scanEngine/templates/scanEngine/add_engine.html
@@ -148,6 +148,7 @@ <h4 class="header-title">Scan Engines</h4>
'content_length',
'page_title'
],
'follow_redirect': false,
'enable_http_crawl': true,
'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],
'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],
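
Finally, a standalone sketch with made-up data of the new grouping in remove_duplicate_endpoints from the tasks.py section above: endpoints are grouped on all duplicate_removal_fields combined, and once a group exceeds DELETE_DUPLICATES_THRESHOLD everything but the oldest entry is dropped (the real task additionally spares '', '/' and '/login' paths and operates on the ORM rather than dicts).

from collections import defaultdict

DELETE_DUPLICATES_THRESHOLD = 2
duplicate_removal_fields = ('content_length', 'page_title')

# Plain-dict stand-ins for EndPoint rows.
endpoints = [
    {'http_url': 'https://example.com/a', 'content_length': 123, 'page_title': 'Home', 'discovered_date': 1},
    {'http_url': 'https://example.com/b', 'content_length': 123, 'page_title': 'Home', 'discovered_date': 2},
    {'http_url': 'https://example.com/c', 'content_length': 123, 'page_title': 'Home', 'discovered_date': 3},
    {'http_url': 'https://example.com/d', 'content_length': 999, 'page_title': 'Docs', 'discovered_date': 4},
]

# Group by the combined duplicate-removal fields, like values(*fields).annotate(Count('id')).
groups = defaultdict(list)
for ep in endpoints:
    key = tuple(ep[f] for f in duplicate_removal_fields)
    groups[key].append(ep)

to_delete = []
for key, eps in groups.items():
    if len(eps) > DELETE_DUPLICATES_THRESHOLD:
        eps.sort(key=lambda e: e['discovered_date'])
        to_delete.extend(eps[1:])  # keep only the earliest endpoint of the group

print([ep['http_url'] for ep in to_delete])  # ['https://example.com/b', 'https://example.com/c']
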
