Merge pull request #126 from Security-Tools-Alliance/fix-99-unwanted-subdomains

fix(fetch_url): fix unwanted subdomain and rework fetch_url task
AnonymousWP authored Aug 3, 2024
2 parents 713b6e4 + b26bedf commit a2a9daa
Showing 6 changed files with 150 additions and 87 deletions.
1 change: 1 addition & 0 deletions default_yaml_config.yaml
@@ -100,6 +100,7 @@ fetch_url: {
'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'],
'remove_duplicate_endpoints': true,
'duplicate_fields': ['content_length', 'page_title'],
'follow_redirect': false,
'enable_http_crawl': true,
'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],
'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],
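
For reference, and not part of this commit, a minimal sketch of how the new 'follow_redirect' key is consumed: the fetch_url block is parsed from the engine YAML and the flag is read with a default of false, mirroring the config.get(FOLLOW_REDIRECT, False) call added in tasks.py below. The snippet is illustrative and assumes PyYAML is available.

import yaml

# Hypothetical engine snippet; only the key added in this commit matters here.
engine_yaml = """
fetch_url: {
  'uses_tools': ['katana', 'gau'],
  'follow_redirect': false,
  'enable_http_crawl': true
}
"""

config = yaml.safe_load(engine_yaml)['fetch_url']

# Mirrors config.get(FOLLOW_REDIRECT, False) in tasks.py: a missing key disables redirects.
follow_redirect = config.get('follow_redirect', False)
print(follow_redirect)  # False
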
2 changes: 1 addition & 1 deletion web/fixtures/default_scan_engines.yaml
@@ -23,7 +23,7 @@
5,\r\n 'threads': 30,\r\n 'wordlist_name': 'dicc'\r\n}\r\nfetch_url: {\r\n
\ 'uses_tools': ['gospider', 'hakrawler', 'waybackurls', 'katana', 'gau'],\r\n
\ 'remove_duplicate_endpoints': true,\r\n 'duplicate_fields': ['content_length',
'page_title'],\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic',
'page_title'],\r\n 'follow_redirect': false,\r\n 'enable_http_crawl': true,\r\n 'gf_patterns': ['debug_logic',
'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce',
'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],\r\n 'ignore_file_extensions': ['png',
'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],\r\n 'threads': 30\r\n}\r\nvulnerability_scan: {\r\n
7 changes: 7 additions & 0 deletions web/reNgine/common_func.py
@@ -309,19 +309,26 @@ def get_http_urls(

query = EndPoint.objects
if domain:
logger.debug(f'Searching URLs by domain {domain}')
query = query.filter(target_domain=domain)
if scan:
logger.debug(f'Searching URLs by scan {scan}')
query = query.filter(scan_history=scan)
if subdomain_id:
subdomain = Subdomain.objects.filter(pk=subdomain_id).first()
logger.debug(f'Searching URLs by subdomain {subdomain}')
query = query.filter(subdomain__id=subdomain_id)
elif exclude_subdomains and domain:
logger.debug(f'Excluding subdomains')
query = query.filter(http_url=domain.http_url)
if get_only_default_urls:
logger.debug(f'Searching only for default URL')
query = query.filter(is_default=True)

# If is_uncrawled is True, select only endpoints that have not been crawled
# yet (no status)
if is_uncrawled:
logger.debug(f'Searching for uncrawled endpoints only')
query = query.filter(http_status__isnull=True)

# If a path is passed, select only endpoints that contains it
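
A standalone sketch, with made-up data rather than reNgine objects, of the selection rules this hunk adds to get_http_urls: is_uncrawled=True maps to the http_status__isnull=True filter, and exclude_subdomains keeps only the endpoint matching the domain's own URL.

# Plain-dict stand-ins for EndPoint rows; field names mirror the ORM lookups above.
endpoints = [
    {'http_url': 'https://example.com/',          'http_status': 200},   # already crawled
    {'http_url': 'https://example.com/api',       'http_status': None},  # uncrawled
    {'http_url': 'https://blog.example.com/feed', 'http_status': None},  # uncrawled, other subdomain
]

# is_uncrawled=True -> http_status__isnull=True: endpoints with no HTTP status yet.
uncrawled = [e for e in endpoints if e['http_status'] is None]

# exclude_subdomains -> filter(http_url=domain.http_url): only the domain's own URL.
domain_http_url = 'https://example.com/'
own_url_only = [e for e in endpoints if e['http_url'] == domain_http_url]

print(len(uncrawled), len(own_url_only))  # 2 1
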
31 changes: 23 additions & 8 deletions web/reNgine/settings.py
@@ -265,11 +265,6 @@
'level': 'ERROR' if UI_DEBUG else 'CRITICAL',
'propagate': True,
},
'': {
'handlers': ['brief'],
'level': 'DEBUG' if UI_DEBUG else 'INFO',
'propagate': False
},
'celery.app.trace': {
'handlers': ['null'],
'propagate': False,
@@ -291,12 +286,32 @@
'level': 'INFO',
'propagate': False
},
'reNgine.tasks': {
'reNgine': {
'handlers': ['task'],
'level': 'DEBUG' if CELERY_DEBUG else 'INFO',
'propagate': False
}
'propagate': True # Allow log messages to propagate to root logger
},
'kombu.pidbox': {
'handlers': ['null'],
'propagate': False,
},
'celery.pool': {
'handlers': ['null'],
'propagate': False,
},
'celery.bootsteps': {
'handlers': ['null'],
'propagate': False,
},
'celery.utils.functional': {
'handlers': ['null'],
'propagate': False,
},
},
'root': {
'handlers': ['console'],
'level': 'DEBUG' if CELERY_DEBUG else 'INFO',
}
}

# debug
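
A self-contained sketch of the logging pattern introduced here, using only the standard library: noisy kombu and Celery loggers are routed to a null handler, while an application logger keeps its own handler and still propagates to a root console logger. Logger names and levels are illustrative, not the exact reNgine settings.

import logging
import logging.config

LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'handlers': {
        'console': {'class': 'logging.StreamHandler'},
        'null': {'class': 'logging.NullHandler'},
    },
    'loggers': {
        # Chatty internals go to the null handler and never reach the root logger.
        'kombu.pidbox': {'handlers': ['null'], 'propagate': False},
        # The application logger has its own handler but still propagates to root,
        # as the reNgine logger above now does.
        'myapp': {'handlers': ['console'], 'level': 'DEBUG', 'propagate': True},
    },
    'root': {'handlers': ['console'], 'level': 'INFO'},
}

logging.config.dictConfig(LOGGING)
logging.getLogger('myapp').debug('handled by the app logger, then again by root')
logging.getLogger('kombu.pidbox').info('silenced by the null handler')
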
195 changes: 117 additions & 78 deletions web/reNgine/tasks.py
@@ -1771,11 +1771,12 @@ def fetch_url(self, urls=[], ctx={}, description=None):
threads = config.get(THREADS) or self.yaml_configuration.get(THREADS, DEFAULT_THREADS)
domain_request_headers = self.domain.request_headers if self.domain else None
custom_header = config.get(CUSTOM_HEADER) or self.yaml_configuration.get(CUSTOM_HEADER)
follow_redirect = config.get(FOLLOW_REDIRECT, False) # Get follow redirect setting
if domain_request_headers or custom_header:
custom_header = domain_request_headers or custom_header
exclude_subdomains = config.get(EXCLUDED_SUBDOMAINS, False)

# Get URLs to scan and save to input file
# Initialize the URLs
if urls and is_iterable(urls):
with open(input_path, 'w') as f:
f.write('\n'.join(urls))
@@ -1788,16 +1789,15 @@ def fetch_url(self, urls=[], ctx={}, description=None):
ctx=ctx
)

# Domain regex
host = self.domain.name if self.domain else urlparse(urls[0]).netloc
host_regex = f"\'https?://([a-z0-9]+[.])*{host}.*\'"
# Log initial URLs
logger.debug(f'Initial URLs: {urls}')

# Tools cmds
# Initialize command map for tools
cmd_map = {
'gau': f'gau',
'hakrawler': 'hakrawler -subs -u',
'waybackurls': 'waybackurls',
'gospider': f'gospider --js -d 2 --sitemap --robots -w -r',
'gospider': f'gospider --js -d 2 --sitemap --robots -w -r -a',
'katana': f'katana -silent -jc -kf all -d 3 -fs rdn',
}
if proxy:
@@ -1808,26 +1808,47 @@
if threads > 0:
cmd_map['gau'] += f' --threads {threads}'
cmd_map['gospider'] += f' -t {threads}'
cmd_map['hakrawler'] += f' -t {threads}'
cmd_map['katana'] += f' -c {threads}'
if custom_header:
cmd_map['gospider'] += generate_header_param(custom_header, 'gospider')
cmd_map['hakrawler'] += generate_header_param(custom_header, 'hakrawler')
cmd_map['katana'] += generate_header_param(custom_header, 'common')
cat_input = f'cat {input_path}'
grep_output = f'grep -Eo {host_regex}'
cmd_map = {
tool: f'{cat_input} | {cmd} | {grep_output} > {self.results_dir}/urls_{tool}.txt'
for tool, cmd in cmd_map.items()
}
tasks = group(
run_command.si(
cmd,
shell=True,
scan_id=self.scan_id,
activity_id=self.activity_id)
for tool, cmd in cmd_map.items()
if tool in tools
)

# Add follow_redirect option to tools that support it
if follow_redirect is False:
cmd_map['gospider'] += f' --no-redirect'
cmd_map['hakrawler'] += f' -dr'
cmd_map['katana'] += f' -dr'

tasks = []

# Iterate over each URL and generate commands for each tool
for url in urls:
parsed_url = urlparse(url)
base_domain = parsed_url.netloc.split(':')[0] # Remove port if present
host_regex = f"'https?://{re.escape(base_domain)}(:[0-9]+)?(/.*)?$'"

# Log the generated regex for the current URL
logger.debug(f'Generated regex for domain {base_domain}: {host_regex}')

cat_input = f'echo "{url}"'

# Generate commands for each tool for the current URL
for tool in tools: # Only use tools specified in the config
if tool in cmd_map:
cmd = cmd_map[tool]
tool_cmd = f'{cat_input} | {cmd} | grep -Eo {host_regex} > {self.results_dir}/urls_{tool}_{base_domain}.txt'
tasks.append(run_command.si(
tool_cmd,
shell=True,
scan_id=self.scan_id,
activity_id=self.activity_id)
)
logger.debug(f'Generated command for tool {tool}: {tool_cmd}')

# Group the tasks
task_group = group(tasks)

# Cleanup task
sort_output = [
@@ -1852,41 +1873,51 @@ def fetch_url(self, urls=[], ctx={}, description=None):
)

# Run all commands
task = chord(tasks)(cleanup)
task = chord(task_group)(cleanup)
with allow_join_result():
task.get()

# Store all the endpoints and run httpx
with open(self.output_path) as f:
discovered_urls = f.readlines()
self.notify(fields={'Discovered URLs': len(discovered_urls)})

# Some tools can have an URL in the format <URL>] - <PATH> or <URL> - <PATH>, add them
# to the final URL list
all_urls = []
for url in discovered_urls:
url = url.strip()
urlpath = None
base_url = None
if '] ' in url: # found JS scraped endpoint e.g from gospider
split = tuple(url.split('] '))
if not len(split) == 2:
logger.warning(f'URL format not recognized for "{url}". Skipping.')
continue
base_url, urlpath = split
urlpath = urlpath.lstrip('- ')
elif ' - ' in url: # found JS scraped endpoint e.g from gospider
base_url, urlpath = tuple(url.split(' - '))

if base_url and urlpath:
subdomain = urlparse(base_url)
url = f'{subdomain.scheme}://{subdomain.netloc}{self.url_filter}'

if not validators.url(url):
logger.warning(f'Invalid URL "{url}". Skipping.')

if url not in all_urls:
all_urls.append(url)
tool_mapping = {} # New dictionary to map URLs to tools
for tool in tools:
for url in urls:
parsed_url = urlparse(url)
base_domain = parsed_url.netloc.split(':')[0] # Remove port if present
tool_output_file = f'{self.results_dir}/urls_{tool}_{base_domain}.txt'
if os.path.exists(tool_output_file):
with open(tool_output_file, 'r') as f:
discovered_urls = f.readlines()
for url in discovered_urls:
url = url.strip()
urlpath = None
base_url = None
if '] ' in url: # found JS scraped endpoint e.g from gospider
split = tuple(url.split('] '))
if not len(split) == 2:
logger.warning(f'URL format not recognized for "{url}". Skipping.')
continue
base_url, urlpath = split
urlpath = urlpath.lstrip('- ')
elif ' - ' in url: # found JS scraped endpoint e.g from gospider
base_url, urlpath = tuple(url.split(' - '))

if base_url and urlpath:
subdomain = urlparse(base_url)
url = f'{subdomain.scheme}://{subdomain.netloc}{urlpath}'

if not validators.url(url):
logger.warning(f'Invalid URL "{url}". Skipping.')
continue

if url not in tool_mapping:
tool_mapping[url] = set()
tool_mapping[url].add(tool) # Use a set to ensure uniqueness

all_urls = list(tool_mapping.keys())
for url, found_tools in tool_mapping.items():
unique_tools = ', '.join(found_tools)
logger.info(f'URL {url} found by tools: {unique_tools}')

# Filter out URLs if a path filter was passed
if self.url_filter:
Expand All @@ -1907,7 +1938,6 @@ def fetch_url(self, urls=[], ctx={}, description=None):
duplicate_removal_fields=duplicate_removal_fields
)


#-------------------#
# GF PATTERNS MATCH #
#-------------------#
@@ -1965,11 +1995,12 @@ def fetch_url(self, urls=[], ctx={}, description=None):
earlier_pattern = endpoint.matched_gf_patterns
pattern = f'{earlier_pattern},{gf_pattern}' if earlier_pattern else gf_pattern
endpoint.matched_gf_patterns = pattern
# TODO Add tool that found the URL to the db (need to update db model)
# endpoint.found_by_tools = ','.join(tool_mapping.get(url, [])) # Save tools in the endpoint
endpoint.save()

return all_urls


def parse_curl_output(response):
# TODO: Enrich from other cURL fields.
CURL_REGEX_HTTP_STATUS = f'HTTP\/(?:(?:\d\.?)+)\s(\d+)\s(?:\w+)'
Expand All @@ -1985,7 +2016,6 @@ def parse_curl_output(response):
'http_status': http_status,
}


@app.task(name='vulnerability_scan', queue='main_scan_queue', bind=True, base=RengineTask)
def vulnerability_scan(self, urls=[], ctx={}, description=None):
"""
@@ -4052,7 +4082,8 @@ def remove_duplicate_endpoints(
domain_id,
subdomain_id=None,
filter_ids=[],
filter_status=[200, 301, 404],
# TODO Check if the status code could be set as parameters of the scan engine instead of hardcoded values
filter_status=[200, 301, 302, 303, 307, 404, 410], # Extended status codes
duplicate_removal_fields=ENDPOINT_SCAN_DEFAULT_DUPLICATE_FIELDS
):
"""Remove duplicate endpoints.
@@ -4071,6 +4102,8 @@
duplicate_removal_fields (list): List of Endpoint model fields to check for duplicates
"""
logger.info(f'Removing duplicate endpoints based on {duplicate_removal_fields}')

# Filter endpoints based on scan history and domain
endpoints = (
EndPoint.objects
.filter(scan_history__id=scan_history_id)
@@ -4085,29 +4118,35 @@
if filter_ids:
endpoints = endpoints.filter(id__in=filter_ids)

for field_name in duplicate_removal_fields:
cl_query = (
endpoints
.values_list(field_name)
.annotate(mc=Count(field_name))
.order_by('-mc')
)
for (field_value, count) in cl_query:
if count > DELETE_DUPLICATES_THRESHOLD:
eps_to_delete = (
endpoints
.filter(**{field_name: field_value})
.order_by('discovered_date')
.all()[1:]
)
msg = f'Deleting {len(eps_to_delete)} endpoints [reason: same {field_name} {field_value}]'
for ep in eps_to_delete:
url = urlparse(ep.http_url)
if url.path in ['', '/', '/login']: # try do not delete the original page that other pages redirect to
continue
msg += f'\n\t {ep.http_url} [{ep.http_status}] [{field_name}={field_value}]'
ep.delete()
logger.warning(msg)
# Group by all duplicate removal fields combined
fields_combined = duplicate_removal_fields[:]
fields_combined.append('id') # Add ID to ensure unique identification

cl_query = (
endpoints
.values(*duplicate_removal_fields)
.annotate(mc=Count('id'))
.order_by('-mc')
)

for field_values in cl_query:
if field_values['mc'] > DELETE_DUPLICATES_THRESHOLD:
filter_criteria = {field: field_values[field] for field in duplicate_removal_fields}
eps_to_delete = (
endpoints
.filter(**filter_criteria)
.order_by('discovered_date')
.all()[1:]
)
msg = f'Deleting {len(eps_to_delete)} endpoints [reason: same {filter_criteria}]'
for ep in eps_to_delete:
url = urlparse(ep.http_url)
if url.path in ['', '/', '/login']: # Ensure not to delete the original page that other pages redirect to
continue
msg += f'\n\t {ep.http_url} [{ep.http_status}] {filter_criteria}'
ep.delete()
logger.warning(msg)


@app.task(name='run_command', bind=False, queue='run_command_queue')
def run_command(
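
The heart of the fix is the per-URL host regex built in the reworked fetch_url. A standalone sketch, with hypothetical hosts and Python re in place of the grep -Eo pipeline, of what that pattern keeps and drops: the host is escaped and the expression is anchored at the end, so sibling subdomains and lookalike domains no longer pass the filter that feeds the per-tool output files.

import re

# Hypothetical target host and crawler output; the pattern mirrors the one built
# per URL in fetch_url (re.escape(base_domain) plus optional port and path).
base_domain = 'app.example.com'
host_regex = rf"https?://{re.escape(base_domain)}(:[0-9]+)?(/.*)?$"

candidates = [
    'https://app.example.com/login',        # kept: exact host
    'https://app.example.com:8443/api',     # kept: same host, explicit port
    'https://other.example.com/admin',      # dropped: sibling subdomain
    'https://app.example.com.evil.tld/x',   # dropped: lookalike host
]

kept = [url for url in candidates if re.search(host_regex, url)]
print(kept)  # only the first two survive

The old pattern, 'https?://([a-z0-9]+[.])*{host}.*', accepted any subdomain prefix and left the dots unescaped, which is how endpoints from unwanted subdomains ended up in the results.
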
1 change: 1 addition & 0 deletions web/scanEngine/templates/scanEngine/add_engine.html
@@ -148,6 +148,7 @@ <h4 class="header-title">Scan Engines</h4>
'content_length',
'page_title'
],
'follow_redirect': false,
'enable_http_crawl': true,
'gf_patterns': ['debug_logic', 'idor', 'interestingEXT', 'interestingparams', 'interestingsubs', 'lfi', 'rce', 'redirect', 'sqli', 'ssrf', 'ssti', 'xss'],
'ignore_file_extensions': ['png', 'jpg', 'jpeg', 'gif', 'mp4', 'mpeg', 'mp3'],
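
Finally, a standalone sketch with made-up data of the new grouping in remove_duplicate_endpoints from the tasks.py section above: endpoints are grouped on all duplicate_removal_fields combined, and once a group exceeds DELETE_DUPLICATES_THRESHOLD everything but the oldest entry is dropped (the real task additionally spares '', '/' and '/login' paths and operates on the ORM rather than dicts).

from collections import defaultdict

DELETE_DUPLICATES_THRESHOLD = 2
duplicate_removal_fields = ('content_length', 'page_title')

# Plain-dict stand-ins for EndPoint rows.
endpoints = [
    {'http_url': 'https://example.com/a', 'content_length': 123, 'page_title': 'Home', 'discovered_date': 1},
    {'http_url': 'https://example.com/b', 'content_length': 123, 'page_title': 'Home', 'discovered_date': 2},
    {'http_url': 'https://example.com/c', 'content_length': 123, 'page_title': 'Home', 'discovered_date': 3},
    {'http_url': 'https://example.com/d', 'content_length': 999, 'page_title': 'Docs', 'discovered_date': 4},
]

# Group by the combined duplicate-removal fields, like values(*fields).annotate(Count('id')).
groups = defaultdict(list)
for ep in endpoints:
    key = tuple(ep[f] for f in duplicate_removal_fields)
    groups[key].append(ep)

to_delete = []
for key, eps in groups.items():
    if len(eps) > DELETE_DUPLICATES_THRESHOLD:
        eps.sort(key=lambda e: e['discovered_date'])
        to_delete.extend(eps[1:])  # keep only the earliest endpoint of the group

print([ep['http_url'] for ep in to_delete])  # ['https://example.com/b', 'https://example.com/c']
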
