From ab2c882cae182cbd321dd17217b9deaaf0cdb5b7 Mon Sep 17 00:00:00 2001 From: lordlabuckdas <55460753+lordlabuckdas@users.noreply.github.com> Date: Tue, 13 Jul 2021 23:01:36 +0530 Subject: [PATCH 1/2] feature: redirects --- bin/clone | 4 +- bin/snare | 6 ++- snare/cloner.py | 88 ++++++++++++++++++++++-------------- snare/server.py | 2 + snare/utils/snare_helpers.py | 4 +- 5 files changed, 65 insertions(+), 39 deletions(-) diff --git a/bin/clone b/bin/clone index 41530de4..1f4263b7 100644 --- a/bin/clone +++ b/bin/clone @@ -66,13 +66,15 @@ def main(): loop.run_until_complete(cloner.run()) except KeyboardInterrupt: print_color("\nKeyboardInterrupt received... Quitting", "ERROR") + except AssertionError: + print_color("\nConnection closed by the website... Quitting", "ERROR") finally: end = datetime.now() - start loop.run_until_complete(cloner.close()) loop.close() print("") print_color("-" * 36 + ">SUMMARY<" + "-" * 36, "INFO") - print_color("\tTotal number of URLs cloned: {}".format(str(cloner.runner.counter)), "INFO") + print_color("\tTotal number of URLs cloned: {}".format(str(len(cloner.runner.visited_urls))), "INFO") print_color("\tTime elapsed: {}".format(str(end)), "INFO") print_color("\tCloned directory: {}".format(cloner.runner.target_path), "INFO") print_color("-" * 82, "INFO") diff --git a/bin/snare b/bin/snare index 1dd444c9..2aad7d8f 100644 --- a/bin/snare +++ b/bin/snare @@ -190,10 +190,12 @@ if __name__ == "__main__": print_color("Error found in meta.json. Please clone the pages again.", "ERROR") exit() - if not os.path.exists(os.path.join(full_page_path, os.path.join(meta_info[args.index_page]["hash"]))): + index_file_hash = meta_info[meta_info[args.index_page]["redirect"]]["hash"] if meta_info[args.index_page].get("redirect") else meta_info[args.index_page]["hash"] + if not os.path.exists(os.path.join(full_page_path, index_file_hash)): print_color("can't create meta tag", "WARNING") else: - snare_helpers.add_meta_tag(args.page_dir, meta_info[args.index_page]["hash"], config, base_path) + snare_helpers.add_meta_tag(args.page_dir, index_file_hash, config, base_path) + del index_file_hash loop = asyncio.get_event_loop() loop.run_until_complete(check_tanner()) diff --git a/snare/cloner.py b/snare/cloner.py index 71619c7e..667edd9b 100644 --- a/snare/cloner.py +++ b/snare/cloner.py @@ -43,7 +43,6 @@ def __init__(self, root, max_depth, css_validate, default_path="/opt/snare"): self.new_urls = Queue() self.meta = defaultdict(dict) - self.counter = 0 self.itr = 0 @staticmethod @@ -65,6 +64,7 @@ def get_headers(response): "date", "etag", "expires", + "transfer-encoding", "x-cache", ] @@ -96,7 +96,7 @@ async def process_link(self, url, level, check_host=False): or (self.moved_root is not None and host != self.moved_root.host) ): return None - if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth: + if url.with_scheme("http").human_repr() not in self.visited_urls and (level + 1) <= self.max_depth: await self.new_urls.put({"url": url, "level": level + 1, "try_count": 0}) res = None @@ -139,6 +139,7 @@ def _make_filename(self, url): file_name = url.relative().human_repr() else: file_name = url.human_repr() + if not file_name.startswith("/"): file_name = "/" + file_name @@ -157,44 +158,53 @@ async def get_body(self, driver): current_url, level, try_count = (await self.new_urls.get()).values() if try_count > 2: continue - if current_url.human_repr() in self.visited_urls: + if current_url.with_scheme("http").human_repr() in self.visited_urls: + continue + self.visited_urls.append(current_url.with_scheme("http").human_repr()) + redirect_url, data, headers, content_type = await self.fetch_data(driver, current_url, level, try_count) + + if not data: continue - self.visited_urls.append(current_url.human_repr()) - file_name, hash_name = self._make_filename(current_url) + + if redirect_url: + file_name, hash_name = self._make_filename(redirect_url) + old_file_name, _ = self._make_filename(current_url) + if old_file_name != file_name: + self.meta[old_file_name]["redirect"] = file_name + self.visited_urls.append(redirect_url.with_scheme("http").human_repr()) + else: + file_name, hash_name = self._make_filename(current_url) self.logger.debug("Cloned file: %s", file_name) - data, headers, content_type = await self.fetch_data(driver, current_url, level, try_count) - - if data is not None: - self.meta[file_name]["hash"] = hash_name - self.meta[file_name]["headers"] = headers - self.counter = self.counter + 1 - - if content_type == "text/html": - soup = await self.replace_links(data, level) - data = str(soup).encode() - elif content_type == "text/css": - css = cssutils.parseString(data, validate=self.css_validate) - for carved_url in cssutils.getUrls(css): - if carved_url.startswith("data"): - continue - carved_url = yarl.URL(carved_url) - if not carved_url.is_absolute(): - carved_url = self.root.join(carved_url) - if carved_url.human_repr() not in self.visited_urls: - await self.new_urls.put({"url": carved_url, "level": level + 1, "try_count": 0}) - - try: - with open(os.path.join(self.target_path, hash_name), "wb") as index_fh: - index_fh.write(data) - except TypeError: - await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1}) + self.meta[file_name]["hash"] = hash_name + self.meta[file_name]["headers"] = headers + + if content_type == "text/html": + soup = await self.replace_links(data, level) + data = str(soup).encode() + elif content_type == "text/css": + css = cssutils.parseString(data, validate=self.css_validate) + for carved_url in cssutils.getUrls(css): + if carved_url.startswith("data"): + continue + carved_url = yarl.URL(carved_url) + if not carved_url.is_absolute(): + carved_url = self.root.join(carved_url) + if carved_url.with_scheme("http").human_repr() not in self.visited_urls: + await self.new_urls.put({"url": carved_url, "level": level + 1, "try_count": 0}) + + try: + with open(os.path.join(self.target_path, hash_name), "wb") as index_fh: + index_fh.write(data) + except TypeError: + await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1}) async def get_root_host(self): try: async with aiohttp.ClientSession() as session: async with session.get(self.root) as resp: - if resp.host != self.root.host: - self.moved_root = resp.url + resp_url = yarl.URL(resp.url) + if resp_url.host != self.root.host or resp_url.path != self.root.path: + self.moved_root = resp_url except aiohttp.ClientError as err: self.logger.error("Can't connect to target host: %s", err) exit(-1) @@ -205,17 +215,21 @@ async def fetch_data(self, session, current_url, level, try_count): data = None headers = [] content_type = None + redirect_url = None try: response = await session.get(current_url, headers={"Accept": "text/html"}, timeout=10.0) headers = self.get_headers(response) content_type = response.content_type + response_url = yarl.URL(response.url) + if response_url.with_scheme("http") != current_url.with_scheme("http"): + redirect_url = response_url data = await response.read() except (aiohttp.ClientError, asyncio.TimeoutError) as client_error: self.logger.error(client_error) await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1}) else: await response.release() - return [data, headers, content_type] + return [redirect_url, data, headers, content_type] class HeadlessCloner(BaseCloner): @@ -232,11 +246,15 @@ async def fetch_data(self, browser, current_url, level, try_count): headers = [] content_type = None page = None + redirect_url = None try: page = await browser.newPage() response = await page.goto(str(current_url)) headers = self.get_headers(response) content_type = self.get_content_type(headers) + response_url = yarl.URL(response.url) + if response_url.with_scheme("http") != current_url.with_scheme("http"): + redirect_url = response_url data = await response.buffer() except Exception as err: self.logger.error(err) @@ -248,7 +266,7 @@ async def fetch_data(self, browser, current_url, level, try_count): except Exception as err: print_color(str(err), "WARNING") - return [data, headers, content_type] + return [redirect_url, data, headers, content_type] class CloneRunner: diff --git a/snare/server.py b/snare/server.py index bd3e95b4..ca052ad7 100644 --- a/snare/server.py +++ b/snare/server.py @@ -37,6 +37,8 @@ async def submit_slurp(self, data): async def handle_request(self, request): self.logger.info("Request path: {0}".format(request.path_qs)) + if self.meta[request.path_qs].get("redirect"): + raise web.HTTPFound(self.meta[request.path_qs]["redirect"]) data = self.tanner_handler.create_data(request, 200) if request.method == "POST": post_data = await request.post() diff --git a/snare/utils/snare_helpers.py b/snare/utils/snare_helpers.py index d7c8b32c..e739b8f7 100755 --- a/snare/utils/snare_helpers.py +++ b/snare/utils/snare_helpers.py @@ -92,9 +92,11 @@ def add_meta_tag(page_dir, index_page, config, base_path): def check_meta_file(meta_info): - for key, val in meta_info.items(): + for _, val in meta_info.items(): if "hash" in val and any(header in val for header in ["content_type", "headers"]): continue + elif val.get("redirect"): + continue else: return False return True From aae3cc20e49bb22410972a03c2dc471eceeac433 Mon Sep 17 00:00:00 2001 From: lordlabuckdas <55460753+lordlabuckdas@users.noreply.github.com> Date: Thu, 15 Jul 2021 00:20:15 +0530 Subject: [PATCH 2/2] move assertion error --- bin/clone | 2 -- snare/cloner.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/clone b/bin/clone index 1f4263b7..655b9816 100644 --- a/bin/clone +++ b/bin/clone @@ -66,8 +66,6 @@ def main(): loop.run_until_complete(cloner.run()) except KeyboardInterrupt: print_color("\nKeyboardInterrupt received... Quitting", "ERROR") - except AssertionError: - print_color("\nConnection closed by the website... Quitting", "ERROR") finally: end = datetime.now() - start loop.run_until_complete(cloner.close()) diff --git a/snare/cloner.py b/snare/cloner.py index 667edd9b..d4d1cf64 100644 --- a/snare/cloner.py +++ b/snare/cloner.py @@ -224,7 +224,7 @@ async def fetch_data(self, session, current_url, level, try_count): if response_url.with_scheme("http") != current_url.with_scheme("http"): redirect_url = response_url data = await response.read() - except (aiohttp.ClientError, asyncio.TimeoutError) as client_error: + except (aiohttp.ClientError, asyncio.TimeoutError, AssertionError) as client_error: self.logger.error(client_error) await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1}) else: