feature: redirects #302

Merged · 2 commits · Jul 15, 2021
Changes from 1 commit
4 changes: 3 additions & 1 deletion bin/clone
@@ -66,13 +66,15 @@ def main():
         loop.run_until_complete(cloner.run())
     except KeyboardInterrupt:
         print_color("\nKeyboardInterrupt received... Quitting", "ERROR")
+    except AssertionError:
+        print_color("\nConnection closed by the website... Quitting", "ERROR")
     finally:
         end = datetime.now() - start
         loop.run_until_complete(cloner.close())
         loop.close()
         print("")
         print_color("-" * 36 + ">SUMMARY<" + "-" * 36, "INFO")
-        print_color("\tTotal number of URLs cloned: {}".format(str(cloner.runner.counter)), "INFO")
+        print_color("\tTotal number of URLs cloned: {}".format(str(len(cloner.runner.visited_urls))), "INFO")
         print_color("\tTime elapsed: {}".format(str(end)), "INFO")
         print_color("\tCloned directory: {}".format(cloner.runner.target_path), "INFO")
         print_color("-" * 82, "INFO")
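Note on the summary change: the clone count is now derived from the deduplicated `visited_urls` list rather than a separate `counter`, so a page that gets re-queued or reached through a redirect is counted once. A minimal sketch of the idea, with a hypothetical `mark_visited` helper that is not part of the patch:

```python
# Sketch: visited_urls only ever receives a URL after a membership check,
# so its length is the number of distinct URLs actually cloned.
visited_urls = []

def mark_visited(url):  # hypothetical helper, for illustration only
    if url not in visited_urls:
        visited_urls.append(url)

for url in ["http://example.com/", "http://example.com/a", "http://example.com/"]:
    mark_visited(url)

print("Total number of URLs cloned: {}".format(len(visited_urls)))  # 2
```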
6 changes: 4 additions & 2 deletions bin/snare
@@ -190,10 +190,12 @@ if __name__ == "__main__":
         print_color("Error found in meta.json. Please clone the pages again.", "ERROR")
         exit()

-    if not os.path.exists(os.path.join(full_page_path, os.path.join(meta_info[args.index_page]["hash"]))):
+    index_file_hash = meta_info[meta_info[args.index_page]["redirect"]]["hash"] if meta_info[args.index_page].get("redirect") else meta_info[args.index_page]["hash"]
+    if not os.path.exists(os.path.join(full_page_path, index_file_hash)):
         print_color("can't create meta tag", "WARNING")
     else:
-        snare_helpers.add_meta_tag(args.page_dir, meta_info[args.index_page]["hash"], config, base_path)
+        snare_helpers.add_meta_tag(args.page_dir, index_file_hash, config, base_path)
+    del index_file_hash
     loop = asyncio.get_event_loop()
     loop.run_until_complete(check_tanner())

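The index page can now be recorded in meta.json as a redirect to another entry, so its on-disk hash has to be resolved through one level of indirection before the existence check and the meta-tag injection. A standalone sketch of that lookup, using made-up meta data:

```python
# Sketch of the index_file_hash resolution above; the meta dict is invented.
meta_info = {
    "/index.html": {"redirect": "/home.html"},
    "/home.html": {"hash": "d41d8cd98f00b204e9800998ecf8427e", "headers": []},
}

entry = meta_info["/index.html"]
if entry.get("redirect"):
    # Follow the redirect entry to the page that actually owns the file.
    index_file_hash = meta_info[entry["redirect"]]["hash"]
else:
    index_file_hash = entry["hash"]

print(index_file_hash)  # hash of /home.html, the redirect target
```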
88 changes: 53 additions & 35 deletions snare/cloner.py
@@ -43,7 +43,6 @@ def __init__(self, root, max_depth, css_validate, default_path="/opt/snare"):
         self.new_urls = Queue()
         self.meta = defaultdict(dict)

-        self.counter = 0
         self.itr = 0

     @staticmethod
@@ -65,6 +64,7 @@ def get_headers(response):
             "date",
             "etag",
             "expires",
+            "transfer-encoding",
             "x-cache",
         ]

@@ -96,7 +96,7 @@ async def process_link(self, url, level, check_host=False):
             or (self.moved_root is not None and host != self.moved_root.host)
         ):
             return None
-        if url.human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
+        if url.with_scheme("http").human_repr() not in self.visited_urls and (level + 1) <= self.max_depth:
             await self.new_urls.put({"url": url, "level": level + 1, "try_count": 0})

         res = None
@@ -139,6 +139,7 @@ def _make_filename(self, url):
             file_name = url.relative().human_repr()
         else:
             file_name = url.human_repr()
+
         if not file_name.startswith("/"):
             file_name = "/" + file_name

@@ -157,44 +158,53 @@ async def get_body(self, driver):
             current_url, level, try_count = (await self.new_urls.get()).values()
             if try_count > 2:
                 continue
-            if current_url.human_repr() in self.visited_urls:
+            if current_url.with_scheme("http").human_repr() in self.visited_urls:
                 continue
-            self.visited_urls.append(current_url.human_repr())
-            file_name, hash_name = self._make_filename(current_url)
+            self.visited_urls.append(current_url.with_scheme("http").human_repr())
+            redirect_url, data, headers, content_type = await self.fetch_data(driver, current_url, level, try_count)
+
+            if not data:
+                continue
+
+            if redirect_url:
+                file_name, hash_name = self._make_filename(redirect_url)
+                old_file_name, _ = self._make_filename(current_url)
+                if old_file_name != file_name:
+                    self.meta[old_file_name]["redirect"] = file_name
+                    self.visited_urls.append(redirect_url.with_scheme("http").human_repr())
+            else:
+                file_name, hash_name = self._make_filename(current_url)
             self.logger.debug("Cloned file: %s", file_name)
-            data, headers, content_type = await self.fetch_data(driver, current_url, level, try_count)
-
-            if data is not None:
-                self.meta[file_name]["hash"] = hash_name
-                self.meta[file_name]["headers"] = headers
-                self.counter = self.counter + 1
-
-                if content_type == "text/html":
-                    soup = await self.replace_links(data, level)
-                    data = str(soup).encode()
-                elif content_type == "text/css":
-                    css = cssutils.parseString(data, validate=self.css_validate)
-                    for carved_url in cssutils.getUrls(css):
-                        if carved_url.startswith("data"):
-                            continue
-                        carved_url = yarl.URL(carved_url)
-                        if not carved_url.is_absolute():
-                            carved_url = self.root.join(carved_url)
-                        if carved_url.human_repr() not in self.visited_urls:
-                            await self.new_urls.put({"url": carved_url, "level": level + 1, "try_count": 0})
-
-                try:
-                    with open(os.path.join(self.target_path, hash_name), "wb") as index_fh:
-                        index_fh.write(data)
-                except TypeError:
-                    await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1})
+            self.meta[file_name]["hash"] = hash_name
+            self.meta[file_name]["headers"] = headers
+
+            if content_type == "text/html":
+                soup = await self.replace_links(data, level)
+                data = str(soup).encode()
+            elif content_type == "text/css":
+                css = cssutils.parseString(data, validate=self.css_validate)
+                for carved_url in cssutils.getUrls(css):
+                    if carved_url.startswith("data"):
+                        continue
+                    carved_url = yarl.URL(carved_url)
+                    if not carved_url.is_absolute():
+                        carved_url = self.root.join(carved_url)
+                    if carved_url.with_scheme("http").human_repr() not in self.visited_urls:
+                        await self.new_urls.put({"url": carved_url, "level": level + 1, "try_count": 0})
+
+            try:
+                with open(os.path.join(self.target_path, hash_name), "wb") as index_fh:
+                    index_fh.write(data)
+            except TypeError:
+                await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1})

     async def get_root_host(self):
         try:
             async with aiohttp.ClientSession() as session:
                 async with session.get(self.root) as resp:
-                    if resp.host != self.root.host:
-                        self.moved_root = resp.url
+                    resp_url = yarl.URL(resp.url)
+                    if resp_url.host != self.root.host or resp_url.path != self.root.path:
+                        self.moved_root = resp_url
         except aiohttp.ClientError as err:
             self.logger.error("Can't connect to target host: %s", err)
             exit(-1)
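Throughout `get_body`, deduplication now keys URLs on a scheme-normalized form, so the http and https variants of a page collapse into a single visited entry, and a redirect target is marked visited as soon as its content is stored. A small self-contained sketch of the normalization with yarl (not the patch itself):

```python
import yarl

# Sketch: http/https variants normalize to one visited key, which is how
# the cloner avoids fetching and storing the same page twice.
visited_urls = []

for raw in ("https://example.com/about", "http://example.com/about"):
    key = yarl.URL(raw).with_scheme("http").human_repr()
    if key in visited_urls:
        continue  # the second variant is skipped here
    visited_urls.append(key)

print(visited_urls)  # ['http://example.com/about']
```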
@@ -205,17 +215,21 @@ async def fetch_data(self, session, current_url, level, try_count):
         data = None
         headers = []
         content_type = None
+        redirect_url = None
         try:
             response = await session.get(current_url, headers={"Accept": "text/html"}, timeout=10.0)
             headers = self.get_headers(response)
             content_type = response.content_type
+            response_url = yarl.URL(response.url)
+            if response_url.with_scheme("http") != current_url.with_scheme("http"):
+                redirect_url = response_url
             data = await response.read()
         except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
             self.logger.error(client_error)
             await self.new_urls.put({"url": current_url, "level": level, "try_count": try_count + 1})
         else:
             await response.release()
-        return [data, headers, content_type]
+        return [redirect_url, data, headers, content_type]


 class HeadlessCloner(BaseCloner):
@@ -232,11 +246,15 @@ async def fetch_data(self, browser, current_url, level, try_count):
         headers = []
         content_type = None
         page = None
+        redirect_url = None
         try:
             page = await browser.newPage()
             response = await page.goto(str(current_url))
             headers = self.get_headers(response)
             content_type = self.get_content_type(headers)
+            response_url = yarl.URL(response.url)
+            if response_url.with_scheme("http") != current_url.with_scheme("http"):
+                redirect_url = response_url
             data = await response.buffer()
         except Exception as err:
             self.logger.error(err)
@@ -248,7 +266,7 @@ async def fetch_data(self, browser, current_url, level, try_count):
                 except Exception as err:
                     print_color(str(err), "WARNING")

-        return [data, headers, content_type]
+        return [redirect_url, data, headers, content_type]


 class CloneRunner:
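Both `fetch_data` implementations now lead their return value with the detected redirect: the final response URL is compared against the requested one with schemes normalized, so a plain http-to-https upgrade is not reported as a redirect. A hedged sketch of that comparison in isolation:

```python
import yarl

def detect_redirect(current_url: yarl.URL, response_url: yarl.URL):
    """Sketch of the patch's check: report a redirect only when the final
    URL differs by more than its scheme."""
    if response_url.with_scheme("http") != current_url.with_scheme("http"):
        return response_url
    return None

# A scheme upgrade alone is not a redirect:
print(detect_redirect(yarl.URL("http://example.com/a"),
                      yarl.URL("https://example.com/a")))   # None
# A moved page is:
print(detect_redirect(yarl.URL("http://example.com/old"),
                      yarl.URL("http://example.com/new")))  # http://example.com/new
```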
2 changes: 2 additions & 0 deletions snare/server.py
@@ -37,6 +37,8 @@ async def submit_slurp(self, data):

     async def handle_request(self, request):
         self.logger.info("Request path: {0}".format(request.path_qs))
+        if self.meta[request.path_qs].get("redirect"):
+            raise web.HTTPFound(self.meta[request.path_qs]["redirect"])
         data = self.tanner_handler.create_data(request, 200)
         if request.method == "POST":
             post_data = await request.post()
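On the serving side, a request whose meta entry carries a `redirect` key is answered with an HTTP 302 by raising aiohttp's `web.HTTPFound`, which short-circuits the handler before any page data is generated. A minimal runnable sketch with an invented meta mapping (not snare's actual handler):

```python
from aiohttp import web

# Invented meta mapping: /old is a redirect stub pointing at /new.
meta = {"/old": {"redirect": "/new"}, "/new": {"hash": "abc123"}}

async def handle_request(request):  # simplified stand-in for snare's handler
    entry = meta.get(request.path_qs, {})
    if entry.get("redirect"):
        # Raising HTTPFound sends "302 Found" with "Location: /new" at once.
        raise web.HTTPFound(entry["redirect"])
    return web.Response(text="serving {}".format(request.path_qs))

app = web.Application()
app.router.add_get("/{tail:.*}", handle_request)
# web.run_app(app)  # then: curl -i http://localhost:8080/old -> 302, Location: /new
```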
4 changes: 3 additions & 1 deletion snare/utils/snare_helpers.py
@@ -92,9 +92,11 @@ def add_meta_tag(page_dir, index_page, config, base_path):


 def check_meta_file(meta_info):
-    for key, val in meta_info.items():
+    for _, val in meta_info.items():
         if "hash" in val and any(header in val for header in ["content_type", "headers"]):
             continue
+        elif val.get("redirect"):
+            continue
         else:
             return False
     return True
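With this relaxation, a meta.json entry is considered valid either when it describes a cloned file (a hash plus headers or a content type) or when it is a pure redirect stub. A quick demonstration against made-up data:

```python
# check_meta_file as patched above, exercised with invented meta entries.
def check_meta_file(meta_info):
    for _, val in meta_info.items():
        if "hash" in val and any(header in val for header in ["content_type", "headers"]):
            continue
        elif val.get("redirect"):
            continue
        else:
            return False
    return True

print(check_meta_file({
    "/home.html": {"hash": "abc123", "headers": [{"Server": "nginx"}]},
    "/index.html": {"redirect": "/home.html"},  # redirect-only entry now passes
}))  # True
print(check_meta_file({"/broken.html": {}}))  # False: neither hash nor redirect
```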