14 implement yara keyword search #15

Merged
27 changes: 27 additions & 0 deletions README.md
@@ -66,6 +66,7 @@ arg | Long | Description
-e |--extract| Extract page's code to terminal or file. (Default: Terminal)
-i |--input filename| Input file with URL(s) (separated by line)
-o |--output [filename]| Output page(s) to file(s) (for one page)
-y |--yara | Perform yara keyword search (0 = search entire HTML object, 1 = search only text).
**Crawl**: | |
-c |--crawl| Crawl website (Default output on /links.txt)
-d |--cdepth| Set depth of crawl's travel (Default: 1)
@@ -98,6 +99,14 @@ $ python torcrawl.py -u http://www.github.com | grep 'google-analytics'
<meta name="google-analytics" content="UA-*******-*">
```

Extract the page to a file and run a yara keyword search (the bundled rules include a google-analytics keyword):
```shell
$ python torcrawl.py -v -w -u https://github.com -e -y 0
...
```
**_Note:_** Update res/keywords.yar to search for other keywords.
Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the extracted text.
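
To see what the two modes do under the hood, the stand-alone sketch below is illustrative only and not part of TorCrawl itself; it assumes the `yara-python` package, the bundled `res/keywords.yar` rules, and a made-up HTML snippet. It matches the same rules against the raw HTML for `-y 0` and against the stripped text for `-y 1`:

```python
# Illustrative sketch of the two -y modes (assumes yara-python, beautifulsoup4 and lxml).
import yara
from bs4 import BeautifulSoup

rules = yara.compile('res/keywords.yar')

# Made-up sample page for demonstration.
html = "<html><body><script>var x = 1;</script><p>Uses google-analytics tracking.</p></body></html>"

# -y 0: match the rules against the raw HTML document.
raw_matches = rules.match(data=html)

# -y 1: drop <script>/<style> tags, keep only the visible text, then match.
soup = BeautifulSoup(html, features="lxml")
for tag in soup(['script', 'style']):
    tag.decompose()
text_only = ' '.join(soup.stripped_strings).lower()
text_matches = rules.match(data=text_only)

print(raw_matches)   # e.g. [keyword_search] when a rule fires on the raw HTML
print(text_matches)  # e.g. [keyword_search] when a rule fires on the text
```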

Extract a set of webpages (imported from file) to terminal:

```shell
@@ -156,6 +165,24 @@ $ python torcrawl.py -u http://www.github.com/ -c -e | grep '</html>'
...
```

### As Both + Keyword Search:
You can crawl a page, perform a keyword search, and extract the webpages that match into a folder with a single command:

```shell
$ python torcrawl.py -v -u http://www.github.com/ -c -d 2 -p 5 -e -y 0
## TOR is ready!
## URL: http://www.github.com/
## Your IP: *.*.*.*
## Crawler Started from http://www.github.com with step 1 and wait 5
## Step 1 completed with: 11 results
## File created on /script/path/FolderName/index.htm
## File created on /script/path/FolderName/projects.html
## ...
```

***Note:*** *Update res/keywords.yar to search for other keywords.
Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the extracted text.*

## Demo:
![peek 2018-12-08 16-11](https://user-images.githubusercontent.com/9204902/49687660-f72f8280-fb0e-11e8-981e-1bbeeac398cc.gif)

6 changes: 4 additions & 2 deletions modules/crawler.py
@@ -136,7 +136,8 @@ def crawl(self):
continue

ver_link = self.canonical(link)
lst.add(ver_link)
if ver_link is not None:
lst.add(ver_link)

# For each <area> tag.
for link in soup.findAll('area'):
@@ -146,7 +147,8 @@
continue

ver_link = self.canonical(link)
lst.add(ver_link)
if ver_link is not None:
lst.add(ver_link)

# TODO: For images
# TODO: For scripts
130 changes: 106 additions & 24 deletions modules/extractor.py
@@ -1,28 +1,68 @@
#!/usr/bin/python
import io
import os
import sys
import yara as _yara
import urllib.error
import urllib.parse
import urllib.request
from urllib.error import HTTPError
from urllib.error import URLError
from http.client import InvalidURL
from http.client import IncompleteRead

from bs4 import BeautifulSoup

def cinex(input_file, out_path):

def text(response=None):
""" Removes all the garbage from the HTML and takes only text elements
from the page.
:param response: HTTP Response.
:return: String: Text only stripped response.
"""
soup = BeautifulSoup(response, features="lxml")
for s in soup(['script', 'style']):
s.decompose()

return ' '.join(soup.stripped_strings)


def check_yara(raw=None, yara=0):
""" Validates Yara Rule to categorize the site and check for keywords.
:param raw: HTTP Response body.
:param yara: Integer: Keyword search argument.
:return matches: List of yara rule matches.
"""

file_path = os.path.join('res/keywords.yar')

if raw is not None:
if yara == 1:
raw = text(response=raw).lower()

file = os.path.join(file_path)
rules = _yara.compile(file)
matches = rules.match(data=raw)
if len(matches) != 0:
print("found a match!")
return matches


def cinex(input_file, out_path, yara=None):
""" Ingests the crawled links from the input_file,
scrapes the contents of the resulting web pages and writes the contents
into out_path/{url_address}.
:param input_file: String: Filename of the crawled URLs.
:param out_path: String: Pathname of results.
:param yara: Integer: Keyword search argument.
:return: None
"""
file = io.TextIOWrapper
try:
file = open(input_file, 'r')
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n## Can't open: {input_file}")

for line in file:
@@ -40,70 +80,111 @@ def cinex(input_file, out_path):
print(f"Error: {error}")
continue

# Extract page to file
# Extract page to file.
try:
content = urllib.request.urlopen(line, timeout=10).read()

if yara is not None:
full_match_keywords = check_yara(content, yara)

if len(full_match_keywords) == 0:
print('No matches found.')
continue

with open(out_path + "/" + output_file, 'wb') as results:
results.write(urllib.request.urlopen(line).read())
results.write(content)
print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
except HTTPError as e:
print(f"Cinex Error: {e.code}, cannot access: {e.url}")
continue
except InvalidURL as e:
print(f"Invalid URL: {line} \n Skipping...")
continue
except IncompleteRead as e:
print(f"IncompleteRead on {line}")
continue
except IOError as err:
error = sys.exc_info()[0]
print(f"Error: {error}\nCan't write on file: {output_file}")
print(f"Error: {err}\nCan't write on file: {output_file}")
file.close()


def intermex(input_file):
def intermex(input_file, yara):
""" Input links from file and extract them into terminal.
:param input_file: String: File name of links file.
:param yara: Integer: Keyword search argument.
:return: None
"""
try:
with open(input_file, 'r') as file:
for line in file:
print((urllib.request.urlopen(line).read()))
except (HTTPError, URLError) as err:
print(f"HTTPError: {err}")
content = urllib.request.urlopen(line).read()
if yara is not None:
full_match_keywords = check_yara(raw=content, yara=yara)

if len(full_match_keywords) == 0:
print(f"No matches in: {line}")
print(content)
except (HTTPError, URLError, InvalidURL) as err:
print(f"Request Error: {err}")
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n## Not valid file")


def outex(website, output_file, out_path):
def outex(website, output_file, out_path, yara):
""" Scrapes the contents of the provided web address and outputs the
contents to file.
:param website: String: Url of web address to scrape.
:param output_file: String: Filename of the results.
:param out_path: String: Folder name of the output findings.
:param yara: Integer: Keyword search argument.
:return: None
"""
# Extract page to file
try:
output_file = out_path + "/" + output_file
content = urllib.request.urlopen(website).read()

if yara is not None:
full_match_keywords = check_yara(raw=content, yara=yara)

if len(full_match_keywords) == 0:
print(f"No matches in: {website}")

with open(output_file, 'wb') as file:
file.write(urllib.request.urlopen(website).read())
file.write(content)
print(f"## File created on: {os.getcwd()}/{output_file}")
except (HTTPError, URLError) as err:
except (HTTPError, URLError, InvalidURL) as err:
print(f"HTTPError: {err}")
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n Can't write on file: {output_file}")


def termex(website):
def termex(website, yara):
""" Scrapes provided web address and prints the results to the terminal.
:param website: String: URL of website to scrape.
:param yara: Integer: Keyword search argument.
:return: None
"""
try:
print((urllib.request.urlopen(website).read()))
except (urllib.error.HTTPError, urllib.error.URLError) as err:
content = urllib.request.urlopen(website).read()
if yara is not None:
full_match_keywords = check_yara(content, yara)

if len(full_match_keywords) == 0:
# No match.
print(f"No matches in: {website}")
return

print(content)
except (HTTPError, URLError, InvalidURL) as err:
print(f"Error: ({err}) {website}")
return


def extractor(website, crawl, output_file, input_file, out_path):
def extractor(website, crawl, output_file, input_file, out_path, yara):
""" Extractor - scrapes the resulting website or discovered links.
:param website: String: URL of website to scrape.
@@ -112,19 +193,20 @@ def extractor(website, crawl, output_file, input_file, out_path):
:param output_file: String: Filename of resulting output from scrape.
:param input_file: String: Filename of crawled/discovered URLs
:param out_path: String: Dir path for output files.
:param yara: Integer: keyword search option.
:return: None
"""
# TODO: Return output to torcrawl.py
if len(input_file) > 0:
if crawl:
cinex(input_file, out_path)
cinex(input_file, out_path, yara)
# TODO: Extract from list into a folder
# elif len(output_file) > 0:
# inoutex(website, input_file, output_file)
else:
intermex(input_file)
intermex(input_file, yara)
else:
if len(output_file) > 0:
outex(website, output_file, out_path)
outex(website, output_file, out_path, yara)
else:
termex(website)
termex(website, yara)
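
For orientation, a hypothetical direct call to the updated entry point could look like the sketch below; the parameter values are made up for illustration, and in practice torcrawl.py builds them from the command-line arguments:

```python
# Illustrative call of the updated extractor() signature (values are made up).
from modules.extractor import extractor

extractor(
    website='http://example.com/',  # page to scrape
    crawl=False,                    # not reading a crawled links file
    output_file='example.html',     # write the page to this file...
    input_file='',                  # ...rather than ingesting a links file
    out_path='output',              # output directory
    yara=0,                         # 0 = match rules against raw HTML, 1 = text only
)
```
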
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
pysocks
beautifulsoup4>=4.7.1
requests>=2.21.0
yara
lxml
34 changes: 34 additions & 0 deletions res/keywords.yar
@@ -0,0 +1,34 @@
/*
Yara.
*/

/*
rule email_filter
{
meta:
author = "@the-siegfried"
score = 20
strings:
$email_add = /\b[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*\.[a-zA-Z-]+[\w-]\b/
condition:
any of them
}
*/

rule keyword_search
{
meta:
author = "@the-siegfried"
score = 90

strings:
$a = "Keyword1" fullword wide ascii nocase
$b = "Keyword Two" wide ascii nocase
$c = "kw 3" ascii
$d = "KEYWORD four" nocase
$e = "google-" nocase
condition:
any of them
}
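
Before pointing a crawl at an edited rules file, a quick stand-alone check like the sketch below can confirm the rules compile and fire; this is illustrative only and assumes the `yara-python` package plus a made-up sample string:

```python
# Illustrative check that res/keywords.yar compiles and fires (assumes yara-python).
import yara

rules = yara.compile('res/keywords.yar')

sample = "This page mentions google-analytics and Keyword1."  # made-up test input
for match in rules.match(data=sample):
    # Each match exposes the rule name and the strings that triggered it.
    print(match.rule, match.strings)
```
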
18 changes: 16 additions & 2 deletions torcrawl.py
@@ -20,6 +20,9 @@
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara : Yara keyword search for page categorisation;
rules are read from the /res folder. 0 = search the whole HTML object,
1 = search only the text.

Crawl:
-c, --crawl : Crawl website (Default output on /links.txt)
@@ -155,6 +158,12 @@ def main():
'--folder',
help='The root directory which will contain the generated files'
)
parser.add_argument(
'-y',
'--yara',
help='Check for keywords and only scrape documents that contain a '
'match. 0 = search the whole HTML object, 1 = search only the text.'
)

args = parser.parse_args()

@@ -164,6 +173,9 @@
c_depth = args.cdepth if args.cdepth else 0
c_pause = args.cpause if args.cpause else 1

if args.yara and int(args.yara) not in [0, 1]:
parser.error("argument -y/--yara: expected argument 0 or 1.")

# Connect to TOR
if args.without is False:
check_tor(args.verbose)
@@ -194,9 +206,11 @@
print(f"## File created on {os.getcwd()}/{out_path}/links.txt")
if args.extract:
input_file = out_path + "/links.txt"
extractor(website, args.crawl, output_file, input_file, out_path)
extractor(website, args.crawl, output_file, input_file, out_path,
int(args.yara) if args.yara else None)
else:
extractor(website, args.crawl, output_file, input_file, out_path)
extractor(website, args.crawl, output_file, input_file, out_path,
int(args.yara) if args.yara else None)


# Stub to call main method.
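
One design note on the 0/1 validation added in main(): a hypothetical alternative (not what this PR implements) is to let argparse enforce the choice itself, so args.yara stays None when the flag is omitted and needs no manual range check:

```python
# Hypothetical alternative for the -y/--yara option (not part of this PR):
# argparse enforces the 0/1 choice and defaults to None when the flag is omitted.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '-y', '--yara',
    type=int,
    choices=[0, 1],
    default=None,
    help='0 = search the whole HTML object, 1 = search only the text.'
)

args = parser.parse_args(['-y', '1'])  # example invocation
print(args.yara)  # 1
```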