Merge pull request #15 from the-siegfried/14-implement-yara-keyword-search

14 implement yara keyword search
MikeMeliz authored Mar 26, 2022
2 parents 22b0b0e + 32d7b7a commit 3241834
Showing 6 changed files with 189 additions and 28 deletions.
27 changes: 27 additions & 0 deletions README.md
@@ -66,6 +66,7 @@ arg | Long | Description
-e |--extract| Extract page's code to terminal or file. (Default: Terminal)
-i |--input filename| Input file with URL(s) (separated by line)
-o |--output [filename]| Output page(s) to file(s) (for one page)
-y |--yara | Perform YARA keyword search (0 = search entire HTML object, 1 = search only text).
**Crawl**: | |
-c |--crawl| Crawl website (Default output on /links.txt)
-d |--cdepth| Set depth of crawl's travel (Default: 1)
@@ -98,6 +99,14 @@ $ python torcrawl.py -u http://www.github.com | grep 'google-analytics'
<meta name="google-analytics" content="UA-*******-*">
```

Extract to file only the pages that match the google-analytics keyword, using YARA:
```shell
$ python torcrawl.py -v -w -u https://github.com -e -y 0
...
```
**_Note:_** Update res/keywords.yar to search for other keywords.
Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the text.
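
The two modes differ only in what is handed to the YARA matcher. A minimal Python sketch of the idea, assuming the yara-python and beautifulsoup4 packages are installed; the `page.html` filename is only a placeholder for an already-fetched page:

```python
# Illustrative sketch: roughly what -y 0 (raw HTML) vs -y 1 (text only) do.
# Assumes yara-python and beautifulsoup4 are installed and page.html is a
# placeholder for a page that has already been downloaded.
import yara
from bs4 import BeautifulSoup

rules = yara.compile(filepath="res/keywords.yar")

with open("page.html", "rb") as fh:
    raw_html = fh.read()

# -y 0: match the rules against the raw HTML bytes.
raw_matches = rules.match(data=raw_html)

# -y 1: strip <script>/<style> elements and match the visible text only.
soup = BeautifulSoup(raw_html, features="lxml")
for tag in soup(["script", "style"]):
    tag.decompose()
text_only = " ".join(soup.stripped_strings).lower()
text_matches = rules.match(data=text_only)

print(raw_matches, text_matches)
```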

Extract a set of webpages (imported from file) to terminal:

```shell
@@ -156,6 +165,24 @@ $ python torcrawl.py -u http://www.github.com/ -c -e | grep '</html>'
...
```

### As Both + Keyword Search:
You can crawl a page, perform a keyword search, and extract the pages that match into a folder with a single command:

```shell
$ python torcrawl.py -v -u http://www.github.com/ -c -d 2 -p 5 -e -y 0
## TOR is ready!
## URL: http://www.github.com/
## Your IP: *.*.*.*
## Crawler Started from http://www.github.com with step 1 and wait 5
## Step 1 completed with: 11 results
## File created on /script/path/FolderName/index.htm
## File created on /script/path/FolderName/projects.html
## ...
```

***Note:*** *Update res/keywords.yar to search for other keywords.
Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the text.*

## Demo:
![peek 2018-12-08 16-11](https://user-images.githubusercontent.com/9204902/49687660-f72f8280-fb0e-11e8-981e-1bbeeac398cc.gif)

6 changes: 4 additions & 2 deletions modules/crawler.py
@@ -136,7 +136,8 @@ def crawl(self):
continue

ver_link = self.canonical(link)
lst.add(ver_link)
if ver_link is not None:
lst.add(ver_link)

# For each <area> tag.
for link in soup.findAll('area'):
@@ -146,7 +147,8 @@ def crawl(self):
continue

ver_link = self.canonical(link)
lst.add(ver_link)
if ver_link is not None:
lst.add(ver_link)

# TODO: For images
# TODO: For scripts
130 changes: 106 additions & 24 deletions modules/extractor.py
@@ -1,28 +1,68 @@
#!/usr/bin/python
import io
import os
import sys
import yara as _yara
import urllib.error
import urllib.parse
import urllib.request
from urllib.error import HTTPError
from urllib.error import URLError
from http.client import InvalidURL
from http.client import IncompleteRead

from bs4 import BeautifulSoup

def cinex(input_file, out_path):

def text(response=None):
""" Removes all the garbage from the HTML and takes only text elements
from the page.
:param response: HTTP Response.
:return: String: Text only stripped response.
"""
soup = BeautifulSoup(response, features="lxml")
for s in soup(['script', 'style']):
s.decompose()

return ' '.join(soup.stripped_strings)


def check_yara(raw=None, yara=0):
""" Validates Yara Rule to categorize the site and check for keywords.
:param raw: HTTP Response body.
:param yara: Integer: Keyword search argument.
:return matches: List of yara rule matches.
"""

file_path = os.path.join('res/keywords.yar')

if raw is not None:
if yara == 1:
raw = text(response=raw).lower()

file = os.path.join(file_path)
rules = _yara.compile(file)
matches = rules.match(data=raw)
if len(matches) != 0:
print("found a match!")
return matches


def cinex(input_file, out_path, yara=None):
""" Ingests the crawled links from the input_file,
scrapes the contents of the resulting web pages and writes the contents
into out_path/{url_address}.
:param input_file: String: Filename of the crawled URLs.
:param out_path: String: Pathname of results.
:param yara: Integer: Keyword search argument.
:return: None
"""
file = io.TextIOWrapper
try:
file = open(input_file, 'r')
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n## Can't open: {input_file}")

for line in file:
@@ -40,70 +80,111 @@ def cinex(input_file, out_path):
print(f"Error: {error}")
continue

# Extract page to file
# Extract page to file.
try:
content = urllib.request.urlopen(line, timeout=10).read()

if yara is not None:
full_match_keywords = check_yara(content, yara)

if len(full_match_keywords) == 0:
print('No matches found.')
continue

with open(out_path + "/" + output_file, 'wb') as results:
results.write(urllib.request.urlopen(line).read())
results.write(content)
print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
except HTTPError as e:
print(f"Cinex Error: {e.code}, cannot access: {e.url}")
continue
except InvalidURL as e:
print(f"Invalid URL: {line} \n Skipping...")
continue
except IncompleteRead as e:
print(f"IncompleteRead on {line}")
continue
except IOError as err:
error = sys.exc_info()[0]
print(f"Error: {error}\nCan't write on file: {output_file}")
print(f"Error: {err}\nCan't write on file: {output_file}")
file.close()


def intermex(input_file):
def intermex(input_file, yara):
""" Input links from file and extract them into terminal.
:param input_file: String: File name of links file.
:param yara: Integer: Keyword search argument.
:return: None
"""
try:
with open(input_file, 'r') as file:
for line in file:
print((urllib.request.urlopen(line).read()))
except (HTTPError, URLError) as err:
print(f"HTTPError: {err}")
content = urllib.request.urlopen(line).read()
if yara is not None:
full_match_keywords = check_yara(raw=content, yara=yara)

if len(full_match_keywords) == 0:
print(f"No matches in: {line}")
print(content)
except (HTTPError, URLError, InvalidURL) as err:
print(f"Request Error: {err}")
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n## Not valid file")


def outex(website, output_file, out_path):
def outex(website, output_file, out_path, yara):
""" Scrapes the contents of the provided web address and outputs the
contents to file.
:param website: String: Url of web address to scrape.
:param output_file: String: Filename of the results.
:param out_path: String: Folder name of the output findings.
:param yara: Integer: Keyword search argument.
:return: None
"""
# Extract page to file
try:
output_file = out_path + "/" + output_file
content = urllib.request.urlopen(website).read()

if yara is not None:
full_match_keywords = check_yara(raw=content, yara=yara)

if len(full_match_keywords) == 0:
print(f"No matches in: {website}")

with open(output_file, 'wb') as file:
file.write(urllib.request.urlopen(website).read())
file.write(content)
print(f"## File created on: {os.getcwd()}/{output_file}")
except (HTTPError, URLError) as err:
except (HTTPError, URLError, InvalidURL) as err:
print(f"HTTPError: {err}")
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n Can't write on file: {output_file}")


def termex(website):
def termex(website, yara):
""" Scrapes provided web address and prints the results to the terminal.
:param website: String: URL of website to scrape.
:param yara: Integer: Keyword search argument.
:return: None
"""
try:
print((urllib.request.urlopen(website).read()))
except (urllib.error.HTTPError, urllib.error.URLError) as err:
content = urllib.request.urlopen(website).read()
if yara is not None:
full_match_keywords = check_yara(content, yara)

if len(full_match_keywords) == 0:
# No match.
print(f"No matches in: {website}")
return

print(content)
except (HTTPError, URLError, InvalidURL) as err:
print(f"Error: ({err}) {website}")
return


def extractor(website, crawl, output_file, input_file, out_path):
def extractor(website, crawl, output_file, input_file, out_path, yara):
""" Extractor - scrapes the resulting website or discovered links.
:param website: String: URL of website to scrape.
@@ -112,19 +193,20 @@ def extractor(website, crawl, output_file, input_file, out_path):
:param output_file: String: Filename of resulting output from scrape.
:param input_file: String: Filename of crawled/discovered URLs
:param out_path: String: Dir path for output files.
:param yara: Integer: keyword search option.
:return: None
"""
# TODO: Return output to torcrawl.py
if len(input_file) > 0:
if crawl:
cinex(input_file, out_path)
cinex(input_file, out_path, yara)
# TODO: Extract from list into a folder
# elif len(output_file) > 0:
# inoutex(website, input_file, output_file)
else:
intermex(input_file)
intermex(input_file, yara)
else:
if len(output_file) > 0:
outex(website, output_file, out_path)
outex(website, output_file, out_path, yara)
else:
termex(website)
termex(website, yara)
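
For a sense of how the new hook behaves on its own, here is a rough, hypothetical usage sketch of `check_yara()`; it assumes the repository root is the working directory (so `res/keywords.yar` resolves) and that the module is importable as `modules.extractor`:

```python
# Hypothetical usage sketch for check_yara(); not part of the commit.
# Assumes it is run from the repository root with yara-python installed.
import urllib.request

from modules.extractor import check_yara

content = urllib.request.urlopen("https://example.com", timeout=10).read()

# yara=0 matches the raw HTML; yara=1 matches only the stripped text.
matches = check_yara(raw=content, yara=0)
if matches:
    print("Matched rules:", [m.rule for m in matches])
else:
    print("No matches found.")
```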
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
pysocks
beautifulsoup4>=4.7.1
requests>=2.21.0
yara
lxml
34 changes: 34 additions & 0 deletions res/keywords.yar
@@ -0,0 +1,34 @@
/*
Yara.
*/

/*
rule email_filter
{
meta:
author = "@the-siegfried"
score = 20
strings:
$email_add = /\b[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*\.[a-zA-Z-]+[\w-]\b/
condition:
any of them
}
*/

rule keyword_search
{
meta:
author = "@the-siegfried"
score = 90

strings:
$a = "Keyword1" fullword wide ascii nocase
$b = "Keyword Two" wide ascii nocase
$c = "kw 3" ascii
$d = "KEYWORD four" nocase
$e = "google-" nocase
condition:
any of them
}
18 changes: 16 additions & 2 deletions torcrawl.py
@@ -20,6 +20,9 @@
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara : YARA keyword search for page categorisation;
rules are read from the res/ folder. 0 = search the whole
HTML object. 1 = search only the text.
Crawl:
-c, --crawl : Crawl website (Default output on /links.txt)
@@ -155,6 +158,12 @@ def main():
'--folder',
help='The root directory which will contain the generated files'
)
parser.add_argument(
'-y',
'--yara',
help='Check for keywords and only scrape documents that contain a '
'match. 0 = search the whole HTML object. 1 = search only the text.'
)

args = parser.parse_args()

@@ -164,6 +173,9 @@ def main():
c_depth = args.cdepth if args.cdepth else 0
c_pause = args.cpause if args.cpause else 1

if int(args.yara) not in [0, 1]:
parser.error("argument -y/--yara: expected argument 0 or 1.")

# Connect to TOR
if args.without is False:
check_tor(args.verbose)
@@ -194,9 +206,11 @@
print(f"## File created on {os.getcwd()}/{out_path}/links.txt")
if args.extract:
input_file = out_path + "/links.txt"
extractor(website, args.crawl, output_file, input_file, out_path)
extractor(website, args.crawl, output_file, input_file, out_path,
int(args.yara))
else:
extractor(website, args.crawl, output_file, input_file, out_path)
extractor(website, args.crawl, output_file, input_file, out_path,
int(args.yara))


# Stub to call main method.
