14 implement yara keyword search #15

Merged
27 changes: 27 additions & 0 deletions README.md
@@ -66,6 +66,7 @@ arg | Long | Description
-e |--extract| Extract page's code to terminal or file. (Default: Terminal)
-i |--input filename| Input file with URL(s) (separated by line)
-o |--output [filename]| Output page(s) to file(s) (for one page)
-y |--yara | Perform yara keyword search (0 = search entire HTML object, 1 = search only text).
**Crawl**: | |
-c |--crawl| Crawl website (Default output on /links.txt)
-d |--cdepth| Set depth of crawl's travel (Default: 1)
@@ -98,6 +99,14 @@ $ python torcrawl.py -u http://www.github.com | grep 'google-analytics'
<meta name="google-analytics" content="UA-*******-*">
```

Extract the page to a file and run a yara keyword search (the bundled rules include a google-analytics keyword):
```shell
$ python torcrawl.py -v -w -u https://github.com -e -y 0
...
```
**_Note:_** Update res/keywords.yar to search for other keywords.
Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the extracted text.
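
To see what the two modes do under the hood, the stand-alone sketch below is illustrative only and not part of TorCrawl itself; it assumes the `yara-python` package, the bundled `res/keywords.yar` rules, and a made-up HTML snippet. It matches the same rules against the raw HTML for `-y 0` and against the stripped text for `-y 1`:

```python
# Illustrative sketch of the two -y modes (assumes yara-python, beautifulsoup4 and lxml).
import yara
from bs4 import BeautifulSoup

rules = yara.compile('res/keywords.yar')

# Made-up sample page for demonstration.
html = "<html><body><script>var x = 1;</script><p>Uses google-analytics tracking.</p></body></html>"

# -y 0: match the rules against the raw HTML document.
raw_matches = rules.match(data=html)

# -y 1: drop <script>/<style> tags, keep only the visible text, then match.
soup = BeautifulSoup(html, features="lxml")
for tag in soup(['script', 'style']):
    tag.decompose()
text_only = ' '.join(soup.stripped_strings).lower()
text_matches = rules.match(data=text_only)

print(raw_matches)   # e.g. [keyword_search] when a rule fires on the raw HTML
print(text_matches)  # e.g. [keyword_search] when a rule fires on the text
```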

Extract a set of webpages (imported from file) to terminal:

```shell
@@ -156,6 +165,24 @@ $ python torcrawl.py -u http://www.github.com/ -c -e | grep '</html>'
...
```

### As Both + Keyword Search:
You can crawl a page, perform a keyword search, and extract the webpages that match into a folder with a single command:

```shell
$ python torcrawl.py -v -u http://www.github.com/ -c -d 2 -p 5 -e -y 0
## TOR is ready!
## URL: http://www.github.com/
## Your IP: *.*.*.*
## Crawler Started from http://www.github.com with step 1 and wait 5
## Step 1 completed with: 11 results
## File created on /script/path/FolderName/index.htm
## File created on /script/path/FolderName/projects.html
## ...
```

***Note:*** *Update res/keywords.yar to search for other keywords.
Use ```-y 0``` to search the raw HTML and ```-y 1``` to search only the extracted text.*

## Demo:
![peek 2018-12-08 16-11](https://user-images.githubusercontent.com/9204902/49687660-f72f8280-fb0e-11e8-981e-1bbeeac398cc.gif)

6 changes: 4 additions & 2 deletions modules/crawler.py
@@ -136,7 +136,8 @@ def crawl(self):
continue

ver_link = self.canonical(link)
lst.add(ver_link)
if ver_link is not None:
lst.add(ver_link)

# For each <area> tag.
for link in soup.findAll('area'):
@@ -146,7 +147,8 @@
continue

ver_link = self.canonical(link)
lst.add(ver_link)
if ver_link is not None:
lst.add(ver_link)

# TODO: For images
# TODO: For scripts
130 changes: 106 additions & 24 deletions modules/extractor.py
@@ -1,28 +1,68 @@
#!/usr/bin/python
import io
import os
import sys
import yara as _yara
import urllib.error
import urllib.parse
import urllib.request
from urllib.error import HTTPError
from urllib.error import URLError
from http.client import InvalidURL
from http.client import IncompleteRead

from bs4 import BeautifulSoup

def cinex(input_file, out_path):

def text(response=None):
""" Removes all the garbage from the HTML and takes only text elements
from the page.
:param response: HTTP Response.
:return: String: Text only stripped response.
"""
soup = BeautifulSoup(response, features="lxml")
for s in soup(['script', 'style']):
s.decompose()

return ' '.join(soup.stripped_strings)


def check_yara(raw=None, yara=0):
""" Validates Yara Rule to categorize the site and check for keywords.
:param raw: HTTP Response body.
:param yara: Integer: Keyword search argument.
:return matches: List of yara rule matches.
"""

file_path = os.path.join('res/keywords.yar')

if raw is not None:
if yara == 1:
raw = text(response=raw).lower()

file = os.path.join(file_path)
rules = _yara.compile(file)
matches = rules.match(data=raw)
if len(matches) != 0:
print("found a match!")
return matches


def cinex(input_file, out_path, yara=None):
""" Ingests the crawled links from the input_file,
scrapes the contents of the resulting web pages and writes the contents
into out_path/{url_address}.
:param input_file: String: Filename of the crawled URLs.
:param out_path: String: Pathname of results.
:param yara: Integer: Keyword search argument.
:return: None
"""
file = io.TextIOWrapper
try:
file = open(input_file, 'r')
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n## Can't open: {input_file}")

for line in file:
@@ -40,70 +80,111 @@ def cinex(input_file, out_path):
print(f"Error: {error}")
continue

# Extract page to file
# Extract page to file.
try:
content = urllib.request.urlopen(line, timeout=10).read()

if yara is not None:
full_match_keywords = check_yara(content, yara)

if len(full_match_keywords) == 0:
print('No matches found.')
continue

with open(out_path + "/" + output_file, 'wb') as results:
results.write(urllib.request.urlopen(line).read())
results.write(content)
print(f"# File created on: {os.getcwd()}/{out_path}/{output_file}")
except HTTPError as e:
print(f"Cinex Error: {e.code}, cannot access: {e.url}")
continue
except InvalidURL as e:
print(f"Invalid URL: {line} \n Skipping...")
continue
except IncompleteRead as e:
print(f"IncompleteRead on {line}")
continue
except IOError as err:
error = sys.exc_info()[0]
print(f"Error: {error}\nCan't write on file: {output_file}")
print(f"Error: {err}\nCan't write on file: {output_file}")
file.close()


def intermex(input_file):
def intermex(input_file, yara):
""" Input links from file and extract them into terminal.
:param input_file: String: File name of links file.
:param yara: Integer: Keyword search argument.
:return: None
"""
try:
with open(input_file, 'r') as file:
for line in file:
print((urllib.request.urlopen(line).read()))
except (HTTPError, URLError) as err:
print(f"HTTPError: {err}")
content = urllib.request.urlopen(line).read()
if yara is not None:
full_match_keywords = check_yara(raw=content, yara=yara)

if len(full_match_keywords) == 0:
print(f"No matches in: {line}")
print(content)
except (HTTPError, URLError, InvalidURL) as err:
print(f"Request Error: {err}")
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n## Not valid file")


def outex(website, output_file, out_path):
def outex(website, output_file, out_path, yara):
""" Scrapes the contents of the provided web address and outputs the
contents to file.
:param website: String: Url of web address to scrape.
:param output_file: String: Filename of the results.
:param out_path: String: Folder name of the output findings.
:param yara: Integer: Keyword search argument.
:return: None
"""
# Extract page to file
try:
output_file = out_path + "/" + output_file
content = urllib.request.urlopen(website).read()

if yara is not None:
full_match_keywords = check_yara(raw=content, yara=yara)

if len(full_match_keywords) == 0:
print(f"No matches in: {website}")

with open(output_file, 'wb') as file:
file.write(urllib.request.urlopen(website).read())
file.write(content)
print(f"## File created on: {os.getcwd()}/{output_file}")
except (HTTPError, URLError) as err:
except (HTTPError, URLError, InvalidURL) as err:
print(f"HTTPError: {err}")
except IOError as err:
# error = sys.exc_info()[0]
print(f"Error: {err}\n Can't write on file: {output_file}")


def termex(website):
def termex(website, yara):
""" Scrapes provided web address and prints the results to the terminal.
:param website: String: URL of website to scrape.
:param yara: Integer: Keyword search argument.
:return: None
"""
try:
print((urllib.request.urlopen(website).read()))
except (urllib.error.HTTPError, urllib.error.URLError) as err:
content = urllib.request.urlopen(website).read()
if yara is not None:
full_match_keywords = check_yara(content, yara)

if len(full_match_keywords) == 0:
# No match.
print(f"No matches in: {website}")
return

print(content)
except (HTTPError, URLError, InvalidURL) as err:
print(f"Error: ({err}) {website}")
return


def extractor(website, crawl, output_file, input_file, out_path):
def extractor(website, crawl, output_file, input_file, out_path, yara):
""" Extractor - scrapes the resulting website or discovered links.
:param website: String: URL of website to scrape.
@@ -112,19 +193,20 @@ def extractor(website, crawl, output_file, input_file, out_path):
:param output_file: String: Filename of resulting output from scrape.
:param input_file: String: Filename of crawled/discovered URLs
:param out_path: String: Dir path for output files.
:param yara: Integer: keyword search option.
:return: None
"""
# TODO: Return output to torcrawl.py
if len(input_file) > 0:
if crawl:
cinex(input_file, out_path)
cinex(input_file, out_path, yara)
# TODO: Extract from list into a folder
# elif len(output_file) > 0:
# inoutex(website, input_file, output_file)
else:
intermex(input_file)
intermex(input_file, yara)
else:
if len(output_file) > 0:
outex(website, output_file, out_path)
outex(website, output_file, out_path, yara)
else:
termex(website)
termex(website, yara)
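
For orientation, a hypothetical direct call to the updated entry point could look like the sketch below; the parameter values are made up for illustration, and in practice torcrawl.py builds them from the command-line arguments:

```python
# Illustrative call of the updated extractor() signature (values are made up).
from modules.extractor import extractor

extractor(
    website='http://example.com/',  # page to scrape
    crawl=False,                    # not reading a crawled links file
    output_file='example.html',     # write the page to this file...
    input_file='',                  # ...rather than ingesting a links file
    out_path='output',              # output directory
    yara=0,                         # 0 = match rules against raw HTML, 1 = text only
)
```
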
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
pysocks
beautifulsoup4>=4.7.1
requests>=2.21.0
yara
lxml
34 changes: 34 additions & 0 deletions res/keywords.yar
@@ -0,0 +1,34 @@
/*
Yara.
*/

/*
rule email_filter
{
meta:
author = "@the-siegfried"
score = 20
strings:
$email_add = /\b[\w-]+(\.[\w-]+)*@[\w-]+(\.[\w-]+)*\.[a-zA-Z-]+[\w-]\b/
condition:
any of them
}
*/

rule keyword_search
{
meta:
author = "@the-siegfried"
score = 90

strings:
$a = "Keyword1" fullword wide ascii nocase
$b = "Keyword Two" wide ascii nocase
$c = "kw 3" ascii
$d = "KEYWORD four" nocase
$e = "google-" nocase
condition:
any of them
}
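
Before pointing a crawl at an edited rules file, a quick stand-alone check like the sketch below can confirm the rules compile and fire; this is illustrative only and assumes the `yara-python` package plus a made-up sample string:

```python
# Illustrative check that res/keywords.yar compiles and fires (assumes yara-python).
import yara

rules = yara.compile('res/keywords.yar')

sample = "This page mentions google-analytics and Keyword1."  # made-up test input
for match in rules.match(data=sample):
    # Each match exposes the rule name and the strings that triggered it.
    print(match.rule, match.strings)
```
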
18 changes: 16 additions & 2 deletions torcrawl.py
@@ -20,6 +20,9 @@
(Default: terminal)
-i, --input filename : Input file with URL(s) (separated by line)
-o, --output [filename] : Output page(s) to file(s) (for one page)
-y, --yara : Yara keyword search for page categorisation;
rules are read from the /res folder. 0 = search the whole HTML object,
1 = search only the text.

Crawl:
-c, --crawl : Crawl website (Default output on /links.txt)
@@ -155,6 +158,12 @@ def main():
'--folder',
help='The root directory which will contain the generated files'
)
parser.add_argument(
'-y',
'--yara',
help='Check for keywords and only scrape documents that contain a '
'match. 0 = search the whole HTML object, 1 = search only the text.'
)

args = parser.parse_args()

@@ -164,6 +173,9 @@
c_depth = args.cdepth if args.cdepth else 0
c_pause = args.cpause if args.cpause else 1

if args.yara and int(args.yara) not in [0, 1]:
parser.error("argument -y/--yara: expected argument 0 or 1.")

# Connect to TOR
if args.without is False:
check_tor(args.verbose)
@@ -194,9 +206,11 @@
print(f"## File created on {os.getcwd()}/{out_path}/links.txt")
if args.extract:
input_file = out_path + "/links.txt"
extractor(website, args.crawl, output_file, input_file, out_path)
extractor(website, args.crawl, output_file, input_file, out_path,
int(args.yara) if args.yara else None)
else:
extractor(website, args.crawl, output_file, input_file, out_path)
extractor(website, args.crawl, output_file, input_file, out_path,
int(args.yara) if args.yara else None)


# Stub to call main method.
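
One design note on the 0/1 validation added in main(): a hypothetical alternative (not what this PR implements) is to let argparse enforce the choice itself, so args.yara stays None when the flag is omitted and needs no manual range check:

```python
# Hypothetical alternative for the -y/--yara option (not part of this PR):
# argparse enforces the 0/1 choice and defaults to None when the flag is omitted.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '-y', '--yara',
    type=int,
    choices=[0, 1],
    default=None,
    help='0 = search the whole HTML object, 1 = search only the text.'
)

args = parser.parse_args(['-y', '1'])  # example invocation
print(args.yara)  # 1
```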