Fixed VirusTotal module (#1172)
* Added a call to the API endpoint to explicitly gather subdomains from ZoomEye, updated user agents, replaced orjson with ujson, and fixed a 'substring not found' error.

* Updated orjson to ujson.

* Fixed a semantic error in the HTML check in the Google workaround.

* Fixed flake8 errors.

* Fixed VT to use API.

* Fixed virustotal module.

* Fixed an edge case that could cause an infinite loop.
NotoriousRebel authored Aug 6, 2022
1 parent 121e23b commit c801db6
Showing 5 changed files with 75 additions and 27 deletions.
2 changes: 1 addition & 1 deletion requirements/base.txt
@@ -19,4 +19,4 @@ setuptools==63.4.1
shodan==1.28.0
slowapi==0.1.5
uvicorn==0.18.2
uvloop==0.16.0; platform_system != "Windows"
uvloop==0.16.0; platform_system != "Windows"
2 changes: 1 addition & 1 deletion theHarvester.py
@@ -25,4 +25,4 @@

            # As we are not using Windows we can change the spawn method to fork for greater performance
            aiomultiprocess.set_context("fork")
    asyncio.run(__main__.entry_point())
    asyncio.run(__main__.entry_point())
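
For context, the idea behind the comment above as a minimal standalone sketch (not part of this commit): the fork start method is only usable on POSIX platforms, so the call is guarded by platform. The sys.platform check here is illustrative; the repository's own gating may differ.

import sys

import aiomultiprocess

# Illustrative guard: only request the cheaper fork start method off Windows.
if sys.platform != "win32":
    # fork reuses the already-initialised parent interpreter instead of spawning a fresh one
    aiomultiprocess.set_context("fork")
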
14 changes: 1 addition & 13 deletions theHarvester/__main__.py
@@ -154,13 +154,11 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
        if store_people:
            people_list = await search_engine.get_people()
            await db_stash.store_all(word, people_list, 'people', source)

        if store_links:
            links = await search_engine.get_links()
            linkedin_links_tracker.extend(links)
            if len(links) > 0:
                await db.store_all(word, links, 'linkedinlinks', engineitem)

        if store_interestingurls:
            iurls = await search_engine.get_interestingurls()
            interesting_urls.extend(iurls)
@@ -286,8 +284,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
                    stor_lst.append(store(github_search, engineitem, store_host=True, store_emails=True))
                except MissingKey as ex:
                    print(ex)
                else:
                    pass

            elif engineitem == 'hackertarget':
                from theHarvester.discovery import hackertarget
@@ -303,8 +299,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
                except Exception as e:
                    if isinstance(e, MissingKey):
                        print(e)
                    else:
                        pass

            elif engineitem == 'intelx':
                from theHarvester.discovery import intelxsearch
@@ -388,8 +382,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
                except Exception as e:
                    if isinstance(e, MissingKey):
                        print(e)
                    else:
                        pass

            elif engineitem == 'sublist3r':
                from theHarvester.discovery import sublist3r
@@ -432,8 +424,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
                except Exception as e:
                    if isinstance(e, MissingKey):
                        print(e)
                    else:
                        pass

            elif engineitem == 'yahoo':
                from theHarvester.discovery import yahoosearch
@@ -449,8 +439,6 @@ async def store(search_engine: Any, source: str, process_param: Any = None, stor
                except Exception as e:
                    if isinstance(e, MissingKey):
                        print(e)
                    else:
                        pass
    else:
        try:
            # Check if dns_brute is defined
@@ -836,4 +824,4 @@ async def entry_point():
        print('\n\n[!] ctrl+c detected from user, quitting.\n\n ')
    except Exception as error_entry_point:
        print(error_entry_point)
        sys.exit(1)
        sys.exit(1)
82 changes: 71 additions & 11 deletions theHarvester/discovery/virustotal.py
@@ -1,28 +1,88 @@
from theHarvester.discovery.constants import *
from theHarvester.lib.core import *
from pprint import pprint


class SearchVirustotal:

    def __init__(self, word):
        self.word = word
        self.key = Core.virustotal_key()
        if self.key is None:
            raise MissingKey('virustotal')
        self.totalhosts = set
        self.word = word
        self.proxy = False
        self.hostnames = []

    async def do_search(self):
        url = f'https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40'
        response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(),
                                                                           'X-APIKEY': self.key},
                                                proxy=self.proxy)
        entry = [host for host in response]
        pprint(entry.items())
        # TODO determine if more endpoints can yield useful info given a domain
        # based on: https://developers.virustotal.com/reference/domains-relationships
        # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40"
        headers = {
            'User-Agent': Core.get_user_agent(),
            "Accept": "application/json",
            "x-apikey": self.key
        }
        base_url = f"https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40"
        cursor = ''
        count = 0
        fail_counter = 0
        counter = 0
        breakcon = False
        while True:
            if breakcon:
                break
            # rate limit is 4 per minute
            # TODO add timer logic if proven to be needed
            # in the meantime sleeping 16 seconds should eliminate hitting the rate limit
            # in case rate limit is hit, fail counter exists and sleep for 65 seconds
            send_url = base_url + "&cursor=" + cursor if cursor != '' and len(cursor) > 2 else base_url
            responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True)
            jdata = responses[0]
            if 'data' not in jdata.keys():
                await asyncio.sleep(60 + 5)
                fail_counter += 1
            if 'meta' in jdata.keys():
                cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else ''
                if len(cursor) == 0 and 'data' in jdata.keys():
                    # if cursor no longer is within the meta field have hit last entry
                    breakcon = True
                count += jdata['meta']['count']
            if count == 0 or fail_counter >= 2:
                break
            if 'data' in jdata.keys():
                data = jdata['data']
                self.hostnames.extend(await self.parse_hostnames(data, self.word))
                counter += 1
            await asyncio.sleep(16)
        self.hostnames = list(sorted(set(self.hostnames)))
        # verify domains such as x.x.com.multicdn.x.com are parsed properly
        self.hostnames = [host for host in self.hostnames if ((len(host.split('.')) >= 3)
                          and host.split('.')[-2] == self.word.split('.')[-2])]

    async def get_hostnames(self) -> list:
        return self.hostnames

    # async def get_hostnames(self) -> set:
    #     return self.total_results
    @staticmethod
    async def parse_hostnames(data, word):
        total_subdomains = set()
        for attribute in data:
            total_subdomains.add(attribute['id'].replace('"', '').replace('www.', ''))
            attributes = attribute['attributes']
            total_subdomains.update(
                {value['value'].replace('"', '').replace('www.', '') for value in attributes['last_dns_records'] if
                 word in value['value']})
            if 'last_https_certificate' in attributes.keys():
                total_subdomains.update({value.replace('"', '').replace('www.', '') for value in
                                         attributes['last_https_certificate']['extensions']['subject_alternative_name']
                                         if word in value})
        total_subdomains = list(sorted(total_subdomains))
        # Other false positives may occur over time and yes there are other ways to parse this, feel free to implement
        # them and submit a PR or raise an issue if you run into this filtering not being enough
        # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth parsing
        total_subdomains = [x for x in total_subdomains if
                            not str(x).endswith('edgekey.net') and not str(x).endswith('akadns.net')
                            and 'include:_spf' not in str(x)]
        total_subdomains.sort()
        return total_subdomains

    async def process(self, proxy=False):
        self.proxy = proxy
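
For orientation, a hedged usage sketch of the reworked module (illustrative, not shipped with theHarvester). It assumes a VirusTotal API key is configured for theHarvester, that MissingKey lives in theHarvester.discovery.constants, and that process() goes on to call do_search() (its tail is truncated in the diff above). Because of the 4-requests-per-minute rate limit handled above, each page of up to 40 subdomains costs roughly 16 seconds.

import asyncio

from theHarvester.discovery import virustotal
from theHarvester.discovery.constants import MissingKey


async def main() -> None:
    try:
        # raises MissingKey when no VirusTotal API key is configured
        search = virustotal.SearchVirustotal('example.com')
    except MissingKey as err:
        print(err)
        return
    # expected to paginate the v3 /domains/{domain}/subdomains endpoint via do_search()
    await search.process(proxy=False)
    for host in await search.get_hostnames():
        # hostnames come back deduplicated, sorted, and filtered to the target domain
        print(host)


asyncio.run(main())

Note that the final filter in do_search() keeps only hostnames whose second-to-last label matches the target's (for example.com, cdn.images.example.com passes while example.com.badsite.net does not), and parse_hostnames() additionally drops edgekey.net/akadns.net CDN names and SPF include: strings.
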
2 changes: 1 addition & 1 deletion theHarvester/lib/core.py
@@ -357,4 +357,4 @@ async def fetch_all(cls, urls, headers='', params='', json=False, takeover=False
                return texts
            else:
                texts = await asyncio.gather(*[AsyncFetcher.fetch(session, url, params, json) for url in urls])
                return texts
                return texts
