-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
62 lines (50 loc) · 1.79 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
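Driver script for the crawler.

Prompts for a project name, a start URL and an optional search word, runs
Spider over the queued URLs until Spider.wait_list is empty, then prints the
number of crawled URLs and the URLs in which the search word was found.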
"""
from Spider import Spider
from domainChecker import get_domain_name
from files_organization import *
from urllib.request import urlopen
import re
# Accepts absolute http(s)/ftp(s) URLs whose host is a dotted domain name,
# "localhost" or a dotted-quad IP address, with an optional port and an
# optional path/query part. Matching is case-insensitive.
regex = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http, https, ftp or ftps
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # host as a domain name
    r'localhost|'  # host as the literal "localhost"
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # host as four dot-separated numbers
    r'(?::\d+)?'  # ":port" may be omitted
    r'(?:/?|[/?]\S+)$',  # nothing, a lone "/", or a path/query without spaces
    flags=re.IGNORECASE,
)
# Placeholder defaults. PROJECT_NAME, BASE_URL, DOMAIN and SEARCH_WORD are all
# replaced below (by user input or a value derived from it) before being used;
# SPIDER_ID is the counter used to name the spiders.
PROJECT_NAME = 'crackstation'
BASE_URL = 'https://crackstation.net/'
DOMAIN = ''
SPIDER_ID = 1
SEARCH_WORD = 'security'


def _url_is_reachable(url):
    """Return True when *url* passes the URL pattern and answers with code 200.

    The URL is first checked against the module-level ``regex``; only then is
    it actually opened. Any status other than 200 is rejected.

    Failures while opening the URL (unreachable host, HTTP error status,
    unsupported scheme, malformed URL) are reported as ``False`` instead of
    propagating, so the caller can simply ask the user again.
    """
    if regex.match(url) is None:
        return False
    try:
        # The context manager closes the connection once the status is read.
        with urlopen(url) as response:
            return response.getcode() == 200
    except (OSError, ValueError):
        # urllib's URLError/HTTPError derive from OSError; ValueError covers
        # URLs that urlopen itself refuses to parse.
        return False


PROJECT_NAME = input('Enter the project name:\t')

# Keep asking until the user supplies a URL that is well-formed and reachable.
while True:
    BASE_URL = input('Enter The website URL:\t')
    if _url_is_reachable(BASE_URL):
        break
    print("WRONG URL")

DOMAIN = get_domain_name(BASE_URL)
SEARCH_WORD = input('Enter the search text, if there is none press enter:\t')

# The first spider starts from the URL the user entered.
Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD)

# Launch one spider per pending URL until nothing is left to crawl.
while Spider.wait_list:
    # Pick a pending URL without changing the set: it is popped and put
    # straight back. NOTE(review): presumably Spider itself removes the URL
    # from wait_list once crawled, otherwise this loop cannot end -- confirm.
    BASE_URL = Spider.wait_list.pop()
    Spider.wait_list.add(BASE_URL)
    Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD)
    SPIDER_ID += 1

URLS_GATHERED = len(Spider.crawled)
print('\n' + "Finished Crawling.\n" + "Number of URLs Gathered:\t" + str(URLS_GATHERED))

# Search results are only reported when a search word was given.
if SEARCH_WORD != '':
    print("\nSearch Results:\nThe Search Word Found in These URLS:\n")
    for url in Spider.search_results:
        print(url)
    # NOTE: writing the results to disk is currently disabled:
    # write_set(Spider.search_results_file, Spider.search_results)