main.go
// Package main contains the main functionality of the program.
// This program spiders a given domain and returns the input fields
// found in the responses.
package main

import (
    "bufio"
    "crypto/tls"
    "flag"
    "fmt"
    "log"
    "net/http"
    "net/url"
    "os"
    "strings"
    "sync"

    "golang.org/x/net/html"
    "golang.org/x/net/html/atom"
)
// client is an HTTP client that ignores TLS certificate errors
// (e.g. self-signed certificates on targets).
var client = http.Client{
    Transport: &http.Transport{
        TLSClientConfig: &tls.Config{
            InsecureSkipVerify: true,
        },
    },
}

// Visited tracks visited URLs, to avoid redundant requests and crawl loops.
// The mutex guards the URLs map, which is accessed from multiple goroutines.
type Visited struct {
    URLs  map[string]bool
    mutex sync.RWMutex
}

var visited Visited
// Whitelist is the set of targets that are allowed to be spidered and searched.
// Targets can be either domains or IP addresses, and must include the scheme
// (http or https). Examples: http://www.example.com, https://127.0.0.1:8080
type Whitelist struct {
    Targets []*url.URL
}

var whitelist Whitelist

// maxWorkers is a semaphore channel that limits the number of concurrent workers.
var maxWorkers chan struct{}

// concurrencyLimit is the semaphore capacity, derived from the -concurrency flag.
var concurrencyLimit int

// URLsInProcess is a wait group used to ensure that all URLs are processed
// before the program exits.
var URLsInProcess sync.WaitGroup

// The command-line flags
var flagStartURL = flag.String("urls", "", "URL or comma-separated list of URLs to search. The domain and scheme will be used as the whitelist.")
var flagURLFile = flag.String("url-file", "", "The location (relative or absolute path) of a file of newline-separated URLs to search.")
var flagConcurrency = flag.Int("concurrency", 3, "The level of concurrency in network requests and internal data processing. 0 - 5; 0 = no concurrency, 5 = very high level of concurrency.")
var flagVerbose = flag.Bool("v", false, "Enable verbose logging to the console.")
var flagVerbose2 = flag.Bool("vv", false, "Enable doubly-verbose logging to the console.")
// main is the entry point for the application. It parses the flags
// provided by the user and queues up any starting URLs for processing
// by the data router.
func main() {
    // Send log output to stdout
    log.SetOutput(os.Stdout)

    // Configure the usage message
    flag.Usage = func() {
        fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
        fmt.Fprint(os.Stderr, " --help/-h: Displays this message\n")
        flag.PrintDefaults()
        fmt.Fprint(os.Stderr, "\nExamples:\n")
        fmt.Fprintf(os.Stderr, "\t%s -urls=http://www.example.com/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -urls=https://www.example.com/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -urls=http://127.0.0.1/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -urls=http://127.0.0.1:8080/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -concurrency=0 -urls=http://127.0.0.1:8080/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -concurrency=5 -urls=http://127.0.0.1:8080/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -urls=http://127.0.0.1,http://www.example.com/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -url-file=/root/urls.txt\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -url-file=urls.txt\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -v -urls=http://www.example.com/example/\n", os.Args[0])
        fmt.Fprintf(os.Stderr, "\t%s -vv -urls=http://www.example.com/example/page/1?id=2#heading\n", os.Args[0])
    }

    // Parse the command-line flags provided
    flag.Parse()

    // Ensure that at least one of the URL flags was provided
    if *flagStartURL == "" && *flagURLFile == "" {
        flag.Usage()
        os.Exit(1)
    }

    // Set up the visited-URLs tracker
    visited = Visited{
        URLs: make(map[string]bool),
    }

    // Map the -concurrency flag to a worker limit for requests and internal
    // data processing: 0->1, 1->2, 2->5, 3->10 (default), 4->20, 5->50
    switch *flagConcurrency {
    case 0:
        // No concurrency
        concurrencyLimit = 1
    case 1:
        concurrencyLimit = 2
    case 2:
        concurrencyLimit = 5
    case 4:
        concurrencyLimit = 20
    case 5:
        concurrencyLimit = 50
    default:
        // The default flag value of 3, and any out-of-range value
        concurrencyLimit = 10
    }
    maxWorkers = make(chan struct{}, concurrencyLimit)

    // Check for values in the `-urls` flag
    if *flagStartURL != "" {
        // Prepare the starting URLs
        startURLs := strings.Split(*flagStartURL, ",")
        // Iterate through the URLs, whitelisting and queueing each one
        for _, urlValue := range startURLs {
            // Check that the start URL is valid
            validURL, err := url.Parse(urlValue)
            if err != nil || validURL.String() == "" {
                log.Printf("[ERROR] Invalid URL provided: %s\n", urlValue)
                flag.Usage()
                os.Exit(1)
            }
            // Remove any fragment (hash) from the URL
            validURL.Fragment = ""
            // Add the URL to the whitelist
            whitelist.Targets = append(whitelist.Targets, validURL)
            // Queue up the URL
            addURL(validURL)
        }
    }

    // Check for values in the `-url-file` flag
    if *flagURLFile != "" {
        // Attempt to open the URL file
        file, err := os.Open(*flagURLFile)
        if err != nil {
            log.Printf("[ERROR] Unable to open the file: %s\n", *flagURLFile)
            flag.Usage()
            os.Exit(1)
        }
        // Ensure the file is closed
        defer file.Close()

        // Iterate through the lines of the file
        scanner := bufio.NewScanner(file)
        for scanner.Scan() {
            // Grab the next URL, skipping blank lines
            nextLine := scanner.Text()
            if strings.TrimSpace(nextLine) == "" {
                continue
            }
            validURL, err := url.Parse(nextLine)
            if err != nil || validURL.String() == "" {
                log.Printf("[ERROR] Invalid URL provided: %s\n", nextLine)
                flag.Usage()
                os.Exit(1)
            }
            // Remove any fragment (hash) from the URL
            validURL.Fragment = ""
            // Add the URL to the whitelist
            whitelist.Targets = append(whitelist.Targets, validURL)
            // Queue up the URL
            addURL(validURL)
        }
        // Report any error encountered while reading the file
        if err := scanner.Err(); err != nil {
            log.Printf("[ERROR] Unable to read the file %s: %s\n", *flagURLFile, err.Error())
            os.Exit(1)
        }
    }

    // Wait for all URLs to be processed
    URLsInProcess.Wait()
}
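// For illustration, a file passed via -url-file is expected to hold one URL
// per line; hypothetical contents might look like:
//
//    http://www.example.com/
//    https://127.0.0.1:8443/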
// dataRouter requests the given URL and passes the response to the helper
// functions that spider links and extract input fields.
// It returns any errors it receives throughout this process.
// Output functionality currently occurs in the helper functions.
func dataRouter(urlValue *url.URL) (err error) {
    // Internal wait group for processing the response concurrently
    var wg sync.WaitGroup

    defer URLsInProcess.Done() // Mark this URL as processed on exit

    // Acquire a worker slot from the semaphore, and release it on exit
    maxWorkers <- struct{}{}
    defer func() {
        <-maxWorkers
    }()

    // Fetch the URL's document body
    response, err := client.Get(urlValue.String())
    if err != nil {
        log.Printf("[ERROR] [%s] %s\n", urlValue.String(), err.Error())
        return
    }
    defer response.Body.Close() // Make sure the response body gets closed

    document, err := html.Parse(response.Body)
    if err != nil {
        log.Printf("[ERROR] [%s] %s\n", urlValue.String(), err.Error())
        return
    }

    // Run the spidering function on the HTML document
    wg.Add(1)
    go func() {
        defer wg.Done()
        getAnchors(document, urlValue)
    }()

    // Search for input fields in the HTML document
    wg.Add(1)
    go func() {
        defer wg.Done()
        getInputs(document, urlValue)
    }()

    // Wait for both helpers to finish
    wg.Wait()
    return
}
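// Note that each call to dataRouter runs on its own goroutine (started in
// addURL), so the maxWorkers semaphore above is what bounds the number of
// in-flight requests: goroutines beyond the limit block on the channel send
// until another worker releases its slot.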
// getAnchors parses the links out of the anchor elements found in the
// provided HTML node. It is run as its own goroutine by dataRouter,
// which waits for it to complete.
// currentURL is the URL the document was fetched from; it is used for
// contextual logging and for resolving relative links.
func getAnchors(document *html.Node, currentURL *url.URL) {
    // VERBOSE 2
    if *flagVerbose2 {
        fmt.Printf("[VERBOSE] [%s] Processing HTML for links\n", currentURL.String())
    }

    // Recursively search the document tree for anchor elements
    var nodeSearch func(*html.Node)
    nodeSearch = func(node *html.Node) {
        if node.Type == html.ElementNode && node.DataAtom == atom.A {
            // We've found an anchor tag; get the href value
            for _, attribute := range node.Attr {
                if attribute.Key == "href" {
                    // Skip useless links
                    if attribute.Val == "#" || attribute.Val == "" {
                        continue
                    }
                    // Make sure it's a valid URL
                    urlValue, err := url.Parse(attribute.Val)
                    if err != nil || urlValue.String() == "" {
                        log.Printf("[ERROR] [%s] Error parsing URL: %s\n", currentURL.String(), attribute.Val)
                        continue
                    }
                    // Resolve relative URLs. The scheme-relative check must come
                    // first, since a "//" prefix would otherwise also match the
                    // "/" prefix check.
                    if urlValue.Scheme == "" {
                        if strings.HasPrefix(attribute.Val, "//") {
                            // Scheme-relative URL: inherit only the scheme
                            urlValue.Scheme = currentURL.Scheme
                        } else if strings.HasPrefix(attribute.Val, "/") {
                            // Root-relative path: inherit the scheme and host
                            urlValue.Scheme = currentURL.Scheme
                            urlValue.Host = currentURL.Host
                        }
                        // Other relative paths (no leading slash) are left as-is,
                        // and will be filtered out by the whitelist check in addURL.
                    }
                    // Queue up the URL
                    addURL(urlValue)
                }
            }
        }
        // Recurse down the tree
        for child := node.FirstChild; child != nil; child = child.NextSibling {
            nodeSearch(child)
        }
    }
    nodeSearch(document)
}
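// For illustration, given a document fetched from https://www.example.com/a/
// (a hypothetical address), the resolution above behaves as follows:
//
//    href="/login"              -> https://www.example.com/login
//    href="//www.example.com/b" -> https://www.example.com/b
//    href="page2.html"          -> left as-is, then dropped by the whitelist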
// addURL passes the URL to the data router for processing, if it is
// whitelisted and has not already been visited.
func addURL(urlValue *url.URL) {
    // Make sure the URL is in the whitelisted domains list
    if !isWhitelisted(urlValue) {
        return
    }

    // Rebuild the URL string, removing any fragment (hash) from the link
    urlValue.Fragment = ""
    urlString := urlValue.String()

    // Canonicalize the URL by trimming any trailing slash, so that both
    // spellings of the same page share one entry in the visited map
    canonical := strings.TrimSuffix(urlString, "/")

    // Make sure the URL has not been visited
    visited.mutex.Lock()
    defer visited.mutex.Unlock()
    if visited.URLs[canonical] {
        return
    }

    // VERBOSE
    if *flagVerbose || *flagVerbose2 {
        fmt.Printf("[VERBOSE] [%s] URL found\n", urlString)
    }

    // Mark the URL as visited now, to prevent races with other workers
    visited.URLs[canonical] = true

    // Increment the global wait group
    URLsInProcess.Add(1)

    // Start processing the URL on a separate goroutine
    go dataRouter(urlValue)
}
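// For example, https://www.example.com/page and https://www.example.com/page/
// (hypothetical addresses) share the canonical key https://www.example.com/page,
// so the page is only fetched once regardless of which spelling is seen first.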
// isWhitelisted reports whether the provided URL is on the whitelist.
// The scheme and host must both match a whitelisted target, compared
// case-insensitively.
func isWhitelisted(urlValue *url.URL) bool {
    for _, target := range whitelist.Targets {
        if strings.EqualFold(urlValue.Scheme, target.Scheme) && strings.EqualFold(urlValue.Host, target.Host) {
            return true
        }
    }
    return false
}
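// For instance, with http://www.example.com whitelisted, a link to
// http://WWW.EXAMPLE.COM/path is accepted (hosts are compared
// case-insensitively), while https://www.example.com is rejected because
// the scheme differs.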
// getInputs parses the input elements out of the provided HTML node.
// It is run as its own goroutine by dataRouter, which waits for it to complete.
// urlValue is the URL the document was fetched from; it is used for
// contextual logging and output.
func getInputs(document *html.Node, urlValue *url.URL) {
    // VERBOSE 2
    if *flagVerbose2 {
        fmt.Printf("[VERBOSE] [%s] Processing HTML for inputs\n", urlValue.String())
    }

    // Collect all the input fields found on the current URL
    var inputs []string

    // Recursively search the document tree for input elements
    var nodeSearch func(*html.Node)
    nodeSearch = func(node *html.Node) {
        if node.Type == html.ElementNode && node.DataAtom == atom.Input {
            // We've found an input tag; recreate its source code.
            // <input> is a void element, so no closing tag is emitted.
            input := "<input"
            for _, attribute := range node.Attr {
                input += fmt.Sprintf(" %s=%q", attribute.Key, attribute.Val)
            }
            input += ">"
            // Remove newline characters
            cleanInput := strings.ReplaceAll(input, "\n", "")
            // Add the input tag to the inputs slice
            inputs = append(inputs, cleanInput)
        }
        // Recurse down the tree
        for child := node.FirstChild; child != nil; child = child.NextSibling {
            nodeSearch(child)
        }
    }
    nodeSearch(document)

    // Output the input elements found on the current URL, if any
    if len(inputs) > 0 {
        fmt.Printf("[%s]\n", urlValue.String())
        for _, input := range inputs {
            fmt.Printf("\t%s\n", input)
        }
        // Extra line for spacing
        fmt.Println()
    }
}
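// For illustration, a page at https://www.example.com/login (a hypothetical
// address) containing a username and password form would be reported as:
//
//    [https://www.example.com/login]
//        <input type="text" name="username">
//        <input type="password" name="password">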