
Commit

chore: -
enenumxela committed Mar 17, 2023
1 parent 46973e1 commit 0d234f7
Showing 5 changed files with 74 additions and 100 deletions.
45 changes: 22 additions & 23 deletions cmd/hqcrawl3r/main.go
@@ -17,36 +17,32 @@ import (
)

var (
concurrency int
debug bool
depth int
headers []string
includeSubdomains bool
proxy string
delay int
maxRandomDelay int
parallelism int
timeout int
userAgent string
targetURL, targetURLs string
monochrome bool
verbosity string
targetURL, targetURLs string
includeSubdomains bool
depth int
userAgent string
headers []string
timeout, delay, maxRandomDelay int
proxy string
parallelism, concurrency int
debug, monochrome bool
verbosity string
)

func init() {
pflag.IntVarP(&concurrency, "concurrency", "c", 10, "")
pflag.BoolVar(&debug, "debug", false, "")
pflag.StringVarP(&targetURL, "url", "u", "", "")
pflag.StringVarP(&targetURLs, "urls", "U", "", "")
pflag.BoolVar(&includeSubdomains, "include-subs", false, "")
pflag.IntVarP(&depth, "depth", "d", 2, "")
pflag.StringVar(&userAgent, "user-agent", "web", "")
pflag.StringSliceVarP(&headers, "headers", "H", []string{}, "")
pflag.BoolVar(&includeSubdomains, "include-subs", false, "")
pflag.StringVar(&proxy, "proxy", "", "")
pflag.IntVar(&timeout, "timeout", 10, "")
pflag.IntVar(&delay, "delay", 1, "")
pflag.IntVar(&maxRandomDelay, "max-random-delay", 1, "")
pflag.StringVar(&proxy, "proxy", "", "")
pflag.IntVarP(&parallelism, "parallelism", "p", 10, "")
pflag.IntVar(&timeout, "timeout", 10, "")
pflag.StringVarP(&targetURL, "url", "u", "", "")
pflag.StringVarP(&targetURLs, "urls", "U", "", "")
pflag.StringVar(&userAgent, "user-agent", "web", "")
pflag.IntVarP(&concurrency, "concurrency", "c", 10, "")
pflag.BoolVar(&debug, "debug", false, "")
pflag.BoolVarP(&monochrome, "monochrome", "m", false, "")
pflag.StringVarP(&verbosity, "verbosity", "v", string(levels.LevelInfo), "")

@@ -180,7 +176,10 @@ func main() {
go func() {
defer wg.Done()

crawler.Crawl()
_, err = crawler.Run()
if err != nil {
hqlog.Error().Msgf("%s", err)
}
}()

wg.Add(1)
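
A minimal sketch of the updated entry point, for context: construct Options, call the now pointer-returning New, then the renamed Run. It assumes the crawler package's import path and uses only the Options fields visible in this diff; error handling uses the standard library here, whereas the real main.go uses hqlog.

package main

import (
    "log"

    "github.com/hueristiq/hqcrawl3r/pkg/hqcrawl3r" // assumed import path for this repository
    hqurl "github.com/hueristiq/hqgoutils/url"     // path as it appears in this diff
)

func main() {
    // hqurl.Parse is used elsewhere in this diff and is assumed to return (*hqurl.URL, error).
    parsedURL, err := hqurl.Parse("https://example.com")
    if err != nil {
        log.Fatalln(err)
    }

    // Only Options fields visible in this diff are set; any others are omitted.
    crawler, err := hqcrawl3r.New(&hqcrawl3r.Options{
        TargetURL:         parsedURL,
        Depth:             2,
        IncludeSubdomains: true,
        UserAgent:         "web",
    })
    if err != nil {
        log.Fatalln(err)
    }

    // Run replaces the old Crawl method; the results channel is discarded here,
    // mirroring the updated main.go, and only the error is checked.
    if _, err = crawler.Run(); err != nil {
        log.Println(err)
    }
}
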
73 changes: 30 additions & 43 deletions pkg/hqcrawl3r/hqcrawl3r.go
@@ -2,7 +2,6 @@ package hqcrawl3r

import (
"crypto/tls"
"fmt"
"net"
"net/http"
"net/url"
@@ -21,8 +20,8 @@ type Crawler struct {
URL *hqurl.URL
Options *Options
PageCollector *colly.Collector
FilesRegex *regexp.Regexp
FileCollector *colly.Collector
URLsToLinkFindRegex *regexp.Regexp
URLsNotToRequestRegex *regexp.Regexp
}

@@ -41,10 +40,12 @@ type Options struct {
UserAgent string
}

var foundURLs sync.Map
var visitedURLs sync.Map
var (
foundURLs, visitedURLs sync.Map
)

func New(options *Options) (crawler Crawler, err error) {
func New(options *Options) (crawler *Crawler, err error) {
crawler = &Crawler{}
crawler.URL = options.TargetURL
crawler.Options = options

@@ -53,7 +54,6 @@ func New(options *Options) (crawler Crawler, err error) {
colly.MaxDepth(crawler.Options.Depth),
colly.IgnoreRobotsTxt(),
colly.Async(true),
colly.AllowURLRevisit(),
)

if crawler.Options.IncludeSubdomains {
@@ -157,55 +157,37 @@ func New(options *Options) (crawler Crawler, err error) {
crawler.PageCollector.ID = 1
crawler.FileCollector.ID = 2

crawler.URLsToLinkFindRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
crawler.FilesRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
crawler.URLsNotToRequestRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)

return
}

func (crawler *Crawler) Crawl() (results chan string, err error) {
crawler.PageCollector.OnRequest(func(request *colly.Request) {
URL := strings.TrimRight(request.URL.String(), "/")
func (crawler *Crawler) toFileCollector(URL string) (err error) {
if err = crawler.FileCollector.Visit(URL); err != nil {
return
}

if _, exists := visitedURLs.Load(URL); exists {
request.Abort()
return
}

return
}
func (crawler *Crawler) Run() (results chan string, err error) {
crawler.PageCollector.OnRequest(func(request *colly.Request) {
URL := strings.TrimRight(request.URL.String(), "/")

if match := crawler.URLsNotToRequestRegex.MatchString(URL); match {
request.Abort()

return
}

if match := crawler.URLsToLinkFindRegex.MatchString(URL); match {
if err = crawler.FileCollector.Visit(URL); err != nil {
fmt.Println(err)
}

if _, exists := visitedURLs.Load(URL); exists {
request.Abort()

return
}

visitedURLs.Store(URL, struct{}{})

return
})

crawler.FileCollector.OnResponse(func(response *colly.Response) {
URL := strings.TrimRight(response.Request.URL.String(), "/")

if _, exists := foundURLs.Load(URL); !exists {
return
}

if err := crawler.record(URL); err != nil {
return
}

foundURLs.Store(URL, struct{}{})
})

crawler.PageCollector.OnHTML("[href]", func(e *colly.HTMLElement) {
@@ -244,6 +226,14 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
foundURLs.Store(absoluteURL, struct{}{})

if _, exists := visitedURLs.Load(absoluteURL); !exists {
if match := crawler.FilesRegex.MatchString(absoluteURL); match {
if err = crawler.toFileCollector(absoluteURL); err != nil {
return
}

return
}

if err = e.Request.Visit(relativeURL); err != nil {
return
}
Expand All @@ -267,25 +257,23 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
if err = crawler.FileCollector.Visit(js); err != nil {
return
}

visitedURLs.Store(js, struct{}{})
}
}

visitedURLs.Store(URL, struct{}{})
})

crawler.FileCollector.OnResponse(func(response *colly.Response) {
links, err := crawler.FindLinks(string(response.Body))
body := decode(string(response.Body))

links, err := extractLinks(body)
if err != nil {
return
}

if len(links) < 1 {
return
}
for index := range links {
link := links[index]

for _, link := range links {
// Skip blank entries
if len(link) <= 0 {
continue
Expand Down Expand Up @@ -329,7 +317,6 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
return
}

// Async means we must .Wait() on each Collector
crawler.PageCollector.Wait()
crawler.FileCollector.Wait()

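For illustration, a standalone sketch of the routing these two expressions drive inside Run: requests for static assets are aborted, file-like URLs are handed to the file collector, and everything else stays with the page collector. The patterns are copied verbatim from the Crawler setup above; the sample URLs are hypothetical.

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Patterns copied verbatim from New() in this diff.
    filesRegex := regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
    urlsNotToRequestRegex := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)

    urls := []string{
        "https://example.com/static/app.js?v=3", // file-like: sent to the file collector
        "https://example.com/sitemap.xml",       // file-like: sent to the file collector
        "https://example.com/assets/logo.png",   // static asset: request aborted
        "https://example.com/style.css",         // static asset: request aborted
        "https://example.com/about",             // regular page: crawled normally
    }

    // Same order of checks as in Run(): abort first, then route to the file collector.
    for _, url := range urls {
        switch {
        case urlsNotToRequestRegex.MatchString(url):
            fmt.Println("abort   :", url)
        case filesRegex.MatchString(url):
            fmt.Println("to files:", url)
        default:
            fmt.Println("to pages:", url)
        }
    }
}
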
18 changes: 0 additions & 18 deletions pkg/hqcrawl3r/linkfinder.go

This file was deleted.

4 changes: 2 additions & 2 deletions pkg/hqcrawl3r/robots.go
@@ -2,7 +2,7 @@ package hqcrawl3r

import (
"fmt"
"io/ioutil"
"io"
"net/http"
"regexp"
"strings"
@@ -29,7 +29,7 @@ func (crawler *Crawler) ParseRobots() {
foundURLs.Store(robotsURL, struct{}{})
}

body, err := ioutil.ReadAll(res.Body)
body, err := io.ReadAll(res.Body)
if err != nil {
return
}
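
io.ReadAll is the direct replacement for the deprecated ioutil.ReadAll (Go 1.16+). A minimal, hypothetical sketch of fetching and reading a robots.txt body the same way:

package main

import (
    "fmt"
    "io"
    "net/http"
)

func main() {
    // Illustrative URL; in ParseRobots the robots.txt URL is presumably derived from the crawl target.
    res, err := http.Get("https://example.com/robots.txt")
    if err != nil {
        return
    }
    defer res.Body.Close()

    // io.ReadAll replaces ioutil.ReadAll with identical behaviour.
    body, err := io.ReadAll(res.Body)
    if err != nil {
        return
    }

    fmt.Printf("read %d bytes of robots.txt\n", len(body))
}
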
34 changes: 20 additions & 14 deletions pkg/hqcrawl3r/utils.go
@@ -9,6 +9,26 @@ import (
hqurl "github.com/hueristiq/hqgoutils/url"
)

var lfRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`)

func decode(source string) (decodedSource string) {
replacer := strings.NewReplacer(
`\u002f`, "/",
`\u0026`, "&",
)

decodedSource = replacer.Replace(source)

return
}

func extractLinks(source string) (links []string, err error) {
links = []string{}
links = append(links, lfRegex.FindAllString(source, -1)...)

return
}

func (crawler *Crawler) fixURL(URL string) (fixedURL string) {
// decode
// this ....
@@ -41,21 +61,7 @@ func (crawler *Crawler) fixURL(URL string) (fixedURL string) {
return
}

func decode(URL string) string {
// In case json encoded chars
replacer := strings.NewReplacer(
`\u002f`, "/",
`\u0026`, "&",
)

URL = replacer.Replace(strings.ToLower(URL))

return URL
}

func (crawler *Crawler) record(URL string) (err error) {
URL = decode(URL)

parsedURL, err := hqurl.Parse(URL)
if err != nil {
return
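
decode and extractLinks take over from the deleted linkfinder.go: a response body is un-escaped first, then scanned with the LinkFinder-style expression. A self-contained sketch with the pattern copied verbatim from above; the sample body is hypothetical, and matches keep their surrounding quotes because the pattern anchors on them.

package main

import (
    "fmt"
    "regexp"
    "strings"
)

// Pattern copied verbatim from utils.go above.
var lfRegex = regexp.MustCompile(`(?:"|')(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;| *()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{3,}(?:[\?|#][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:php|asp|aspx|jsp|json|action|html|js|txt|xml)(?:[\?|#][^"|']{0,}|)))(?:"|')`)

func decode(source string) string {
    // Un-escape the JSON-encoded characters handled in utils.go.
    return strings.NewReplacer(`\u002f`, "/", `\u0026`, "&").Replace(source)
}

func extractLinks(source string) []string {
    return lfRegex.FindAllString(source, -1)
}

func main() {
    body := `{"api":"\u002fapi\u002fv1\u002fusers?page=1","cdn":"https://cdn.example.com/app.js"}`

    for _, link := range extractLinks(decode(body)) {
        fmt.Println(link)
    }
    // Output:
    // "/api/v1/users?page=1"
    // "https://cdn.example.com/app.js"
}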
