Commit fd57d2f

chore: -
enenumxela committed Apr 17, 2022
1 parent 73618c3 commit fd57d2f
Showing 4 changed files with 75 additions and 69 deletions.
README.md: 6 changes (4 additions & 2 deletions)
@@ -24,9 +24,11 @@ A fast web crawler.

 ## Usage
 
-```text
-$ sigrawl3r -h
+```bash
+sigrawl3r -h
 ```
+
+```text
  _ _ _____
 ___(_) __ _ _ __ __ ___ _| |___ / _ __
 / __| |/ _` | '__/ _` \ \ /\ / / | |_ \| '__|
internal/crawler/crawler.go: 78 changes (39 additions & 39 deletions)
@@ -21,12 +21,12 @@ import (
 )
 
 type Crawler struct {
-    URL                      *urlx.URL
-    Configuration            *configuration.Configuration
-    PCollector               *colly.Collector
-    LinkFinderCollector      *colly.Collector
-    DocumentsToLinkFindRegex *regexp.Regexp
-    IgnoreRegex              *regexp.Regexp
+    URL                   *urlx.URL
+    Configuration         *configuration.Configuration
+    PageCollector         *colly.Collector
+    LinkFindCollector     *colly.Collector
+    URLsToLinkFindRegex   *regexp.Regexp
+    URLsNotToRequestRegex *regexp.Regexp
 }
 
 var foundURLs sync.Map
@@ -42,7 +42,7 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra

     configuration.AllowedDomains = append(configuration.AllowedDomains, []string{crawler.URL.Domain, "www." + crawler.URL.Domain}...)
 
-    crawler.PCollector = colly.NewCollector(
+    crawler.PageCollector = colly.NewCollector(
         colly.IgnoreRobotsTxt(),
         // limit crawling to the domain of the specified URL
         colly.AllowedDomains(configuration.AllowedDomains...),
@@ -54,15 +54,15 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra

     // if -subs is present, use regex to filter out subdomains in scope.
     if crawler.Configuration.IncludeSubdomains {
-        crawler.PCollector.AllowedDomains = nil
-        crawler.PCollector.URLFilters = []*regexp.Regexp{
+        crawler.PageCollector.AllowedDomains = nil
+        crawler.PageCollector.URLFilters = []*regexp.Regexp{
             regexp.MustCompile(".*(\\.|\\/\\/)" + strings.ReplaceAll(crawler.URL.Domain, ".", "\\.") + "((#|\\/|\\?).*)?"),
         }
     }
 
     // Debug
     if crawler.Configuration.Debug {
-        crawler.PCollector.SetDebugger(&debug.LogDebugger{})
+        crawler.PageCollector.SetDebugger(&debug.LogDebugger{})
     }
 
     // Setup the client with our transport to pass to the collectors
@@ -104,18 +104,18 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra
         },
     }
 
-    crawler.PCollector.SetClient(client)
+    crawler.PageCollector.SetClient(client)
 
     // set cookie
     if crawler.Configuration.Cookie != "" {
-        crawler.PCollector.OnRequest(func(request *colly.Request) {
+        crawler.PageCollector.OnRequest(func(request *colly.Request) {
             request.Headers.Set("Cookie", crawler.Configuration.Cookie)
         })
     }
 
     // set headers
     if crawler.Configuration.Headers != "" {
-        crawler.PCollector.OnRequest(func(request *colly.Request) {
+        crawler.PageCollector.OnRequest(func(request *colly.Request) {
             headers := strings.Split(crawler.Configuration.Headers, ";;")
             for _, header := range headers {
                 var parts []string
@@ -136,18 +136,18 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra
     // Set User-Agent
     switch ua := strings.ToLower(crawler.Configuration.UserAgent); {
     case strings.HasPrefix(ua, "mobi"):
-        extensions.RandomMobileUserAgent(crawler.PCollector)
+        extensions.RandomMobileUserAgent(crawler.PageCollector)
     case strings.HasPrefix(ua, "web"):
-        extensions.RandomUserAgent(crawler.PCollector)
+        extensions.RandomUserAgent(crawler.PageCollector)
     default:
-        crawler.PCollector.UserAgent = ua
+        crawler.PageCollector.UserAgent = ua
     }
 
     // Referer
-    extensions.Referer(crawler.PCollector)
+    extensions.Referer(crawler.PageCollector)
 
     // Set parallelism
-    if err = crawler.PCollector.Limit(&colly.LimitRule{
+    if err = crawler.PageCollector.Limit(&colly.LimitRule{
         DomainGlob:  "*",
         Parallelism: crawler.Configuration.Concurrency,
         Delay:       time.Duration(crawler.Configuration.Delay) * time.Second,
@@ -156,14 +156,14 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra
         return
     }
 
-    crawler.LinkFinderCollector = crawler.PCollector.Clone()
-    crawler.LinkFinderCollector.URLFilters = nil
+    crawler.LinkFindCollector = crawler.PageCollector.Clone()
+    crawler.LinkFindCollector.URLFilters = nil
 
-    crawler.PCollector.ID = 1
-    crawler.LinkFinderCollector.ID = 2
+    crawler.PageCollector.ID = 1
+    crawler.LinkFindCollector.ID = 2
 
-    crawler.DocumentsToLinkFindRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
-    crawler.IgnoreRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)
+    crawler.URLsToLinkFindRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
+    crawler.URLsNotToRequestRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)
 
     return crawler, nil
 }
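In passing, these are the two routing regexes under their new names: URLsToLinkFindRegex matches document URLs worth mining for links, and URLsNotToRequestRegex matches static assets to skip outright. A quick check of what each one matches, using the patterns exactly as defined above (the sample URLs are hypothetical):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The two patterns exactly as defined in the commit.
	toLinkFind := regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
	notToRequest := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)

	for _, u := range []string{ // hypothetical URLs, for illustration only
		"https://example.com/app.min.js",
		"https://example.com/data.json?v=2",
		"https://example.com/logo.png",
		"https://example.com/about",
	} {
		fmt.Printf("%-38s link-find=%-5v skip=%v\n",
			u, toLinkFind.MatchString(u), notToRequest.MatchString(u))
	}
}
```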
@@ -177,27 +177,27 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         defer browser.GlobalCancel()
 
         // If renderJavascript, pass the response's body to the renderer and then replace the body for .OnHTML to handle.
-        crawler.PCollector.OnResponse(func(request *colly.Response) {
+        crawler.PageCollector.OnResponse(func(request *colly.Response) {
             html := browser.GetRenderedSource(request.Request.URL.String())
             request.Body = []byte(html)
         })
     }
 
-    crawler.PCollector.OnRequest(func(request *colly.Request) {
+    crawler.PageCollector.OnRequest(func(request *colly.Request) {
         URL := strings.TrimRight(request.URL.String(), "/")
 
         if _, exists := visitedURLs.Load(URL); exists {
             request.Abort()
             return
         }
 
-        if match := crawler.IgnoreRegex.MatchString(URL); match {
+        if match := crawler.URLsNotToRequestRegex.MatchString(URL); match {
             request.Abort()
             return
         }
 
-        if match := crawler.DocumentsToLinkFindRegex.MatchString(URL); match {
-            crawler.LinkFinderCollector.Visit(URL)
+        if match := crawler.URLsToLinkFindRegex.MatchString(URL); match {
+            crawler.LinkFindCollector.Visit(URL)
             request.Abort()
             return
         }
@@ -207,7 +207,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         return
     })
 
-    crawler.LinkFinderCollector.OnResponse(func(response *colly.Response) {
+    crawler.LinkFindCollector.OnResponse(func(response *colly.Response) {
         URL := strings.TrimRight(response.Request.URL.String(), "/")
 
         if _, exists := foundURLs.Load(URL); !exists {
@@ -219,7 +219,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         }
     })
 
-    crawler.PCollector.OnHTML("*[href]", func(e *colly.HTMLElement) {
+    crawler.PageCollector.OnHTML("*[href]", func(e *colly.HTMLElement) {
         relativeURL := e.Attr("href")
         absoluteURL := e.Request.AbsoluteURL(relativeURL)
 
@@ -236,7 +236,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         }
     })
 
-    crawler.PCollector.OnHTML("script[src]", func(e *colly.HTMLElement) {
+    crawler.PageCollector.OnHTML("script[src]", func(e *colly.HTMLElement) {
         relativeURL := e.Attr("src")
         absoluteURL := e.Request.AbsoluteURL(relativeURL)
 
@@ -253,7 +253,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         }
     })
 
-    crawler.LinkFinderCollector.OnRequest(func(request *colly.Request) {
+    crawler.LinkFindCollector.OnRequest(func(request *colly.Request) {
         URL := request.URL.String()
 
         if _, exists := visitedURLs.Load(URL); exists {
@@ -266,15 +266,15 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
             js := strings.ReplaceAll(URL, ".min.js", ".js")
 
             if _, exists := visitedURLs.Load(js); !exists {
-                crawler.LinkFinderCollector.Visit(js)
+                crawler.LinkFindCollector.Visit(js)
                 visitedURLs.Store(js, struct{}{})
             }
         }
 
         visitedURLs.Store(URL, struct{}{})
     })
 
-    crawler.LinkFinderCollector.OnResponse(func(response *colly.Response) {
+    crawler.LinkFindCollector.OnResponse(func(response *colly.Response) {
         links, err := crawler.FindLinks(string(response.Body))
         if err != nil {
             return
@@ -317,16 +317,16 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
             }
 
             if _, exists := visitedURLs.Load(URL); !exists {
-                crawler.PCollector.Visit(URL)
+                crawler.PageCollector.Visit(URL)
             }
         }
     })
 
-    crawler.PCollector.Visit(crawler.URL.String())
+    crawler.PageCollector.Visit(crawler.URL.String())
 
     // Async means we must .Wait() on each Collector
-    crawler.PCollector.Wait()
-    crawler.LinkFinderCollector.Wait()
+    crawler.PageCollector.Wait()
+    crawler.LinkFindCollector.Wait()
 
     return
 }
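A note on the design these renames describe: PageCollector crawls HTML pages, while LinkFindCollector, a clone with URLFilters cleared, fetches text-like documents (js, json, xml, csv, txt, map) purely to mine their bodies for links; the two regexes route each URL to the right collector or drop it. A minimal, self-contained colly sketch of that split (illustrative only: the seed URL, the trimmed-down regexes, and the v2 import path are assumptions, not this repository's exact code):

```go
package main

import (
	"fmt"
	"regexp"
	"sync"

	"github.com/gocolly/colly/v2" // assumption: colly v2 import path
)

func main() {
	// Page collector: crawls HTML and follows href/src attributes.
	pages := colly.NewCollector(colly.Async(true))

	// Link-find collector: a clone with no URL filters; it fetches
	// text-like documents only to mine their bodies for more links.
	linkFind := pages.Clone()

	// Trimmed-down stand-ins for URLsToLinkFindRegex / URLsNotToRequestRegex.
	toLinkFind := regexp.MustCompile(`\.(js|json|xml|csv|txt|map)(\?.*)?$`)
	notToRequest := regexp.MustCompile(`(?i)\.(png|jpe?g|gif|css|woff2?)(\?|#|$)`)

	var visited sync.Map

	pages.OnRequest(func(r *colly.Request) {
		u := r.URL.String()
		if _, seen := visited.LoadOrStore(u, struct{}{}); seen {
			r.Abort() // already requested
			return
		}
		if notToRequest.MatchString(u) {
			r.Abort() // static asset: not worth requesting at all
			return
		}
		if toLinkFind.MatchString(u) {
			linkFind.Visit(u) // document: mine for links instead of crawling
			r.Abort()
		}
	})

	pages.OnHTML("*[href]", func(e *colly.HTMLElement) {
		pages.Visit(e.Request.AbsoluteURL(e.Attr("href")))
	})

	linkFind.OnResponse(func(r *colly.Response) {
		fmt.Println("link-find:", r.Request.URL) // regex link extraction would go here
	})

	pages.Visit("https://example.com/")

	// Both collectors are async, so each must be waited on.
	pages.Wait()
	linkFind.Wait()
}
```

Routing inside OnRequest keeps the decision in one place: a URL is either skipped, handed to the link-find clone, or crawled as a page.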
internal/crawler/robots.go: 48 changes (25 additions & 23 deletions)
@@ -11,38 +11,40 @@ import (
 func (crawler *Crawler) ParseRobots() {
     robotsURL := fmt.Sprintf("%s://%s/robots.txt", crawler.URL.Scheme, crawler.URL.Host)
 
-    if _, exists := visitedURLs.Load(robotsURL); !exists {
-        res, err := http.Get(robotsURL)
-        if err != nil {
-            return
-        }
-
-        if res.StatusCode == 200 {
-            if _, exists := foundURLs.Load(robotsURL); !exists {
-                if err := crawler.record(robotsURL); err != nil {
-                    return
-                }
+    if _, exists := visitedURLs.Load(robotsURL); exists {
+        return
+    }
 
-                foundURLs.Store(robotsURL, struct{}{})
-            }
+    res, err := http.Get(robotsURL)
+    if err != nil {
+        return
+    }
 
-            body, err := ioutil.ReadAll(res.Body)
-            if err != nil {
-                return
-            }
+    if res.StatusCode == 200 {
+        if _, exists := foundURLs.Load(robotsURL); !exists {
+            if err := crawler.record(robotsURL); err != nil {
+                return
+            }
 
-            lines := strings.Split(string(body), "\n")
+            foundURLs.Store(robotsURL, struct{}{})
+        }
 
-            re := regexp.MustCompile(".*llow: ")
+        body, err := ioutil.ReadAll(res.Body)
+        if err != nil {
+            return
+        }
+
+        lines := strings.Split(string(body), "\n")
+
+        re := regexp.MustCompile(".*llow: ")
 
-            for _, line := range lines {
-                if strings.Contains(line, "llow: ") {
-                    URL := re.ReplaceAllString(line, "")
+        for _, line := range lines {
+            if strings.Contains(line, "llow: ") {
+                URL := re.ReplaceAllString(line, "")
 
-                    URL = fmt.Sprintf("%s://%s%s", crawler.URL.Scheme, crawler.URL.Host, URL)
+                URL = fmt.Sprintf("%s://%s%s", crawler.URL.Scheme, crawler.URL.Host, URL)
 
-                    crawler.PCollector.Visit(URL)
-                }
+                crawler.PageCollector.Visit(URL)
             }
         }
     }
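Beyond the rename, this refactor inverts the visited check into an early return so the happy path stays flat. The `.*llow: ` pattern is a deliberate trick: it matches the tail of both `Allow: ` and `Disallow: ` directives, so every path listed in robots.txt gets queued for crawling. A standalone sketch of that extraction (the robotsTxt literal is a made-up example body standing in for the http.Get response):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// Made-up robots.txt body; in the crawler this comes from http.Get.
	robotsTxt := "User-agent: *\nDisallow: /admin\nAllow: /public\nSitemap: /sitemap.xml\n"

	// ".*llow: " matches both "Allow: " and "Disallow: ", so
	// ReplaceAllString strips everything up to and including the directive.
	re := regexp.MustCompile(".*llow: ")

	for _, line := range strings.Split(robotsTxt, "\n") {
		if strings.Contains(line, "llow: ") {
			path := re.ReplaceAllString(line, "")
			// The crawler rebuilds an absolute URL and visits it.
			fmt.Printf("https://example.com%s\n", path)
		}
	}
	// Prints:
	// https://example.com/admin
	// https://example.com/public
}
```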
internal/crawler/sitemap.go: 12 changes (7 additions & 5 deletions)
@@ -12,13 +12,15 @@ func (crawler *Crawler) ParseSitemap() {
     for _, path := range sitemapPaths {
         sitemapURL := fmt.Sprintf("%s://%s%s", crawler.URL.Scheme, crawler.URL.Host, path)
 
-        if _, exists := visitedURLs.Load(sitemapURL); !exists {
-            _ = sitemap.ParseFromSite(sitemapURL, func(entry sitemap.Entry) error {
-                crawler.PCollector.Visit(entry.GetLocation())
-                return nil
-            })
+        if _, exists := visitedURLs.Load(sitemapURL); exists {
+            continue
         }
 
+        _ = sitemap.ParseFromSite(sitemapURL, func(entry sitemap.Entry) error {
+            crawler.PageCollector.Visit(entry.GetLocation())
+            return nil
+        })
+
         visitedURLs.Store(sitemapURL, struct{}{})
     }
 }
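sitemap.go gets the same guard-clause treatment as robots.go: the !exists nesting becomes an early continue, so the happy path reads straight down. The pattern in miniature (hypothetical urls slice; fmt.Println stands in for sitemap.ParseFromSite plus PageCollector.Visit):

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var visited sync.Map

	// Hypothetical inputs with a duplicate to show the dedupe.
	urls := []string{"/sitemap.xml", "/sitemap.xml", "/sitemap_index.xml"}

	for _, u := range urls {
		// Guard clause: skip duplicates early instead of nesting the
		// work inside an if-!exists block.
		if _, exists := visited.Load(u); exists {
			continue
		}

		fmt.Println("parse:", u) // stands in for ParseFromSite + Visit
		visited.Store(u, struct{}{})
	}
}
```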
