Commit fd57d2f

chore: -
enenumxela committed Apr 17, 2022
1 parent 73618c3 commit fd57d2f
Showing 4 changed files with 75 additions and 69 deletions.
README.md: 6 changes (4 additions & 2 deletions)
@@ -24,9 +24,11 @@ A fast web crawler.

 ## Usage
 
-```text
-$ sigrawl3r -h
+```bash
+sigrawl3r -h
 ```
+
+```text
  _ _ _____
 ___(_) __ _ _ __ __ ___ _| |___ / _ __
 / __| |/ _` | '__/ _` \ \ /\ / / | |_ \| '__|
internal/crawler/crawler.go: 78 changes (39 additions & 39 deletions)
@@ -21,12 +21,12 @@ import (
 )
 
 type Crawler struct {
-    URL                      *urlx.URL
-    Configuration            *configuration.Configuration
-    PCollector               *colly.Collector
-    LinkFinderCollector      *colly.Collector
-    DocumentsToLinkFindRegex *regexp.Regexp
-    IgnoreRegex              *regexp.Regexp
+    URL                   *urlx.URL
+    Configuration         *configuration.Configuration
+    PageCollector         *colly.Collector
+    LinkFindCollector     *colly.Collector
+    URLsToLinkFindRegex   *regexp.Regexp
+    URLsNotToRequestRegex *regexp.Regexp
 }
 
 var foundURLs sync.Map
@@ -42,7 +42,7 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra

     configuration.AllowedDomains = append(configuration.AllowedDomains, []string{crawler.URL.Domain, "www." + crawler.URL.Domain}...)
 
-    crawler.PCollector = colly.NewCollector(
+    crawler.PageCollector = colly.NewCollector(
         colly.IgnoreRobotsTxt(),
         // limit crawling to the domain of the specified URL
         colly.AllowedDomains(configuration.AllowedDomains...),
@@ -54,15 +54,15 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra

     // if -subs is present, use regex to filter out subdomains in scope.
     if crawler.Configuration.IncludeSubdomains {
-        crawler.PCollector.AllowedDomains = nil
-        crawler.PCollector.URLFilters = []*regexp.Regexp{
+        crawler.PageCollector.AllowedDomains = nil
+        crawler.PageCollector.URLFilters = []*regexp.Regexp{
             regexp.MustCompile(".*(\\.|\\/\\/)" + strings.ReplaceAll(crawler.URL.Domain, ".", "\\.") + "((#|\\/|\\?).*)?"),
         }
     }
 
     // Debug
     if crawler.Configuration.Debug {
-        crawler.PCollector.SetDebugger(&debug.LogDebugger{})
+        crawler.PageCollector.SetDebugger(&debug.LogDebugger{})
     }
 
     // Setup the client with our transport to pass to the collectors
@@ -104,18 +104,18 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra
         },
     }
 
-    crawler.PCollector.SetClient(client)
+    crawler.PageCollector.SetClient(client)
 
     // set cookie
     if crawler.Configuration.Cookie != "" {
-        crawler.PCollector.OnRequest(func(request *colly.Request) {
+        crawler.PageCollector.OnRequest(func(request *colly.Request) {
             request.Headers.Set("Cookie", crawler.Configuration.Cookie)
         })
     }
 
     // set headers
     if crawler.Configuration.Headers != "" {
-        crawler.PCollector.OnRequest(func(request *colly.Request) {
+        crawler.PageCollector.OnRequest(func(request *colly.Request) {
             headers := strings.Split(crawler.Configuration.Headers, ";;")
             for _, header := range headers {
                 var parts []string
@@ -136,18 +136,18 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra
     // Set User-Agent
     switch ua := strings.ToLower(crawler.Configuration.UserAgent); {
     case strings.HasPrefix(ua, "mobi"):
-        extensions.RandomMobileUserAgent(crawler.PCollector)
+        extensions.RandomMobileUserAgent(crawler.PageCollector)
     case strings.HasPrefix(ua, "web"):
-        extensions.RandomUserAgent(crawler.PCollector)
+        extensions.RandomUserAgent(crawler.PageCollector)
     default:
-        crawler.PCollector.UserAgent = ua
+        crawler.PageCollector.UserAgent = ua
     }
 
     // Referer
-    extensions.Referer(crawler.PCollector)
+    extensions.Referer(crawler.PageCollector)
 
     // Set parallelism
-    if err = crawler.PCollector.Limit(&colly.LimitRule{
+    if err = crawler.PageCollector.Limit(&colly.LimitRule{
         DomainGlob:  "*",
         Parallelism: crawler.Configuration.Concurrency,
         Delay:       time.Duration(crawler.Configuration.Delay) * time.Second,
@@ -156,14 +156,14 @@ func New(URL *urlx.URL, configuration *configuration.Configuration) (crawler Cra
         return
     }
 
-    crawler.LinkFinderCollector = crawler.PCollector.Clone()
-    crawler.LinkFinderCollector.URLFilters = nil
+    crawler.LinkFindCollector = crawler.PageCollector.Clone()
+    crawler.LinkFindCollector.URLFilters = nil
 
-    crawler.PCollector.ID = 1
-    crawler.LinkFinderCollector.ID = 2
+    crawler.PageCollector.ID = 1
+    crawler.LinkFindCollector.ID = 2
 
-    crawler.DocumentsToLinkFindRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
-    crawler.IgnoreRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)
+    crawler.URLsToLinkFindRegex = regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
+    crawler.URLsNotToRequestRegex = regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)
 
     return crawler, nil
 }
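In passing, these are the two routing regexes under their new names: URLsToLinkFindRegex matches document URLs worth mining for links, and URLsNotToRequestRegex matches static assets to skip outright. A quick check of what each one matches, using the patterns exactly as defined above (the sample URLs are hypothetical):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The two patterns exactly as defined in the commit.
	toLinkFind := regexp.MustCompile(`(?m).*?\.*(js|json|xml|csv|txt|map)(\?.*?|)$`)
	notToRequest := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|css)(?:\?|#|$)`)

	for _, u := range []string{ // hypothetical URLs, for illustration only
		"https://example.com/app.min.js",
		"https://example.com/data.json?v=2",
		"https://example.com/logo.png",
		"https://example.com/about",
	} {
		fmt.Printf("%-38s link-find=%-5v skip=%v\n",
			u, toLinkFind.MatchString(u), notToRequest.MatchString(u))
	}
}
```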
@@ -177,27 +177,27 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         defer browser.GlobalCancel()
 
         // If renderJavascript, pass the response's body to the renderer and then replace the body for .OnHTML to handle.
-        crawler.PCollector.OnResponse(func(request *colly.Response) {
+        crawler.PageCollector.OnResponse(func(request *colly.Response) {
             html := browser.GetRenderedSource(request.Request.URL.String())
             request.Body = []byte(html)
         })
     }
 
-    crawler.PCollector.OnRequest(func(request *colly.Request) {
+    crawler.PageCollector.OnRequest(func(request *colly.Request) {
         URL := strings.TrimRight(request.URL.String(), "/")
 
         if _, exists := visitedURLs.Load(URL); exists {
             request.Abort()
             return
         }
 
-        if match := crawler.IgnoreRegex.MatchString(URL); match {
+        if match := crawler.URLsNotToRequestRegex.MatchString(URL); match {
             request.Abort()
             return
         }
 
-        if match := crawler.DocumentsToLinkFindRegex.MatchString(URL); match {
-            crawler.LinkFinderCollector.Visit(URL)
+        if match := crawler.URLsToLinkFindRegex.MatchString(URL); match {
+            crawler.LinkFindCollector.Visit(URL)
             request.Abort()
             return
         }
@@ -207,7 +207,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         return
     })
 
-    crawler.LinkFinderCollector.OnResponse(func(response *colly.Response) {
+    crawler.LinkFindCollector.OnResponse(func(response *colly.Response) {
         URL := strings.TrimRight(response.Request.URL.String(), "/")
 
         if _, exists := foundURLs.Load(URL); !exists {
@@ -219,7 +219,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         }
     })
 
-    crawler.PCollector.OnHTML("*[href]", func(e *colly.HTMLElement) {
+    crawler.PageCollector.OnHTML("*[href]", func(e *colly.HTMLElement) {
         relativeURL := e.Attr("href")
         absoluteURL := e.Request.AbsoluteURL(relativeURL)
 
@@ -236,7 +236,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         }
     })
 
-    crawler.PCollector.OnHTML("script[src]", func(e *colly.HTMLElement) {
+    crawler.PageCollector.OnHTML("script[src]", func(e *colly.HTMLElement) {
         relativeURL := e.Attr("src")
         absoluteURL := e.Request.AbsoluteURL(relativeURL)
 
@@ -253,7 +253,7 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
         }
     })
 
-    crawler.LinkFinderCollector.OnRequest(func(request *colly.Request) {
+    crawler.LinkFindCollector.OnRequest(func(request *colly.Request) {
         URL := request.URL.String()
 
         if _, exists := visitedURLs.Load(URL); exists {
@@ -266,15 +266,15 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
             js := strings.ReplaceAll(URL, ".min.js", ".js")
 
             if _, exists := visitedURLs.Load(js); !exists {
-                crawler.LinkFinderCollector.Visit(js)
+                crawler.LinkFindCollector.Visit(js)
                 visitedURLs.Store(js, struct{}{})
             }
         }
 
         visitedURLs.Store(URL, struct{}{})
     })
 
-    crawler.LinkFinderCollector.OnResponse(func(response *colly.Response) {
+    crawler.LinkFindCollector.OnResponse(func(response *colly.Response) {
         links, err := crawler.FindLinks(string(response.Body))
         if err != nil {
             return
@@ -317,16 +317,16 @@ func (crawler *Crawler) Crawl() (results chan string, err error) {
             }
 
             if _, exists := visitedURLs.Load(URL); !exists {
-                crawler.PCollector.Visit(URL)
+                crawler.PageCollector.Visit(URL)
             }
         }
     })
 
-    crawler.PCollector.Visit(crawler.URL.String())
+    crawler.PageCollector.Visit(crawler.URL.String())
 
     // Async means we must .Wait() on each Collector
-    crawler.PCollector.Wait()
-    crawler.LinkFinderCollector.Wait()
+    crawler.PageCollector.Wait()
+    crawler.LinkFindCollector.Wait()
 
     return
 }
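A note on the design these renames describe: PageCollector crawls HTML pages, while LinkFindCollector, a clone with URLFilters cleared, fetches text-like documents (js, json, xml, csv, txt, map) purely to mine their bodies for links; the two regexes route each URL to the right collector or drop it. A minimal, self-contained colly sketch of that split (illustrative only: the seed URL, the trimmed-down regexes, and the v2 import path are assumptions, not this repository's exact code):

```go
package main

import (
	"fmt"
	"regexp"
	"sync"

	"github.com/gocolly/colly/v2" // assumption: colly v2 import path
)

func main() {
	// Page collector: crawls HTML and follows href/src attributes.
	pages := colly.NewCollector(colly.Async(true))

	// Link-find collector: a clone with no URL filters; it fetches
	// text-like documents only to mine their bodies for more links.
	linkFind := pages.Clone()

	// Trimmed-down stand-ins for URLsToLinkFindRegex / URLsNotToRequestRegex.
	toLinkFind := regexp.MustCompile(`\.(js|json|xml|csv|txt|map)(\?.*)?$`)
	notToRequest := regexp.MustCompile(`(?i)\.(png|jpe?g|gif|css|woff2?)(\?|#|$)`)

	var visited sync.Map

	pages.OnRequest(func(r *colly.Request) {
		u := r.URL.String()
		if _, seen := visited.LoadOrStore(u, struct{}{}); seen {
			r.Abort() // already requested
			return
		}
		if notToRequest.MatchString(u) {
			r.Abort() // static asset: not worth requesting at all
			return
		}
		if toLinkFind.MatchString(u) {
			linkFind.Visit(u) // document: mine for links instead of crawling
			r.Abort()
		}
	})

	pages.OnHTML("*[href]", func(e *colly.HTMLElement) {
		pages.Visit(e.Request.AbsoluteURL(e.Attr("href")))
	})

	linkFind.OnResponse(func(r *colly.Response) {
		fmt.Println("link-find:", r.Request.URL) // regex link extraction would go here
	})

	pages.Visit("https://example.com/")

	// Both collectors are async, so each must be waited on.
	pages.Wait()
	linkFind.Wait()
}
```

Routing inside OnRequest keeps the decision in one place: a URL is either skipped, handed to the link-find clone, or crawled as a page.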
internal/crawler/robots.go: 48 changes (25 additions & 23 deletions)
@@ -11,38 +11,40 @@ import (
 func (crawler *Crawler) ParseRobots() {
     robotsURL := fmt.Sprintf("%s://%s/robots.txt", crawler.URL.Scheme, crawler.URL.Host)
 
-    if _, exists := visitedURLs.Load(robotsURL); !exists {
-        res, err := http.Get(robotsURL)
-        if err != nil {
-            return
-        }
-
-        if res.StatusCode == 200 {
-            if _, exists := foundURLs.Load(robotsURL); !exists {
-                if err := crawler.record(robotsURL); err != nil {
-                    return
-                }
+    if _, exists := visitedURLs.Load(robotsURL); exists {
+        return
+    }
 
-                foundURLs.Store(robotsURL, struct{}{})
-            }
+    res, err := http.Get(robotsURL)
+    if err != nil {
+        return
+    }
 
-            body, err := ioutil.ReadAll(res.Body)
-            if err != nil {
-                return
-            }
+    if res.StatusCode == 200 {
+        if _, exists := foundURLs.Load(robotsURL); !exists {
+            if err := crawler.record(robotsURL); err != nil {
+                return
+            }
 
-            lines := strings.Split(string(body), "\n")
+            foundURLs.Store(robotsURL, struct{}{})
+        }
 
-            re := regexp.MustCompile(".*llow: ")
+        body, err := ioutil.ReadAll(res.Body)
+        if err != nil {
+            return
+        }
+
+        lines := strings.Split(string(body), "\n")
+
+        re := regexp.MustCompile(".*llow: ")
 
-            for _, line := range lines {
-                if strings.Contains(line, "llow: ") {
-                    URL := re.ReplaceAllString(line, "")
+        for _, line := range lines {
+            if strings.Contains(line, "llow: ") {
+                URL := re.ReplaceAllString(line, "")
 
-                    URL = fmt.Sprintf("%s://%s%s", crawler.URL.Scheme, crawler.URL.Host, URL)
+                URL = fmt.Sprintf("%s://%s%s", crawler.URL.Scheme, crawler.URL.Host, URL)
 
-                    crawler.PCollector.Visit(URL)
-                }
+                crawler.PageCollector.Visit(URL)
             }
         }
     }
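Beyond the rename, this refactor inverts the visited check into an early return so the happy path stays flat. The `.*llow: ` pattern is a deliberate trick: it matches the tail of both `Allow: ` and `Disallow: ` directives, so every path listed in robots.txt gets queued for crawling. A standalone sketch of that extraction (the robotsTxt literal is a made-up example body standing in for the http.Get response):

```go
package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	// Made-up robots.txt body; in the crawler this comes from http.Get.
	robotsTxt := "User-agent: *\nDisallow: /admin\nAllow: /public\nSitemap: /sitemap.xml\n"

	// ".*llow: " matches both "Allow: " and "Disallow: ", so
	// ReplaceAllString strips everything up to and including the directive.
	re := regexp.MustCompile(".*llow: ")

	for _, line := range strings.Split(robotsTxt, "\n") {
		if strings.Contains(line, "llow: ") {
			path := re.ReplaceAllString(line, "")
			// The crawler rebuilds an absolute URL and visits it.
			fmt.Printf("https://example.com%s\n", path)
		}
	}
	// Prints:
	// https://example.com/admin
	// https://example.com/public
}
```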
internal/crawler/sitemap.go: 12 changes (7 additions & 5 deletions)
@@ -12,13 +12,15 @@ func (crawler *Crawler) ParseSitemap() {
     for _, path := range sitemapPaths {
         sitemapURL := fmt.Sprintf("%s://%s%s", crawler.URL.Scheme, crawler.URL.Host, path)
 
-        if _, exists := visitedURLs.Load(sitemapURL); !exists {
-            _ = sitemap.ParseFromSite(sitemapURL, func(entry sitemap.Entry) error {
-                crawler.PCollector.Visit(entry.GetLocation())
-                return nil
-            })
+        if _, exists := visitedURLs.Load(sitemapURL); exists {
+            continue
         }
 
+        _ = sitemap.ParseFromSite(sitemapURL, func(entry sitemap.Entry) error {
+            crawler.PageCollector.Visit(entry.GetLocation())
+            return nil
+        })
+
         visitedURLs.Store(sitemapURL, struct{}{})
     }
 }
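sitemap.go gets the same guard-clause treatment as robots.go: the !exists nesting becomes an early continue, so the happy path reads straight down. The pattern in miniature (hypothetical urls slice; fmt.Println stands in for sitemap.ParseFromSite plus PageCollector.Visit):

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	var visited sync.Map

	// Hypothetical inputs with a duplicate to show the dedupe.
	urls := []string{"/sitemap.xml", "/sitemap.xml", "/sitemap_index.xml"}

	for _, u := range urls {
		// Guard clause: skip duplicates early instead of nesting the
		// work inside an if-!exists block.
		if _, exists := visited.Load(u); exists {
			continue
		}

		fmt.Println("parse:", u) // stands in for ParseFromSite + Visit
		visited.Store(u, struct{}{})
	}
}
```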
