Skip to content

Commit

Permalink
feat: Add rendering
Browse files Browse the repository at this point in the history
Render web pages including Single Page Application such as React.

resolves #4
  • Loading branch information
enenumxela committed Jun 2, 2023
1 parent 3a33f50 commit 879b46d
Show file tree
Hide file tree
Showing 7 changed files with 254 additions and 95 deletions.
40 changes: 22 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
* Parses URLs from files (`.js`, `.json`, `.xml`, `.csv`, `.txt` & `.map`).
* Parses URLs from `robots.txt`.
* Parses URLs from sitemaps.
* Renders pages (including Single Page Applications such as Angular and React).
* Cross-Platform (Windows, Linux & macOS)

## Installation
Expand Down Expand Up @@ -124,36 +125,39 @@ USAGE:
xcrawl3r [OPTIONS]
INPUT:
-d, --domain string domain to match URLs
--include-subdomains bool match subdomains' URLs
-s, --seeds string seed URLs file (use `-` to get from stdin)
-u, --url string URL to crawl
-d, --domain string domain to match URLs
--include-subdomains bool match subdomains' URLs
-s, --seeds string seed URLs file (use `-` to get from stdin)
-u, --url string URL to crawl
CONFIGURATION:
--depth int maximum depth to crawl (default 3)
--depth int maximum depth to crawl (default 3)
TIP: set it to `0` for infinite recursion
--timeout int time to wait for request in seconds (default: 10)
-H, --headers string[] custom header to include in requests
    --headless bool run the browser in headless mode (no visible window)
-H, --headers string[] custom header to include in requests
e.g. -H 'Referer: http://example.com/'
TIP: use multiple flag to set multiple headers
--user-agent string User Agent to use (default: web)
--proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)
TIP: use multiple flag to set multiple proxies
--render bool utilize a headless chrome instance to render pages
--timeout int time to wait for request in seconds (default: 10)
--user-agent string User Agent to use (default: web)
TIP: use `web` for a random web user-agent,
`mobile` for a random mobile user-agent,
or you can set your specific user-agent.
--proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)
TIP: use multiple flag to set multiple proxies
RATE LIMIT:
-c, --concurrency int number of concurrent fetchers to use (default 10)
-p, --parallelism int number of concurrent URLs to process (default: 10)
--delay int delay between each request in seconds
    --max-random-delay int  maximum extra randomized delay added to `--delay` (default: 1s)
-c, --concurrency int number of concurrent fetchers to use (default 10)
--delay int delay between each request in seconds
    --max-random-delay int  maximum extra randomized delay added to `--delay` (default: 1s)
-p, --parallelism int number of concurrent URLs to process (default: 10)
OUTPUT:
--debug bool enable debug mode (default: false)
-m, --monochrome bool coloring: no colored output mode
-o, --output string output file to write found URLs
-v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)
--debug bool enable debug mode (default: false)
-m, --monochrome bool coloring: no colored output mode
-o, --output string output file to write found URLs
-v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)
```

## Contributing
Expand Down
106 changes: 61 additions & 45 deletions cmd/xcrawl3r/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,24 @@ var (
includeSubdomains bool
seedsFile string
URL string
depth int
timeout int
headers []string
userAgent string
proxies []string
concurrency int
parallelism int
delay int
maxRandomDelay int
debug bool
monochrome bool
output string
verbosity string

depth int
headless bool
headers []string
proxies []string
render bool
timeout int
userAgent string

concurrency int
delay int
maxRandomDelay int
parallelism int

debug bool
monochrome bool
output string
verbosity string
)

func init() {
Expand All @@ -45,15 +50,20 @@ func init() {
pflag.BoolVar(&includeSubdomains, "include-subdomains", false, "")
pflag.StringVarP(&seedsFile, "seeds", "s", "", "")
pflag.StringVarP(&URL, "url", "u", "", "")

pflag.IntVar(&depth, "depth", 3, "")
pflag.IntVar(&timeout, "timeout", 10, "")
pflag.BoolVar(&headless, "headless", false, "")
pflag.StringSliceVarP(&headers, "headers", "H", []string{}, "")
pflag.StringVar(&userAgent, "user-agent", "web", "")
pflag.StringSliceVar(&proxies, "proxy", []string{}, "")
pflag.BoolVar(&render, "render", false, "")
pflag.IntVar(&timeout, "timeout", 10, "")
pflag.StringVar(&userAgent, "user-agent", "web", "")

pflag.IntVarP(&concurrency, "concurrency", "c", 10, "")
pflag.IntVarP(&parallelism, "parallelism", "p", 10, "")
pflag.IntVar(&delay, "delay", 0, "")
pflag.IntVar(&maxRandomDelay, "max-random-delay", 1, "")
pflag.IntVarP(&parallelism, "parallelism", "p", 10, "")

pflag.BoolVar(&debug, "debug", false, "")
pflag.BoolVarP(&monochrome, "monochrome", "m", false, "")
pflag.StringVarP(&output, "output", "o", "", "")
Expand All @@ -67,37 +77,38 @@ func init() {
h += " xcrawl3r [OPTIONS]\n"

h += "\nINPUT:\n"
h += " -d, --domain string domain to match URLs\n"
h += " --include-subdomains bool match subdomains' URLs\n"
h += " -s, --seeds string seed URLs file (use `-` to get from stdin)\n"
h += " -u, --url string URL to crawl\n"
h += " -d, --domain string domain to match URLs\n"
h += " --include-subdomains bool match subdomains' URLs\n"
h += " -s, --seeds string seed URLs file (use `-` to get from stdin)\n"
h += " -u, --url string URL to crawl\n"

h += "\nCONFIGURATION:\n"
h += " --depth int maximum depth to crawl (default 3)\n"
h += " --depth int maximum depth to crawl (default 3)\n"
h += " TIP: set it to `0` for infinite recursion\n"
h += " --timeout int time to wait for request in seconds (default: 10)\n"
h += " -H, --headers string[] custom header to include in requests\n"
h += " --headless bool If true the browser will be displayed while crawling.\n"
h += " -H, --headers string[] custom header to include in requests\n"
h += " e.g. -H 'Referer: http://example.com/'\n"
h += " TIP: use multiple flag to set multiple headers\n"

h += " --user-agent string User Agent to use (default: web)\n"
h += " --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)\n"
h += " TIP: use multiple flag to set multiple proxies\n"
h += " --render bool utilize a headless chrome instance to render pages\n"
h += " --timeout int time to wait for request in seconds (default: 10)\n"
h += " --user-agent string User Agent to use (default: web)\n"
h += " TIP: use `web` for a random web user-agent,\n"
h += " `mobile` for a random mobile user-agent,\n"
h += " or you can set your specific user-agent.\n"
h += " --proxy string[] Proxy URL (e.g: http://127.0.0.1:8080)\n"
h += " TIP: use multiple flag to set multiple proxies\n"

h += "\nRATE LIMIT:\n"
h += " -c, --concurrency int number of concurrent fetchers to use (default 10)\n"
h += " -p, --parallelism int number of concurrent URLs to process (default: 10)\n"
h += " --delay int delay between each request in seconds\n"
h += " --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s)\n"
h += " -c, --concurrency int number of concurrent fetchers to use (default 10)\n"
h += " --delay int delay between each request in seconds\n"
h += " --max-random-delay int maximux extra randomized delay added to `--dalay` (default: 1s)\n"
h += " -p, --parallelism int number of concurrent URLs to process (default: 10)\n"

h += "\nOUTPUT:\n"
h += " --debug bool enable debug mode (default: false)\n"
h += " -m, --monochrome bool coloring: no colored output mode\n"
h += " -o, --output string output file to write found URLs\n"
h += " -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)\n"
h += " --debug bool enable debug mode (default: false)\n"
h += " -m, --monochrome bool coloring: no colored output mode\n"
h += " -o, --output string output file to write found URLs\n"
h += " -v, --verbosity string debug, info, warning, error, fatal or silent (default: debug)\n"

fmt.Fprint(os.Stderr, h)
}
Expand Down Expand Up @@ -185,16 +196,21 @@ func main() {
Domain: parsedURL.Domain,
IncludeSubdomains: includeSubdomains,
Seeds: seeds,
Parallelism: parallelism,
Concurrency: concurrency,
Debug: debug,
Depth: depth,
Headers: headers,
Delay: delay,
MaxRandomDelay: maxRandomDelay,
Proxies: proxies,
Timeout: timeout,
UserAgent: userAgent,

Depth: depth,
Headless: headless,
Headers: headers,
Proxies: proxies,
Render: render,
Timeout: timeout,
UserAgent: userAgent,

Concurrency: concurrency,
Delay: delay,
MaxRandomDelay: maxRandomDelay,
Parallelism: parallelism,

Debug: debug,
}

crawler, err := xcrawl3r.New(options)
Expand Down
8 changes: 8 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/hueristiq/xcrawl3r
go 1.20

require (
github.com/chromedp/chromedp v0.9.1
github.com/gocolly/colly/v2 v2.1.0
github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8
github.com/logrusorgru/aurora/v3 v3.0.0
Expand All @@ -16,10 +17,17 @@ require (
github.com/antchfx/htmlquery v1.2.3 // indirect
github.com/antchfx/xmlquery v1.2.4 // indirect
github.com/antchfx/xpath v1.1.8 // indirect
github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.1.0 // indirect
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
github.com/golang/protobuf v1.4.2 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
github.com/temoto/robotstxt v1.1.1 // indirect
golang.org/x/net v0.10.0 // indirect
Expand Down
22 changes: 22 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ github.com/antchfx/xpath v1.1.6/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNY
github.com/antchfx/xpath v1.1.8 h1:PcL6bIX42Px5usSx6xRYw/wjB3wYGkj0MJ9MBzEKVgk=
github.com/antchfx/xpath v1.1.8/go.mod h1:Yee4kTMuNiPYJ7nSNorELQMr1J33uOpXDMByNYhvtNk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9 h1:wMSvdj3BswqfQOXp2R1bJOAE7xIQLt2dlMQDMf836VY=
github.com/chromedp/cdproto v0.0.0-20230220211738-2b1ec77315c9/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.9.1 h1:CC7cC5p1BeLiiS2gfNNPwp3OaUxtRMBjfiw3E3k6dFA=
github.com/chromedp/chromedp v0.9.1/go.mod h1:DUgZWRvYoEfgi66CgZ/9Yv+psgi+Sksy5DTScENWjaQ=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
Expand All @@ -21,6 +27,12 @@ github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.m
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.1.0 h1:7RFti/xnNkMJnrK7D1yQ/iCIB5OrrY/54/H930kIbHA=
github.com/gobwas/ws v1.1.0/go.mod h1:nzvNcVha5eUziGrbxFCo6qFIojQHjJV5cLYIbezhfL0=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
github.com/gocolly/colly/v2 v2.1.0 h1:k0DuZkDoCsx51bKpRJNEmcxcp+W5N8ziuwGaSDuFoGs=
github.com/gocolly/colly/v2 v2.1.0/go.mod h1:I2MuhsLjQ+Ex+IzK3afNS8/1qP3AedHOusRPcRdC5o0=
Expand All @@ -47,10 +59,18 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8 h1:Y5Hsbpr9c5oK2l/ktfWdPOLBOx9MPlH7vRQNK1mJmiQ=
github.com/hueristiq/hqgoutils v0.0.0-20230520130214-98e4015932b8/go.mod h1:GxYwOCC1RrHbilApc+wccYiaABLlRnnYaHcobxNDHos=
github.com/jawher/mow.cli v1.1.0/go.mod h1:aNaQlc7ozF3vw6IJ2dHjp2ZFiA4ozMIYY6PyuRJwlUg=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/logrusorgru/aurora/v3 v3.0.0 h1:R6zcoZZbvVcGMvDCKo45A9U/lzYyzl5NfYIvznmDfE4=
github.com/logrusorgru/aurora/v3 v3.0.0/go.mod h1:vsR12bk5grlLvLXAYrBsb5Oc/N+LxAlxggSjiwMnCUc=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4 h1:2vmb32OdDhjZf2ETGDlr9n8RYXx7c+jXPxMiPbwnA+8=
github.com/oxffaa/gopher-parse-sitemap v0.0.0-20191021113419-005d2eb1def4/go.mod h1:2JQx4jDHmWrbABvpOayg/+OTU6ehN0IyK2EHzceXpJo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
Expand Down Expand Up @@ -92,6 +112,8 @@ golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201207223542-d4d67f95c62d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.8.0 h1:n5xxQn2i3PC0yLAbjTpNT85q/Kgzcr2gIoX9OrJUols=
Expand Down
75 changes: 75 additions & 0 deletions pkg/browser/browser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package browser

import (
"context"
// "urlgrab/utilities"

"github.com/chromedp/chromedp"
)

// GlobalContext is the shared browser context used by GetRenderedSource to
// open rendering tabs; GlobalCancel releases it.
// NOTE(review): presumably both are assigned from the values returned by
// GetGlobalContext before GetRenderedSource is first called — the caller is
// not visible in this file, confirm against the cmd wiring.
var GlobalContext context.Context
var GlobalCancel context.CancelFunc

// GetRenderedSource renders the page at url in a new tab of the shared
// browser (GlobalContext) and returns the document's outer HTML.
//
// It is best-effort: if the tab cannot be created or the navigation fails,
// an empty string is returned. GlobalContext must be initialized (see
// GetGlobalContext) before calling this function.
func GetRenderedSource(url string) string {
	// Open a second tab in the already-running browser instead of
	// spawning a fresh browser instance.
	newCtx, newCtxCancel := chromedp.NewContext(GlobalContext)
	defer newCtxCancel()

	// Ensure the tab is actually created. Bail out early on failure
	// rather than navigating on a dead context (the original fell
	// through and attempted the navigation anyway).
	if err := chromedp.Run(newCtx); err != nil {
		return ""
	}

	// Navigate to the page and capture its rendered outer HTML.
	var outerHTML string

	if err := chromedp.Run(newCtx,
		chromedp.Navigate(url),
		chromedp.OuterHTML("html", &outerHTML),
	); err != nil {
		return ""
	}

	return outerHTML
}

// GetGlobalContext launches a chrome instance and returns its browser
// context together with a cancel function that releases both the browser
// and its exec allocator.
//
// headless controls whether the browser runs without a visible window;
// proxy, when non-empty, is passed to Chrome as its proxy server.
func GetGlobalContext(headless bool, proxy string) (context.Context, context.CancelFunc) {
	// Build the allocator options once; the original duplicated the whole
	// list per branch and repeated the no-default-browser-check flag.
	opts := []chromedp.ExecAllocatorOption{
		chromedp.Flag("headless", headless),
		chromedp.Flag("ignore-certificate-errors", true),
		chromedp.Flag("disable-extensions", true),
		chromedp.Flag("no-first-run", true),
		chromedp.Flag("no-default-browser-check", true),
	}

	if proxy != "" {
		opts = append(opts, chromedp.Flag("proxy-server", proxy))
	}

	allocCtx, allocCancel := chromedp.NewExecAllocator(context.Background(), opts...)

	// Create the browser context (first tab). Keep the allocator's cancel
	// alongside the context's: the original overwrote `cancel` here, so
	// the allocator was never released.
	ctx, ctxCancel := chromedp.NewContext(allocCtx)

	// Eagerly start the browser so callers receive a usable context.
	// Errors are intentionally ignored (matching the original behavior);
	// the caller's first chromedp.Run will surface any launch failure.
	_ = chromedp.Run(ctx)

	// Cancel the browser first, then tear down the allocator.
	cancel := func() {
		ctxCancel()
		allocCancel()
	}

	return ctx, cancel
}
Loading

0 comments on commit 879b46d

Please sign in to comment.