diff --git a/colly.go b/colly.go index 6c018c951..44418e8d5 100644 --- a/colly.go +++ b/colly.go @@ -4,6 +4,7 @@ package colly import ( "bytes" "errors" + "fmt" "io" "io/ioutil" "net/http" @@ -130,6 +131,7 @@ func (c *Collector) Init() { c.MaxBodySize = 10 * 1024 * 1024 c.backend = &httpBackend{} c.backend.Init() + c.backend.Client.CheckRedirect = c.checkRedirectFunc() c.wg = &sync.WaitGroup{} c.lock = &sync.Mutex{} } @@ -356,6 +358,31 @@ func (c *Collector) Cookies(URL string) []*http.Cookie { return c.backend.Client.Jar.Cookies(u) } +func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error { + return func(req *http.Request, via []*http.Request) error { + if !c.isDomainAllowed(req.URL.Host) { + return fmt.Errorf("Not following redirect to %s because its not in AllowedDomains", req.URL.Host) + } + + // Honor golangs default of maximum of 10 redirects + if len(via) >= 10 { + return http.ErrUseLastResponse + } + + lastRequest := via[len(via)-1] + + // Copy the headers from last request + req.Header = lastRequest.Header + + // If domain has changed, remove the Authorization-header if it exists + if req.URL.Host != lastRequest.URL.Host { + req.Header.Del("Authorization") + } + + return nil + } +} + // Attr returns the selected attribute of a HTMLElement or empty string // if no attribute found func (h *HTMLElement) Attr(k string) string {