Skip to content

Commit

Permalink
feat: Ability To Limit the Scraping Rates by Site & Scraper ie Domain (
Browse files Browse the repository at this point in the history
…xbapps#1665)

* Ability To Limit the Scraping Rates by Site & Scraper ie Domain

* Refactor to not use maps, they are not safe for concurrent thread access
  • Loading branch information
toshski authored Apr 2, 2024
1 parent 7596564 commit dfc27a4
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 13 deletions.
7 changes: 5 additions & 2 deletions pkg/scrape/genericactorscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,13 @@ func applyRules(actorPage string, source string, rules models.GenericScraperRule
}
})
}
url, _ := url.Parse(actorPage)
if rules.IsJson {
actorCollector.Request("GET", actorPage, nil, nil, nil)
ScraperRateLimiterWait(url.Host)
err := actorCollector.Request("GET", actorPage, nil, nil, nil)
ScraperRateLimiterCheckErrors(url.Host, err)
} else {
actorCollector.Visit(actorPage)
WaitBeforeVisit(url.Host, actorCollector.Visit, actorPage)
}
var extref models.ExternalReference
var extreflink models.ExternalReferenceLink
Expand Down
6 changes: 3 additions & 3 deletions pkg/scrape/povr.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,21 +126,21 @@ func POVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-

// If scene exists in database, or the alternate source exists, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
sceneCollector.Visit(sceneURL)
WaitBeforeVisit("povr.com", sceneCollector.Visit, sceneURL)
}
})

siteCollector.OnHTML(`div.pagination a[class="pagination__page next"]`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
WaitBeforeVisit("povr.com", siteCollector.Visit, pageURL)
}
})

if singleSceneURL != "" {
sceneCollector.Visit(singleSceneURL)
} else {
siteCollector.Visit(siteURL)
WaitBeforeVisit("povr.com", siteCollector.Visit, siteURL)
}

if updateSite {
Expand Down
19 changes: 19 additions & 0 deletions pkg/scrape/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,25 @@ func createCollector(domains ...string) *colly.Collector {
})

c = createCallbacks(c)

// see if the domain has a limit and set it
for _, domain := range domains {
if Limiters == nil {
LoadScraperRateLimits()
}
limiter := GetRateLimiter(domain)
if limiter != nil {
randomdelay := limiter.maxDelay - limiter.minDelay
delay := limiter.minDelay
c.Limit(&colly.LimitRule{
DomainGlob: "*",
Delay: delay, // Delay between requests to domains matching the glob
RandomDelay: randomdelay, // Max additional random delay added to the delay
})
break
}
}

return c
}

Expand Down
106 changes: 106 additions & 0 deletions pkg/scrape/scrape_rate_limiter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package scrape

import (
"math/rand"
"sync"
"time"

"github.com/tidwall/gjson"
"github.com/xbapps/xbvr/pkg/models"
)

// Colly provides rate limiting on collectors, and this works in most scrapers.
// For scrapers that handle multiple sites, e.g. SLR, VRPorn, this does not work, as
// each site creates its own instance of the scraper with its own colly collector and
// its own independent limits.
//
// The ScraperRateLimiter provides a way to limit visits across multiple instances of the
// same scraper. Calls to the colly collector Visit function must first be passed to the
// ScraperRateLimiter, which will then coordinate between all instances and then call the
// colly Visit function.

// Limiters holds one rate limiter per configured domain/scraper id.
// It is populated by LoadScraperRateLimits and searched by GetRateLimiter.
var Limiters []*ScraperRateLimiter

// ScraperRateLimiter enforces a minimum (optionally randomized) delay between
// successive requests to a single site/domain, shared across scraper instances.
type ScraperRateLimiter struct {
	id          string        // domain or scraper identifier, e.g. "povr.com"
	mutex       sync.Mutex    // guards lastRequest across concurrent scraper goroutines
	lastRequest time.Time     // time of the most recent request; zero value means "no wait needed"
	minDelay    time.Duration // minimum delay between requests
	maxDelay    time.Duration // upper bound for the randomized delay; expected >= minDelay
}

// ScraperRateLimiterWait blocks the caller until the configured delay since the
// previous request for this limiter id has elapsed, then records the current
// time as the new last-request time. It returns immediately when no limiter is
// configured for the id. The mutex is held for the whole wait, which is what
// serializes concurrent scraper instances behind one another.
func ScraperRateLimiterWait(rateLimiter string) {
	l := GetRateLimiter(rateLimiter)
	if l == nil {
		// No rate limit configured for this id — proceed immediately.
		return
	}
	l.mutex.Lock()
	defer l.mutex.Unlock()

	if !l.lastRequest.IsZero() {
		// Required gap: minDelay plus, when a spread is configured, a random
		// extra amount up to (maxDelay - minDelay).
		wait := l.minDelay
		if spread := l.maxDelay - l.minDelay; spread > 0 {
			wait += time.Duration(rand.Int63n(int64(spread)))
		}
		if elapsed := time.Since(l.lastRequest); elapsed < wait {
			time.Sleep(wait - elapsed)
		}
	}
	l.lastRequest = time.Now()
}

// WaitBeforeVisit rate-limits a call to a colly Visit function for the given
// limiter id: it waits the configured delay first, then performs the visit.
// If the visit returns an error, colly rejected the request before making any
// HTTP call (these are errors colly checks before calling the URL, e.g. an
// already-visited or disallowed URL), so the limiter's last-request timestamp
// is cleared — the site was never contacted and the next call need not wait.
func WaitBeforeVisit(rateLimiter string, visitFunc func(string) error, pageURL string) {
	ScraperRateLimiterWait(rateLimiter)
	if err := visitFunc(pageURL); err != nil {
		if limiter := GetRateLimiter(rateLimiter); limiter != nil {
			// Reset under the mutex: ScraperRateLimiterWait reads and writes
			// lastRequest concurrently from other goroutines, so an unguarded
			// write here would be a data race.
			limiter.mutex.Lock()
			limiter.lastRequest = time.Time{}
			limiter.mutex.Unlock()
		}
	}
}
// ScraperRateLimiterCheckErrors clears the limiter's last-request timestamp
// when a colly Request call returned an error (meaning no HTTP request was
// actually made, so the next call need not wait). Unlike the original, it
// tolerates domains with no configured limiter — GetRateLimiter returns nil
// for those, and dereferencing it unconditionally would panic.
func ScraperRateLimiterCheckErrors(domain string, err error) {
	if err == nil {
		return
	}
	limiter := GetRateLimiter(domain)
	if limiter == nil {
		// No limiter configured for this domain — nothing to reset.
		return
	}
	// Reset under the mutex; lastRequest is shared with ScraperRateLimiterWait.
	limiter.mutex.Lock()
	limiter.lastRequest = time.Time{}
	limiter.mutex.Unlock()
}

func LoadScraperRateLimits() {
var mutex sync.Mutex
mutex.Lock()
defer mutex.Unlock()

var limiters []*ScraperRateLimiter
commonDb, _ := models.GetCommonDB()
var kv models.KV
commonDb.Where(models.KV{Key: "scraper_rate_limits"}).Find(&kv)
if kv.Key == "scraper_rate_limits" {
sites := gjson.Get(kv.Value, "sites")
for _, site := range sites.Array() {
name := site.Get("name").String()
minDelay := int(site.Get("mindelay").Int())
maxDelay := int(site.Get("maxdelay").Int())
if maxDelay < minDelay {
maxDelay = minDelay
}
limiters = append(limiters, &ScraperRateLimiter{id: name, minDelay: time.Duration(minDelay) * time.Millisecond, maxDelay: time.Duration(maxDelay) * time.Millisecond})
}
Limiters = limiters
}
}

// GetRateLimiter returns the configured limiter whose id matches the given
// id (typically a domain such as "povr.com"), or nil when no rate limit is
// configured for it. The original had an unreachable `break` after the
// `return` (flagged by go vet); it has been removed.
func GetRateLimiter(id string) *ScraperRateLimiter {
	for _, limiter := range Limiters {
		if limiter.id == id {
			return limiter
		}
	}
	return nil
}
8 changes: 5 additions & 3 deletions pkg/scrape/slrstudios.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
siteCollector.OnHTML(`div.c-pagination ul li a`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
WaitBeforeVisit("www.sexlikereal.com", siteCollector.Visit, pageURL)
}
})

Expand Down Expand Up @@ -386,7 +386,9 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
ctx := colly.NewContext()
ctx.Put("duration", duration)
ctx.Put("isTransScene", isTransScene)
sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterWait("www.sexlikereal.com")
err := sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterCheckErrors("www.sexlikereal.com", err)
}
}
})
Expand All @@ -399,7 +401,7 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)

} else {
siteCollector.Visit(siteURL + "?sort=most_recent")
WaitBeforeVisit("www.sexlikereal.com", siteCollector.Visit, siteURL+"?sort=most_recent")
}

if updateSite {
Expand Down
6 changes: 4 additions & 2 deletions pkg/scrape/vrphub.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,9 @@ func VRPHub(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
ctx := colly.NewContext()
ctx.Put("scene", &sc)

sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterWait("vrphub.com")
err := sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterCheckErrors("vrphub.com", err)
}
})

Expand All @@ -201,7 +203,7 @@ func VRPHub(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
ctx.Put("scene", &sc)
sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
} else {
siteCollector.Visit(siteURL)
WaitBeforeVisit("vrphub.com", siteCollector.Visit, siteURL)
}

if updateSite {
Expand Down
6 changes: 3 additions & 3 deletions pkg/scrape/vrporn.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,22 +145,22 @@ func VRPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
siteCollector.OnHTML(`div.pagination a.next`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
WaitBeforeVisit("vrporn.com", siteCollector.Visit, pageURL)
}
})

siteCollector.OnHTML(`body.tax-studio article.post div.tube-post a`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
// If scene exists in database, or the alternate source exists, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) {
sceneCollector.Visit(sceneURL)
WaitBeforeVisit("vrporn.com", sceneCollector.Visit, sceneURL)
}
})

if singleSceneURL != "" {
sceneCollector.Visit(singleSceneURL)
} else {
siteCollector.Visit(siteURL + "/?sort=newest")
WaitBeforeVisit("vrporn.com", siteCollector.Visit, siteURL+"/?sort=newest")
}

if updateSite {
Expand Down

0 comments on commit dfc27a4

Please sign in to comment.