Skip to content

Commit

Permalink
feat: Ability To Limit the Scraping Rates by Site & Scraper ie Domain (
Browse files Browse the repository at this point in the history
…xbapps#1665)

* Ability To Limit the Scraping Rates by Site & Scraper ie Domain

* Refactor to not use maps, they are not safe for concurrent thread access
  • Loading branch information
toshski authored Apr 2, 2024
1 parent 7596564 commit dfc27a4
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 13 deletions.
7 changes: 5 additions & 2 deletions pkg/scrape/genericactorscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,13 @@ func applyRules(actorPage string, source string, rules models.GenericScraperRule
}
})
}
url, _ := url.Parse(actorPage)
if rules.IsJson {
actorCollector.Request("GET", actorPage, nil, nil, nil)
ScraperRateLimiterWait(url.Host)
err := actorCollector.Request("GET", actorPage, nil, nil, nil)
ScraperRateLimiterCheckErrors(url.Host, err)
} else {
actorCollector.Visit(actorPage)
WaitBeforeVisit(url.Host, actorCollector.Visit, actorPage)
}
var extref models.ExternalReference
var extreflink models.ExternalReferenceLink
Expand Down
6 changes: 3 additions & 3 deletions pkg/scrape/povr.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,21 +126,21 @@ func POVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-

// If scene exists in database, or the alternate source exists, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
sceneCollector.Visit(sceneURL)
WaitBeforeVisit("povr.com", sceneCollector.Visit, sceneURL)
}
})

siteCollector.OnHTML(`div.pagination a[class="pagination__page next"]`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
WaitBeforeVisit("povr.com", siteCollector.Visit, pageURL)
}
})

if singleSceneURL != "" {
sceneCollector.Visit(singleSceneURL)
} else {
siteCollector.Visit(siteURL)
WaitBeforeVisit("povr.com", siteCollector.Visit, siteURL)
}

if updateSite {
Expand Down
19 changes: 19 additions & 0 deletions pkg/scrape/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,25 @@ func createCollector(domains ...string) *colly.Collector {
})

c = createCallbacks(c)

// see if the domain has a limit and set it
for _, domain := range domains {
if Limiters == nil {
LoadScraperRateLimits()
}
limiter := GetRateLimiter(domain)
if limiter != nil {
randomdelay := limiter.maxDelay - limiter.minDelay
delay := limiter.minDelay
c.Limit(&colly.LimitRule{
DomainGlob: "*",
Delay: delay, // Delay between requests to domains matching the glob
RandomDelay: randomdelay, // Max additional random delay added to the delay
})
break
}
}

return c
}

Expand Down
106 changes: 106 additions & 0 deletions pkg/scrape/scrape_rate_limiter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
package scrape

import (
"math/rand"
"sync"
"time"

"github.com/tidwall/gjson"
"github.com/xbapps/xbvr/pkg/models"
)

// Colly provides rate limiting on collectors, and this works in most scrapers.
// For scrapers that handle multiple sites, e.g. SLR, VRPorn, this does not work, as
// each site creates its own instance of the scraper with its own colly collector and
// its own independent limits.
//
// The ScraperRateLimiter provides a way to limit visits across multiple instances of the
// same scraper. Calls to the colly collector Visit function must first be passed to the
// ScraperRateLimiter, which will then coordinate between all instances and then call the
// colly Visit function.

// Limiters holds one rate limiter per configured domain/scraper id.
// It is populated by LoadScraperRateLimits and searched by GetRateLimiter.
var Limiters []*ScraperRateLimiter

// ScraperRateLimiter enforces a minimum (optionally randomized) delay between
// successive requests to a single site/domain, shared across scraper instances.
type ScraperRateLimiter struct {
	id          string        // domain or scraper identifier, e.g. "povr.com"
	mutex       sync.Mutex    // guards lastRequest across concurrent scraper goroutines
	lastRequest time.Time     // time of the most recent request; zero value means "no wait needed"
	minDelay    time.Duration // minimum delay between requests
	maxDelay    time.Duration // upper bound for the randomized delay; expected >= minDelay
}

// ScraperRateLimiterWait blocks the caller until the configured delay since the
// previous request for this limiter id has elapsed, then records the current
// time as the new last-request time. It returns immediately when no limiter is
// configured for the id. The mutex is held for the whole wait, which is what
// serializes concurrent scraper instances behind one another.
func ScraperRateLimiterWait(rateLimiter string) {
	l := GetRateLimiter(rateLimiter)
	if l == nil {
		// No rate limit configured for this id — proceed immediately.
		return
	}
	l.mutex.Lock()
	defer l.mutex.Unlock()

	if !l.lastRequest.IsZero() {
		// Required gap: minDelay plus, when a spread is configured, a random
		// extra amount up to (maxDelay - minDelay).
		wait := l.minDelay
		if spread := l.maxDelay - l.minDelay; spread > 0 {
			wait += time.Duration(rand.Int63n(int64(spread)))
		}
		if elapsed := time.Since(l.lastRequest); elapsed < wait {
			time.Sleep(wait - elapsed)
		}
	}
	l.lastRequest = time.Now()
}

// WaitBeforeVisit rate-limits a call to a colly Visit function for the given
// limiter id: it waits the configured delay first, then performs the visit.
// If the visit returns an error, colly rejected the request before making any
// HTTP call (these are errors colly checks before calling the URL, e.g. an
// already-visited or disallowed URL), so the limiter's last-request timestamp
// is cleared — the site was never contacted and the next call need not wait.
func WaitBeforeVisit(rateLimiter string, visitFunc func(string) error, pageURL string) {
	ScraperRateLimiterWait(rateLimiter)
	if err := visitFunc(pageURL); err != nil {
		if limiter := GetRateLimiter(rateLimiter); limiter != nil {
			// Reset under the mutex: ScraperRateLimiterWait reads and writes
			// lastRequest concurrently from other goroutines, so an unguarded
			// write here would be a data race.
			limiter.mutex.Lock()
			limiter.lastRequest = time.Time{}
			limiter.mutex.Unlock()
		}
	}
}
// ScraperRateLimiterCheckErrors clears the limiter's last-request timestamp
// when a colly Request call returned an error (meaning no HTTP request was
// actually made, so the next call need not wait). Unlike the original, it
// tolerates domains with no configured limiter — GetRateLimiter returns nil
// for those, and dereferencing it unconditionally would panic.
func ScraperRateLimiterCheckErrors(domain string, err error) {
	if err == nil {
		return
	}
	limiter := GetRateLimiter(domain)
	if limiter == nil {
		// No limiter configured for this domain — nothing to reset.
		return
	}
	// Reset under the mutex; lastRequest is shared with ScraperRateLimiterWait.
	limiter.mutex.Lock()
	limiter.lastRequest = time.Time{}
	limiter.mutex.Unlock()
}

func LoadScraperRateLimits() {
var mutex sync.Mutex
mutex.Lock()
defer mutex.Unlock()

var limiters []*ScraperRateLimiter
commonDb, _ := models.GetCommonDB()
var kv models.KV
commonDb.Where(models.KV{Key: "scraper_rate_limits"}).Find(&kv)
if kv.Key == "scraper_rate_limits" {
sites := gjson.Get(kv.Value, "sites")
for _, site := range sites.Array() {
name := site.Get("name").String()
minDelay := int(site.Get("mindelay").Int())
maxDelay := int(site.Get("maxdelay").Int())
if maxDelay < minDelay {
maxDelay = minDelay
}
limiters = append(limiters, &ScraperRateLimiter{id: name, minDelay: time.Duration(minDelay) * time.Millisecond, maxDelay: time.Duration(maxDelay) * time.Millisecond})
}
Limiters = limiters
}
}

// GetRateLimiter returns the configured limiter whose id matches the given
// id (typically a domain such as "povr.com"), or nil when no rate limit is
// configured for it. The original had an unreachable `break` after the
// `return` (flagged by go vet); it has been removed.
func GetRateLimiter(id string) *ScraperRateLimiter {
	for _, limiter := range Limiters {
		if limiter.id == id {
			return limiter
		}
	}
	return nil
}
8 changes: 5 additions & 3 deletions pkg/scrape/slrstudios.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
siteCollector.OnHTML(`div.c-pagination ul li a`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
WaitBeforeVisit("www.sexlikereal.com", siteCollector.Visit, pageURL)
}
})

Expand Down Expand Up @@ -386,7 +386,9 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
ctx := colly.NewContext()
ctx.Put("duration", duration)
ctx.Put("isTransScene", isTransScene)
sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterWait("www.sexlikereal.com")
err := sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterCheckErrors("www.sexlikereal.com", err)
}
}
})
Expand All @@ -399,7 +401,7 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)

} else {
siteCollector.Visit(siteURL + "?sort=most_recent")
WaitBeforeVisit("www.sexlikereal.com", siteCollector.Visit, siteURL+"?sort=most_recent")
}

if updateSite {
Expand Down
6 changes: 4 additions & 2 deletions pkg/scrape/vrphub.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,9 @@ func VRPHub(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
ctx := colly.NewContext()
ctx.Put("scene", &sc)

sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterWait("vrphub.com")
err := sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
ScraperRateLimiterCheckErrors("vrphub.com", err)
}
})

Expand All @@ -201,7 +203,7 @@ func VRPHub(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
ctx.Put("scene", &sc)
sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
} else {
siteCollector.Visit(siteURL)
WaitBeforeVisit("vrphub.com", siteCollector.Visit, siteURL)
}

if updateSite {
Expand Down
6 changes: 3 additions & 3 deletions pkg/scrape/vrporn.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,22 +145,22 @@ func VRPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
siteCollector.OnHTML(`div.pagination a.next`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
WaitBeforeVisit("vrporn.com", siteCollector.Visit, pageURL)
}
})

siteCollector.OnHTML(`body.tax-studio article.post div.tube-post a`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
// If scene exists in database, or the alternate source exists, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) {
sceneCollector.Visit(sceneURL)
WaitBeforeVisit("vrporn.com", sceneCollector.Visit, sceneURL)
}
})

if singleSceneURL != "" {
sceneCollector.Visit(singleSceneURL)
} else {
siteCollector.Visit(siteURL + "/?sort=newest")
WaitBeforeVisit("vrporn.com", siteCollector.Visit, siteURL+"/?sort=newest")
}

if updateSite {
Expand Down

0 comments on commit dfc27a4

Please sign in to comment.