Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rebuild of Reality Lovers Fixes #1851 #1864

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 104 additions & 102 deletions pkg/scrape/realitylovers.go
Original file line number Diff line number Diff line change
@@ -1,146 +1,148 @@
package scrape

import (
"fmt"
"regexp"
"strings"
"time"

"github.com/go-resty/resty/v2"
"github.com/gocolly/colly/v2"
"github.com/mozillazg/go-slugify"
"github.com/nleeper/goment"
"github.com/thoas/go-funk"
"github.com/tidwall/gjson"
"github.com/xbapps/xbvr/pkg/models"
)

func RealityLoversSite(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, scraperID string, siteID string, domain string, singeScrapeAdditionalInfo string, limitScraping bool) error {
defer wg.Done()
logScrapeStart(scraperID, siteID)

sceneCollector := createCollector("realitylovers.com", "engine.realitylovers.com", "tsvirtuallovers.com", "engine.tsvirtuallovers.com")
sceneCollector := createCollector(domain)
siteCollector := createCollector(domain)

sceneCollector.OnResponse(func(r *colly.Response) {
if r.StatusCode != 200 {
return
}
json := gjson.ParseBytes(r.Body)
// These cookies are needed for age verification.
siteCollector.OnRequest(func(r *colly.Request) {
r.Headers.Set("Cookie", "agreedToDisclaimer=true")
})

sceneCollector.OnRequest(func(r *colly.Request) {
r.Headers.Set("Cookie", "agreedToDisclaimer=true")
})

sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
sc := models.ScrapedScene{}
sc.ScraperID = scraperID
sc.SceneType = "VR"
sc.Studio = "RealityLovers"
sc.Site = siteID
sc.HomepageURL = r.Request.Ctx.Get("sceneURL")

// Scene ID
sc.SiteID = json.Get("contentId").String()
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
sc.SiteID = ""
sc.HomepageURL, _ = strings.CutSuffix(e.Request.URL.String(), "/")

sc.Title = json.Get("title").String()
sc.Synopsis = json.Get("description").String()
// Cover Url
coverURL := e.Request.Ctx.GetAny("coverURL").(string)
sc.Covers = append(sc.Covers, coverURL)

covers := json.Get("mainImages.0.imgSrcSet").String()
sc.Covers = append(sc.Covers, strings.Fields(covers)[0])
// Gallery
e.ForEach(`div.owl-carousel div.item`, func(id int, e *colly.HTMLElement) {
sc.Gallery = append(sc.Gallery, e.ChildAttr("img", "src"))
})

sc.Released = json.Get("releaseDate").String()
// In case we scrape a single scene, use one of the gallery images for the cover
if singleSceneURL != "" {
sc.Covers = append(sc.Covers, sc.Gallery[0])
}

// Cast
sc.ActorDetails = make(map[string]models.ActorDetails)
json.Get("starring").ForEach(func(_, star gjson.Result) bool {
name := star.Get("name").String()
sc.Cast = append(sc.Cast, name)
sc.ActorDetails[name] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: "https://" + domain + "/" + star.Get("uri").String()}
return true
})

// Gallery
json.Get("screenshots").ForEach(func(_, screenshot gjson.Result) bool {
imgset := screenshot.Get("galleryImgSrcSet").String()
images := strings.Split(imgset, ",")
selectedImage := ""
for _, image := range images {
parts := strings.Fields(image)
if selectedImage == "" {
selectedImage = parts[0]
}
if parts[1] == "1920w" {
selectedImage = parts[0]
e.ForEach(`table.video-description-list tbody`, func(id int, e *colly.HTMLElement) {
// Cast
e.ForEach(`tr:nth-child(1) a`, func(id int, e *colly.HTMLElement) {
if strings.TrimSpace(e.Text) != "" {
sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
sc.ActorDetails[strings.TrimSpace(e.Text)] = models.ActorDetails{Source: sc.ScraperID + " scrape", ProfileUrl: e.Request.AbsoluteURL(e.Attr("href"))}
}
}
sc.Gallery = append(sc.Gallery, selectedImage)
return true
})
})

// Tags
e.ForEach(`tr:nth-child(2) a`, func(id int, e *colly.HTMLElement) {
tag := strings.TrimSpace(e.Text)

// Tags
json.Get("categories").ForEach(func(_, category gjson.Result) bool {
sc.Tags = append(sc.Tags, category.Get("name").String())
return true
// Standardize the resolution tags
tag, _ = strings.CutSuffix(strings.ToLower(tag), " vr porn")
tag, _ = strings.CutSuffix(tag, " ts")
sc.Tags = append(sc.Tags, tag)
})

// Date
tmpDate, _ := goment.New(strings.TrimSpace(e.ChildText(`tr:nth-child(3) td:last-child`)), "MMMM DD, YYYY")
sc.Released = tmpDate.Format("YYYY-MM-DD")
})

sc.TrailerType = "url"
sc.TrailerSrc = json.Get("trailerUrl").String()
// Synopsis
sc.Synopsis = strings.TrimSpace(e.ChildText("div.accordion-body"))

tmp := strings.Split(sc.HomepageURL, "/")

// Title
sc.Title = e.Request.Ctx.GetAny("title").(string)

// Fall back in case of single-scene scraping
if sc.Title == "" {
sc.Title = strings.ReplaceAll(tmp[len(tmp)-1], "-", " ")
}

// Scene ID
sc.SiteID = tmp[len(tmp)-2]

if sc.SiteID != "" {
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

out <- sc
// save only if we got a SceneID
out <- sc
}
})

// Request scenes via REST API
if singleSceneURL == "" {
page := 0
for {
url := fmt.Sprintf("https://engine.%s/content/videos?max=12&page=%v&pornstar=&category=&perspective=&sort=NEWEST", domain, page)
log.Infoln("visiting", url)
r, err := resty.New().R().
SetHeader("User-Agent", UserAgent).
Get(url)

if err != nil {
log.Errorf("Error fetching BaberoticaVR feed: %s", err)
logScrapeFinished(scraperID, siteID)
return nil
}
siteCollector.OnHTML(`a.page-link[aria-label="Next"]:not(.disabled)`, func(e *colly.HTMLElement) {
if !limitScraping {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
}
})

scenecnt := 0
if err == nil || r.StatusCode() == 200 {
result := gjson.Get(r.String(), "contents")
result.ForEach(func(key, value gjson.Result) bool {
scenecnt++
sceneURL := "https://" + domain + "/" + value.Get("videoUri").String()
sceneID := value.Get("id").String()
if !funk.ContainsString(knownScenes, sceneURL) {
ctx := colly.NewContext()
ctx.Put("sceneURL", sceneURL)
sceneCollector.Request("GET", "https://engine."+domain+"/content/videoDetail?contentId="+sceneID, nil, ctx, nil)
}
return true
})
}
if err != nil {
log.Errorf("Error visiting %s %s", url, err)
}
if r.StatusCode() != 200 {
log.Errorf("Return code visiting %s %v", url, r.StatusCode())
}
siteCollector.OnHTML(`div#gridView`, func(e *colly.HTMLElement) {

e.ForEach("div.video-grid-view", func(id int, e *colly.HTMLElement) {

if scenecnt < 12 {
break
re := regexp.MustCompile(`.+[jJ][pP][gG]`)
tmp := strings.Split(e.ChildAttr("img", "srcset"), ",")
r := re.FindStringSubmatch(tmp[len(tmp)-1])
coverURL := ""

if len(r) > 0 {
coverURL = strings.TrimSpace(r[0])
} else {
log.Warnln("Couldn't Find Cover Img in srcset:", tmp)
}
page++
if limitScraping {
break

title := e.ChildText("p.card-title")

sceneURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))

// If scene exist in database, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) {
ctx := colly.NewContext()
ctx.Put("coverURL", coverURL)
ctx.Put("title", title)
sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
}
// have seen instances of status 404, so make sure we don't spam with calls
time.Sleep(time.Second)
}
} else {
re := regexp.MustCompile(`.com\/vd\/(\d+)\/`)
match := re.FindStringSubmatch(singleSceneURL)
if len(match) >= 2 {
ctx := colly.NewContext()
ctx.Put("sceneURL", singleSceneURL)
sceneCollector.Request("GET", "https://engine."+domain+"/content/videoDetail?contentId="+match[1], nil, ctx, nil)
}
})
})

if singleSceneURL != "" {
ctx := colly.NewContext()
ctx.Put("coverURL", "")
ctx.Put("title", "")
sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
} else {
siteCollector.Visit("https://" + domain + "/videos/page1")
}

if updateSite {
Expand Down
Loading