From 219f19a2107c0342c372622c91e5c66cbcae5842 Mon Sep 17 00:00:00 2001
From: toshski <104477758+toshski@users.noreply.github.com>
Date: Wed, 3 Apr 2024 03:57:53 +1300
Subject: [PATCH] scraper: Rewrite VirtualPorn using API as a Source (#1654)

---
 pkg/migrations/migrations.go           |   7 +
 pkg/models/model_external_reference.go |   2 +-
 pkg/scrape/virtualporn.go              | 306 ++++++++++++++++---
 3 files changed, 208 insertions(+), 107 deletions(-)

diff --git a/pkg/migrations/migrations.go b/pkg/migrations/migrations.go
index 4710c9296..6facfa8dd 100644
--- a/pkg/migrations/migrations.go
+++ b/pkg/migrations/migrations.go
@@ -1925,6 +1925,13 @@ func Migrate() {
 				return err
 			},
 		},
+		{
+			ID: "0077-Update-VirtualPorn-ids",
+			Migrate: func(tx *gorm.DB) error {
+				err := scrape.UpdateVirtualPornIds()
+				return err
+			},
+		},
 	})
 
 	if err := m.Migrate(); err != nil {
diff --git a/pkg/models/model_external_reference.go b/pkg/models/model_external_reference.go
index e65a0694a..63fac928d 100644
--- a/pkg/models/model_external_reference.go
+++ b/pkg/models/model_external_reference.go
@@ -518,7 +518,7 @@ func (scrapeRules ActorScraperConfig) buildGenericActorScraperRules() {
 
 	siteDetails = GenericScraperRuleSet{}
 	siteDetails.Domain = "virtualporn.com"
-	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `div.model__img-wrapper > img`, ResultType: "attr", Attribute: "src"})
+	siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `section[data-cy="actorProfilePicture"] img`, ResultType: "attr", Attribute: "src"})
 	scrapeRules.GenericActorScrapingConfig["bvr scrape"] = siteDetails
 
 	siteDetails = GenericScraperRuleSet{}
diff --git a/pkg/scrape/virtualporn.go b/pkg/scrape/virtualporn.go
index 4edc86162..ef6744725 100644
--- a/pkg/scrape/virtualporn.go
+++ b/pkg/scrape/virtualporn.go
@@ -2,150 +2,167 @@ package scrape
 
 import (
 	"encoding/json"
+	"errors"
+	"regexp"
 	"strconv"
 	"strings"
 	"sync"
 
 	"github.com/gocolly/colly/v2"
-	"github.com/nleeper/goment"
+	"github.com/mozillazg/go-slugify"
 	"github.com/thoas/go-funk"
+	"github.com/tidwall/gjson"
 	"github.com/xbapps/xbvr/pkg/models"
 )
 
 func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, singeScrapeAdditionalInfo string, limitScraping bool) error {
+	// this scraper is non-standard in that it gathers info via an api rather than scraping html pages
 	defer wg.Done()
 	scraperID := "bvr"
 	siteID := "VirtualPorn"
 	logScrapeStart(scraperID, siteID)
+	nextApiUrl := ""
 
-	sceneCollector := createCollector("virtualporn.com")
 	siteCollector := createCollector("virtualporn.com")
-	pageCnt := 1
-
-	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
-		sc := models.ScrapedScene{}
-		sc.ScraperID = scraperID
-		sc.SceneType = "VR"
-		sc.Studio = "BangBros"
-		sc.Site = siteID
-		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
-		sc.MembersUrl = "https://members.bangbros.com/product/655/movie/" + strings.Replace(strings.Split(e.Request.URL.String(), "/")[3], "video", "", 1)
-
-		// Title / Cover / ID / Filenames
-		e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
-			sc.Title = strings.TrimSpace(e.Attr("title"))
-
-			tmpCover := e.Request.AbsoluteURL(e.Request.AbsoluteURL(e.Attr("poster")))
-			sc.Covers = append(sc.Covers, tmpCover)
-
-			tmp := strings.Split(tmpCover, "/")
-			sc.SceneID = strings.Replace(tmp[5], "bvr", "bvr-", 1)
-
-			e.ForEach(`source`, func(id int, e *colly.HTMLElement) {
-				tmpFile := strings.Split(e.Attr("src"), "/")
-				sc.Filenames = append(sc.Filenames, strings.Replace(tmpFile[len(tmpFile)-1], "trailer-", "", -1))
+	apiCollector := createCollector("site-api.project1service.com")
+	offset := 0
+
+	apiCollector.OnResponse(func(r *colly.Response) {
+		sceneListJson := gjson.ParseBytes(r.Body)
+
+		processScene := func(scene gjson.Result) {
+			sc := models.ScrapedScene{}
+			sc.ScraperID = scraperID
+			sc.SceneType = "VR"
+			sc.Studio = "BangBros"
+			sc.Site = siteID
+			id := strconv.Itoa(int(scene.Get("id").Int()))
+			sc.SceneID = "bvr-" + id
+
+			sc.Title = scene.Get("title").String()
+			sc.HomepageURL = "https://virtualporn.com/video/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", ""))
+			sc.MembersUrl = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", ""))
+			sc.Synopsis = scene.Get("description").String()
+			dateParts := strings.Split(scene.Get("dateReleased").String(), "T")
+			sc.Released = dateParts[0]
+
+			scene.Get("images.poster").ForEach(func(key, imgGroup gjson.Result) bool {
+				if key.String() == "0" {
+					imgurl := imgGroup.Get("xl.urls.webp").String()
+					if imgurl != "" {
+						sc.Covers = append(sc.Covers, imgurl)
+					}
+
+				} else {
+					imgurl := imgGroup.Get("xl.urls.webp").String()
+					if imgurl != "" {
+						if len(sc.Covers) == 0 {
+							sc.Covers = append(sc.Covers, imgurl)
+						} else {
+							sc.Gallery = append(sc.Gallery, imgurl)
+						}
+					}
+				}
+				return true
 			})
-		})
-
-		file5kExists := false
-		for _, filename := range sc.Filenames {
-			if strings.Contains(filename, "5k") {
-				file5kExists = true
-			}
-		}
-		if !file5kExists {
-			sc.Filenames = append(sc.Filenames, strings.Replace(sc.SceneID, "bvr-", "bvr", -1)+"-5k.mp4")
-		}
-
-		// Gallery
-		e.ForEach(`div.player__thumbs img`, func(id int, e *colly.HTMLElement) {
-			sc.Gallery = append(sc.Gallery, e.Attr("src"))
-		})
 
+			// Cast
+			sc.ActorDetails = make(map[string]models.ActorDetails)
+			scene.Get("actors").ForEach(func(key, actor gjson.Result) bool {
+				name := actor.Get("name").String()
+				if actor.Get("gender").String() == "female" {
+					sc.Cast = append(sc.Cast, name)
+				}
+				sc.ActorDetails[actor.Get("name").String()] = models.ActorDetails{Source: scraperID + " scrape", ProfileUrl: "https://virtualporn.com/model/" + strconv.Itoa(int(actor.Get("id").Int())) + "/" + slugify.Slugify(name)}
+				return true
+			})
-		// trailer details
-		sc.TrailerType = "scrape_html"
-		params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "dl8-video source", ContentPath: "src", QualityPath: "quality"}
-		strParams, _ := json.Marshal(params)
-		sc.TrailerSrc = string(strParams)
-
-		// Cast
-		sc.ActorDetails = make(map[string]models.ActorDetails)
-		e.ForEach(`div.player__stats p.player__stats__cast a`, func(id int, e *colly.HTMLElement) {
-			if strings.TrimSpace(e.Text) != "" {
-				sc.Cast = append(sc.Cast, strings.TrimSpace(strings.ReplaceAll(e.Text, "!", "")))
-				sc.ActorDetails[strings.TrimSpace(strings.ReplaceAll(e.Text, "!", ""))] = models.ActorDetails{Source: scraperID + " scrape", ProfileUrl: e.Request.AbsoluteURL(e.Attr("href"))}
-			}
-		})
 
+			// Tags
+			scene.Get("tags").ForEach(func(key, tag gjson.Result) bool {
+				if tag.Get("isVisible").Bool() {
+					sc.Tags = append(sc.Tags, tag.Get("name").String())
+				}
+				return true
+			})
-		// Tags
-		e.ForEach(`div.video__tags__list a.tags`, func(id int, e *colly.HTMLElement) {
-			tag := strings.TrimSpace(e.Text)
-			if tag != "" {
-				sc.Tags = append(sc.Tags, strings.ToLower(tag))
-			}
-		})
 
+			// trailer & filename details
+			sc.TrailerType = "urls"
+			var trailers []models.VideoSource
+			scene.Get("children").ForEach(func(key, child gjson.Result) bool {
+				child.Get("videos.full.files").ForEach(func(key, file gjson.Result) bool {
+					quality := file.Get("format").String()
+					url := file.Get("urls.view").String()
+					filename := file.Get("urls.download").String()
+					if url != "" {
+						trailers = append(trailers, models.VideoSource{URL: url, Quality: quality})
+					}
+					pos := strings.Index(filename, "?filename=")
+					if pos != -1 {
+						sc.Filenames = append(sc.Filenames, filename[pos+10:])
+					}
+					return true
+				})
+				return true
+			})
+			trailerJson, _ := json.Marshal(models.VideoSourceResponse{VideoSources: trailers})
+			sc.TrailerSrc = string(trailerJson)
 
-		// Synposis
-		e.ForEach(`p.player__description`, func(id int, e *colly.HTMLElement) {
-			sc.Synopsis = strings.TrimSpace(e.Text)
-		})
+			out <- sc
 
-		// Release date / Duration
-		tmpDate, _ := goment.New(strings.TrimSpace(e.Request.Ctx.GetAny("date").(string)), "MMM DD, YYYY")
-		sc.Released = tmpDate.Format("YYYY-MM-DD")
-		tmpDuration, err := strconv.Atoi(strings.TrimSpace(strings.Replace(e.Request.Ctx.GetAny("dur").(string), "mins", "", -1)))
-		if err == nil {
-			sc.Duration = tmpDuration
+		}
+		total := int(sceneListJson.Get("meta.total").Int())
+		scenes := sceneListJson.Get("result")
+		if strings.Contains(r.Request.URL.RawQuery, "offset=") {
+			scenes.ForEach(func(key, scene gjson.Result) bool {
+				// check if we have the scene already
+				matches := funk.Filter(knownScenes, func(s string) bool {
+					return strings.Contains(s, scene.Get("id").String())
+				})
+				if funk.IsEmpty(matches) {
+					processScene(scene)
+				}
+				return true
+			})
+		} else {
+			processScene(scenes)
 		}
-		out <- sc
-	})
-
-	siteCollector.OnHTML(`body`, func(e *colly.HTMLElement) {
-		sceneCnt := 0
-		e.ForEach(`div.recommended__item`, func(id int, e *colly.HTMLElement) {
-			sceneCnt += 1
-		})
-
-		if sceneCnt > 0 {
-			pageCnt += 1
+		offset += 24
+		if offset < total {
 			if !limitScraping {
-				siteCollector.Visit("https://virtualporn.com/videos/" + strconv.Itoa(pageCnt))
+				apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset))
 			}
 		}
 	})
 
-	siteCollector.OnHTML(`div.recommended__item`, func(e *colly.HTMLElement) {
-		sceneURL := e.Request.AbsoluteURL(e.ChildAttr(`a`, "href"))
-
-		// If scene exist in database, there's no need to scrape
-		if !funk.ContainsString(knownScenes, sceneURL) {
-
-			//Date & Duration from main index
-			ctx := colly.NewContext()
-			e.ForEach(`span.recommended__item__info__date`, func(id int, e *colly.HTMLElement) {
-				if id == 0 {
-					ctx.Put("date", strings.TrimSpace(e.Text))
-				}
-			})
-			e.ForEach(`span.recommended__item__time`, func(id int, e *colly.HTMLElement) {
-				if id == 0 {
-					ctx.Put("dur", strings.TrimSpace(e.Text))
-				}
+	siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) {
+		// only interested in a script containing window\.__JUAN\.rawInstance
+		re := regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
+		matches := re.FindStringSubmatch(e.Text)
+		if len(matches) > 1 {
+			instanceJson := gjson.ParseBytes([]byte(matches[1]))
+			token := instanceJson.Get("jwt").String()
+			// set up api requests to use the token in the Instance Header
+			apiCollector.OnRequest(func(r *colly.Request) {
+				r.Headers.Set("Instance", token)
 			})
-
-			sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
+			apiCollector.Visit(nextApiUrl)
 		}
 	})
-
 	if singleSceneURL != "" {
 		ctx := colly.NewContext()
 		ctx.Put("dur", "")
 		ctx.Put("date", "")
+		urlParts := strings.Split(singleSceneURL, "/")
+		id := urlParts[len(urlParts)-2]
+		offset = 9999 // don't read more pages, we only need 1
+		nextApiUrl = "https://site-api.project1service.com/v2/releases/" + id
+		siteCollector.Visit("https://virtualporn.com/videos")
-		sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
 	} else {
-		siteCollector.Visit("https://virtualporn.com/videos/" + strconv.Itoa(pageCnt))
+		// call virtualporn.com, this is just to get the instance token to use the api for this session
+		nextApiUrl = "https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset)
+		siteCollector.Visit("https://virtualporn.com/videos")
 	}
 
 	if updateSite {
@@ -158,3 +175,80 @@ func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
 func init() {
 	registerScraper("bvr", "VirtualPorn", "https://images.cn77nd.com/members/bangbros/favicon/apple-icon-60x60.png", "virtualporn.com", VirtualPorn)
 }
+
+// one off conversion routine called by migrations.go
+func UpdateVirtualPornIds() error {
+	collector := createCollector("virtualporn.com")
+	apiCollector := createCollector("site-api.project1service.com")
+	offset := 0
+	sceneCnt := 0
+
+	collector.OnHTML(`script`, func(e *colly.HTMLElement) {
+		// only interested in a script containing window\.__JUAN\.rawInstance
+		re := regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
+		matches := re.FindStringSubmatch(e.Text)
+		if len(matches) > 1 {
+			instanceJson := gjson.ParseBytes([]byte(matches[1]))
+			token := instanceJson.Get("jwt").String()
+			// set up api requests to use the token in the Instance Header
+			apiCollector.OnRequest(func(r *colly.Request) {
+				r.Headers.Set("Instance", token)
+			})
+			apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=100&offset=" + strconv.Itoa(offset))
+		}
+	})
+
+	apiCollector.OnResponse(func(r *colly.Response) {
+		db, _ := models.GetDB()
+		defer db.Close()
+
+		sceneListJson := gjson.ParseBytes(r.Body)
+		sceneCnt = int(sceneListJson.Get("meta.total").Int())
+		scenes := sceneListJson.Get("result")
+		scenes.ForEach(func(key, apiScene gjson.Result) bool {
+			id := strconv.Itoa(int(apiScene.Get("id").Int()))
+			title := apiScene.Get("title").String()
+			dateParts := strings.Split(apiScene.Get("dateReleased").String(), "T")
+			releasedDate := dateParts[0]
+			var scene models.Scene
+			scene.GetIfExist("bvr-" + id)
+			if scene.ID > 0 {
+				// get the next record, this one already matches the new id
+				return true
+			}
+			db.Where("scraper_id = ? and release_date_text = ?", "bvr", releasedDate).Find(&scene)
+			if scene.ID > 0 {
+				oldSceneId := scene.SceneID
+				log.Infof("Updating SceneId %s to %s ", oldSceneId, "bvr-"+id)
+				scene.LegacySceneID = scene.SceneID
+				scene.SceneID = "bvr-" + id
+				scene.SceneURL = "https://virtualporn.com/video/" + id + "/" + slugify.Slugify(strings.ReplaceAll(title, "'", ""))
+				scene.MemberURL = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify.Slugify(strings.ReplaceAll(title, "'", ""))
+
+				scene.Save()
+				result := db.Model(&models.Action{}).Where("scene_id = ?", oldSceneId).Update("scene_id", scene.SceneID)
+				if result.Error != nil {
+					log.Infof("Converting Actions for VirtualPorn Scene %s to %s failed, %s", oldSceneId, scene.SceneID, result.Error)
+				}
+				result = db.Model(&models.ExternalReferenceLink{}).Where("internal_table = 'scenes' and internal_name_id = ?", oldSceneId).Update("internal_name_id", scene.SceneID)
+				if result.Error != nil {
+					log.Infof("Converting External Reference Links for VirtualPorn Scene %s to %s failed, %s", oldSceneId, scene.SceneID, result.Error)
+				}
+			}
+			return true
+		})
+		offset += 100
+		if offset < sceneCnt {
+			apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset))
+		}
+	})
+
+	collector.Visit("https://virtualporn.com/videos")
+
+	if sceneCnt > 0 {
+		return nil
+	} else {
+		return errors.New("No scenes updated")
+	}
+
+}
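Note (not part of the patch): both VirtualPorn() and UpdateVirtualPornIds() rely on the same handshake. virtualporn.com embeds a JWT in a script tag as window.__JUAN.rawInstance, and that token has to be sent in the "Instance" header on every call to site-api.project1service.com. The standalone sketch below illustrates that flow with plain net/http instead of colly; the endpoints, header name and JSON fields (jwt, meta.total, result, id, title) are assumptions taken from the diff above, and error handling is kept to a minimum.

// Standalone sketch of the token handshake the rewritten scraper relies on.
// Endpoints, the "Instance" header and the JSON field names are assumed from
// the patch above; this is an illustration, not the project's code.
package main

import (
	"fmt"
	"io"
	"net/http"
	"regexp"

	"github.com/tidwall/gjson"
)

func main() {
	// 1. Load a public page and pull the rawInstance JSON blob out of its script tag.
	resp, err := http.Get("https://virtualporn.com/videos")
	if err != nil {
		panic(err)
	}
	page, _ := io.ReadAll(resp.Body)
	resp.Body.Close()

	re := regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
	m := re.FindSubmatch(page)
	if len(m) < 2 {
		fmt.Println("rawInstance not found")
		return
	}
	token := gjson.GetBytes(m[1], "jwt").String()

	// 2. Call the releases API with the JWT in the Instance header.
	req, _ := http.NewRequest("GET",
		"https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=0", nil)
	req.Header.Set("Instance", token)
	apiResp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	body, _ := io.ReadAll(apiResp.Body)
	apiResp.Body.Close()

	// 3. Same response shape the scraper consumes: meta.total for paging, result for scenes.
	list := gjson.ParseBytes(body)
	fmt.Println("total scenes:", list.Get("meta.total").Int())
	list.Get("result").ForEach(func(_, scene gjson.Result) bool {
		fmt.Println("bvr-"+scene.Get("id").String(), scene.Get("title").String())
		return true
	})
}

Paging then works the same way as in the OnResponse handler of the patch: request a page of releases at a time and advance offset until it reaches meta.total.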