Rewrite VirtualPorn (using API as a Source) #1654

Merged · 1 commit · Apr 2, 2024
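
This change rewrites the VirtualPorn (bvr) scraper to gather scene data from the site-api.project1service.com JSON API instead of scraping HTML pages. The scraper now pulls a session JWT out of the site's window.__JUAN.rawInstance script block and sends it in the Instance header on every API request. Because scene IDs switch to the API's numeric IDs, migration 0077-Update-VirtualPorn-ids converts existing scenes (preserving the old ID in LegacySceneID and updating Actions and External Reference Links), and the actor image selector is updated for the site's current markup.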
7 changes: 7 additions & 0 deletions pkg/migrations/migrations.go
@@ -1925,6 +1925,13 @@ func Migrate() {
return err
},
},
{
ID: "0077-Update-VirtualPorn-ids",
Migrate: func(tx *gorm.DB) error {
err := scrape.UpdateVirtualPornIds()
return err
},
},
})

if err := m.Migrate(); err != nil {
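The new migration entry follows the gormigrate pattern used throughout migrations.go: each entry has a unique ID that is recorded after a successful run, so UpdateVirtualPornIds executes exactly once per install. A minimal sketch of the mechanism, assuming gopkg.in/gormigrate.v1 with gorm v1 (the runMigrations wrapper is illustrative, not xbvr code):

    package main

    import (
        "github.com/jinzhu/gorm"
        gormigrate "gopkg.in/gormigrate.v1"
    )

    func runMigrations(db *gorm.DB) error {
        m := gormigrate.New(db, gormigrate.DefaultOptions, []*gormigrate.Migration{
            {
                // gormigrate stores this ID in its bookkeeping table, so the
                // migration is skipped on every startup after the first
                ID: "0077-Update-VirtualPorn-ids",
                Migrate: func(tx *gorm.DB) error {
                    return nil // xbvr calls scrape.UpdateVirtualPornIds() here
                },
            },
        })
        return m.Migrate()
    }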
2 changes: 1 addition & 1 deletion pkg/models/model_external_reference.go
@@ -518,7 +518,7 @@ func (scrapeRules ActorScraperConfig) buildGenericActorScraperRules() {

siteDetails = GenericScraperRuleSet{}
siteDetails.Domain = "virtualporn.com"
- siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `div.model__img-wrapper > img`, ResultType: "attr", Attribute: "src"})
+ siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `section[data-cy="actorProfilePicture"] img`, ResultType: "attr", Attribute: "src"})
scrapeRules.GenericActorScrapingConfig["bvr scrape"] = siteDetails

siteDetails = GenericScraperRuleSet{}
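The selector change points the generic actor scraper at the site's new profile-picture markup. These rules boil down to "find the node with a CSS selector, read an attribute"; a rough goquery equivalent (the HTML snippet and names are illustrative):

    package main

    import (
        "fmt"
        "strings"

        "github.com/PuerkitoBio/goquery"
    )

    func main() {
        html := `<section data-cy="actorProfilePicture"><img src="https://example.com/profile.webp"></section>`
        doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
        if err != nil {
            panic(err)
        }
        // ResultType "attr" with Attribute "src": select the node, read the attribute
        if src, ok := doc.Find(`section[data-cy="actorProfilePicture"] img`).Attr("src"); ok {
            fmt.Println(src) // becomes the actor's image_url
        }
    }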
306 changes: 200 additions & 106 deletions pkg/scrape/virtualporn.go
@@ -2,150 +2,167 @@ package scrape

import (
"encoding/json"
"errors"
"regexp"
"strconv"
"strings"
"sync"

"github.com/gocolly/colly/v2"
"github.com/nleeper/goment"
"github.com/mozillazg/go-slugify"
"github.com/thoas/go-funk"
"github.com/tidwall/gjson"
"github.com/xbapps/xbvr/pkg/models"
)

func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<- models.ScrapedScene, singleSceneURL string, singeScrapeAdditionalInfo string, limitScraping bool) error {
// this scraper is non-standard in that it gathers info via an API rather than scraping HTML pages
defer wg.Done()
scraperID := "bvr"
siteID := "VirtualPorn"
logScrapeStart(scraperID, siteID)
nextApiUrl := ""

sceneCollector := createCollector("virtualporn.com")
siteCollector := createCollector("virtualporn.com")
pageCnt := 1

sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
sc := models.ScrapedScene{}
sc.ScraperID = scraperID
sc.SceneType = "VR"
sc.Studio = "BangBros"
sc.Site = siteID
sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
sc.MembersUrl = "https://members.bangbros.com/product/655/movie/" + strings.Replace(strings.Split(e.Request.URL.String(), "/")[3], "video", "", 1)

// Title / Cover / ID / Filenames
e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
sc.Title = strings.TrimSpace(e.Attr("title"))

tmpCover := e.Request.AbsoluteURL(e.Request.AbsoluteURL(e.Attr("poster")))
sc.Covers = append(sc.Covers, tmpCover)

tmp := strings.Split(tmpCover, "/")
sc.SceneID = strings.Replace(tmp[5], "bvr", "bvr-", 1)

e.ForEach(`source`, func(id int, e *colly.HTMLElement) {
tmpFile := strings.Split(e.Attr("src"), "/")
sc.Filenames = append(sc.Filenames, strings.Replace(tmpFile[len(tmpFile)-1], "trailer-", "", -1))
apiCollector := createCollector("site-api.project1service.com")
offset := 0

apiCollector.OnResponse(func(r *colly.Response) {
sceneListJson := gjson.ParseBytes(r.Body)

// processScene maps one release object from the API response onto an xbvr ScrapedScene
processScene := func(scene gjson.Result) {
sc := models.ScrapedScene{}
sc.ScraperID = scraperID
sc.SceneType = "VR"
sc.Studio = "BangBros"
sc.Site = siteID
id := strconv.Itoa(int(scene.Get("id").Int()))
sc.SceneID = "bvr-" + id

sc.Title = scene.Get("title").String()
sc.HomepageURL = "https://virtualporn.com/video/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", ""))
sc.MembersUrl = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify.Slugify(strings.ReplaceAll(sc.Title, "'", ""))
sc.Synopsis = scene.Get("description").String()
dateParts := strings.Split(scene.Get("dateReleased").String(), "T")
sc.Released = dateParts[0]

scene.Get("images.poster").ForEach(func(key, imgGroup gjson.Result) bool {
if key.String() == "0" {
imgurl := imgGroup.Get("xl.urls.webp").String()
if imgurl != "" {
sc.Covers = append(sc.Covers, imgurl)
}

} else {
imgurl := imgGroup.Get("xl.urls.webp").String()
if imgurl != "" {
if len(sc.Covers) == 0 {
sc.Covers = append(sc.Covers, imgurl)
} else {
sc.Gallery = append(sc.Gallery, imgurl)
}
}
}
return true
})
})

file5kExists := false
for _, filename := range sc.Filenames {
if strings.Contains(filename, "5k") {
file5kExists = true
}
}
if !file5kExists {
sc.Filenames = append(sc.Filenames, strings.Replace(sc.SceneID, "bvr-", "bvr", -1)+"-5k.mp4")
}

// Gallery
e.ForEach(`div.player__thumbs img`, func(id int, e *colly.HTMLElement) {
sc.Gallery = append(sc.Gallery, e.Attr("src"))
})
// Cast
sc.ActorDetails = make(map[string]models.ActorDetails)
scene.Get("actors").ForEach(func(key, actor gjson.Result) bool {
name := actor.Get("name").String()
if actor.Get("gender").String() == "female" {
sc.Cast = append(sc.Cast, name)
}
sc.ActorDetails[actor.Get("name").String()] = models.ActorDetails{Source: scraperID + " scrape", ProfileUrl: "https://virtualporn.com/model/" + strconv.Itoa(int(actor.Get("id").Int())) + "/" + slugify.Slugify(name)}
return true
})

// trailer details
sc.TrailerType = "scrape_html"
params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "dl8-video source", ContentPath: "src", QualityPath: "quality"}
strParams, _ := json.Marshal(params)
sc.TrailerSrc = string(strParams)

// Cast
sc.ActorDetails = make(map[string]models.ActorDetails)
e.ForEach(`div.player__stats p.player__stats__cast a`, func(id int, e *colly.HTMLElement) {
if strings.TrimSpace(e.Text) != "" {
sc.Cast = append(sc.Cast, strings.TrimSpace(strings.ReplaceAll(e.Text, "!", "")))
sc.ActorDetails[strings.TrimSpace(strings.ReplaceAll(e.Text, "!", ""))] = models.ActorDetails{Source: scraperID + " scrape", ProfileUrl: e.Request.AbsoluteURL(e.Attr("href"))}
}
})
// Tags
scene.Get("tags").ForEach(func(key, tag gjson.Result) bool {
if tag.Get("isVisible").Bool() {
sc.Tags = append(sc.Tags, tag.Get("name").String())
}
return true
})

// Tags
e.ForEach(`div.video__tags__list a.tags`, func(id int, e *colly.HTMLElement) {
tag := strings.TrimSpace(e.Text)
if tag != "" {
sc.Tags = append(sc.Tags, strings.ToLower(tag))
}
})
// trailer & filename details
sc.TrailerType = "urls"
var trailers []models.VideoSource
scene.Get("children").ForEach(func(key, child gjson.Result) bool {
child.Get("videos.full.files").ForEach(func(key, file gjson.Result) bool {
quality := file.Get("format").String()
url := file.Get("urls.view").String()
filename := file.Get("urls.download").String()
if url != "" {
trailers = append(trailers, models.VideoSource{URL: url, Quality: quality})
}
// the download URL carries the real file name in its ?filename= query parameter
pos := strings.Index(filename, "?filename=")
if pos != -1 {
sc.Filenames = append(sc.Filenames, filename[pos+10:]) // 10 == len("?filename=")
}
return true
})
return true
})
trailerJson, _ := json.Marshal(models.VideoSourceResponse{VideoSources: trailers})
sc.TrailerSrc = string(trailerJson)

// Synopsis
e.ForEach(`p.player__description`, func(id int, e *colly.HTMLElement) {
sc.Synopsis = strings.TrimSpace(e.Text)
})
out <- sc

// Release date / Duration
tmpDate, _ := goment.New(strings.TrimSpace(e.Request.Ctx.GetAny("date").(string)), "MMM DD, YYYY")
sc.Released = tmpDate.Format("YYYY-MM-DD")
tmpDuration, err := strconv.Atoi(strings.TrimSpace(strings.Replace(e.Request.Ctx.GetAny("dur").(string), "mins", "", -1)))
if err == nil {
sc.Duration = tmpDuration
}
total := int(sceneListJson.Get("meta.total").Int())
scenes := sceneListJson.Get("result")
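// list requests (offset= in the query) return an array of scenes under result;
// a single-release lookup returns one object instead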
if strings.Contains(r.Request.URL.RawQuery, "offset=") {
scenes.ForEach(func(key, scene gjson.Result) bool {
// check if we have the scene already
matches := funk.Filter(knownScenes, func(s string) bool {
return strings.Contains(s, scene.Get("id").String())
})
if funk.IsEmpty(matches) {
processScene(scene)
}
return true
})
} else {
processScene(scenes)
}

out <- sc
})

siteCollector.OnHTML(`body`, func(e *colly.HTMLElement) {
sceneCnt := 0
e.ForEach(`div.recommended__item`, func(id int, e *colly.HTMLElement) {
sceneCnt += 1
})

if sceneCnt > 0 {
pageCnt += 1
offset += 24
if offset < total {
if !limitScraping {
siteCollector.Visit("https://virtualporn.com/videos/" + strconv.Itoa(pageCnt))
apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset))
}
}
})

siteCollector.OnHTML(`div.recommended__item`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.ChildAttr(`a`, "href"))

// If scene exists in the database, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) {

//Date & Duration from main index
ctx := colly.NewContext()
e.ForEach(`span.recommended__item__info__date`, func(id int, e *colly.HTMLElement) {
if id == 0 {
ctx.Put("date", strings.TrimSpace(e.Text))
}
})
e.ForEach(`span.recommended__item__time`, func(id int, e *colly.HTMLElement) {
if id == 0 {
ctx.Put("dur", strings.TrimSpace(e.Text))
}
siteCollector.OnHTML(`script`, func(e *colly.HTMLElement) {
// only interested in a script containing window\.__JUAN\.rawInstance
re := regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
matches := re.FindStringSubmatch(e.Text)
if len(matches) > 1 {
instanceJson := gjson.ParseBytes([]byte(matches[1]))
token := instanceJson.Get("jwt").String()
// set up api requests to use the token in the Instance Header
apiCollector.OnRequest(func(r *colly.Request) {
r.Headers.Set("Instance", token)
})

sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
apiCollector.Visit(nextApiUrl)
}
})

if singleSceneURL != "" {
ctx := colly.NewContext()
ctx.Put("dur", "")
ctx.Put("date", "")
urlParts := strings.Split(singleSceneURL, "/")
id := urlParts[len(urlParts)-2]
offset = 9999 // don't read more pages, we only need 1
nextApiUrl = "https://site-api.project1service.com/v2/releases/" + id
siteCollector.Visit("https://virtualporn.com/videos")

sceneCollector.Request("GET", singleSceneURL, nil, ctx, nil)
} else {
siteCollector.Visit("https://virtualporn.com/videos/" + strconv.Itoa(pageCnt))
// call virtualporn.com first, just to get the instance token for using the API this session
nextApiUrl = "https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv.Itoa(offset)
siteCollector.Visit("https://virtualporn.com/videos")
}

if updateSite {
@@ -158,3 +175,80 @@ func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
func init() {
registerScraper("bvr", "VirtualPorn", "https://images.cn77nd.com/members/bangbros/favicon/apple-icon-60x60.png", "virtualporn.com", VirtualPorn)
}

// one-off conversion routine called by migrations.go
func UpdateVirtualPornIds() error {
collector := createCollector("virtualporn.com")
apiCollector := createCollector("site-api.project1service.com")
offset := 0
sceneCnt := 0

collector.OnHTML(`script`, func(e *colly.HTMLElement) {
// only interested in a script containing window\.__JUAN\.rawInstance
re := regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
matches := re.FindStringSubmatch(e.Text)
if len(matches) > 1 {
instanceJson := gjson.ParseBytes([]byte(matches[1]))
token := instanceJson.Get("jwt").String()
// set up api requests to use the token in the Instance Header
apiCollector.OnRequest(func(r *colly.Request) {
r.Headers.Set("Instance", token)
})
apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=100&offset=" + strconv.Itoa(offset))
}
})

apiCollector.OnResponse(func(r *colly.Response) {
db, _ := models.GetDB()
defer db.Close()

sceneListJson := gjson.ParseBytes(r.Body)
sceneCnt = int(sceneListJson.Get("meta.total").Int())
scenes := sceneListJson.Get("result")
scenes.ForEach(func(key, apiScene gjson.Result) bool {
id := strconv.Itoa(int(apiScene.Get("id").Int()))
title := apiScene.Get("title").String()
dateParts := strings.Split(apiScene.Get("dateReleased").String(), "T")
releasedDate := dateParts[0]
var scene models.Scene
scene.GetIfExist("bvr-" + id)
if scene.ID > 0 {
// get the next record, this one already matches the new id
return true
}
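// the old IDs were derived from cover-image filenames and can't be computed from
// the API data, so fall back to matching existing scenes by scraper and release date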
db.Where("scraper_id = ? and release_date_text = ?", "bvr", releasedDate).Find(&scene)
if scene.ID > 0 {
oldSceneId := scene.SceneID
log.Infof("Updating SceneId %s to %s ", oldSceneId, "bvr-"+id)
scene.LegacySceneID = scene.SceneID
scene.SceneID = "bvr-" + id
scene.SceneURL = "https://virtualporn.com/video/" + id + "/" + slugify.Slugify(strings.ReplaceAll(title, "'", ""))
scene.MemberURL = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify.Slugify(strings.ReplaceAll(title, "'", ""))

scene.Save()
result := db.Model(&models.Action{}).Where("scene_id = ?", oldSceneId).Update("scene_id", scene.SceneID)
if result.Error != nil {
log.Infof("Converting Actions for VirtualPorn Scene %s to %s failed, %s", oldSceneId, scene.SceneID, result.Error)
}
result = db.Model(&models.ExternalReferenceLink{}).Where("internal_table = 'scenes' and internal_name_id = ?", oldSceneId).Update("internal_name_id", scene.SceneID)
if result.Error != nil {
log.Infof("Converting External Reference Links for VirtualPorn Scene %s to %s failed, %s", oldSceneId, scene.SceneID, result.Error)
}
}
return true
})
offset += 100
if offset < sceneCnt {
// keep the limit at 100 to match the offset step, otherwise pages of scenes would be skipped
apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=100&offset=" + strconv.Itoa(offset))
}
})

collector.Visit("https://virtualporn.com/videos")

if sceneCnt > 0 {
return nil
} else {
return errors.New("No scenes updated")
}

}
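
For reference, the token handshake both functions rely on can be reproduced outside colly. A minimal sketch, assuming the site still inlines window.__JUAN.rawInstance with a jwt field and that the releases API accepts it via the Instance header (URLs as in the scraper above, everything else illustrative):

    package main

    import (
        "fmt"
        "io"
        "net/http"
        "regexp"

        "github.com/tidwall/gjson"
    )

    func main() {
        // 1. Load any site page; the instance config is inlined in a <script> tag
        resp, err := http.Get("https://virtualporn.com/videos")
        if err != nil {
            panic(err)
        }
        body, _ := io.ReadAll(resp.Body)
        resp.Body.Close()

        // 2. Extract the rawInstance JSON and pull the session JWT out of it
        re := regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
        m := re.FindSubmatch(body)
        if len(m) < 2 {
            panic("instance config not found")
        }
        token := gjson.ParseBytes(m[1]).Get("jwt").String()

        // 3. Call the releases API with the token in the Instance header
        req, _ := http.NewRequest("GET", "https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=0", nil)
        req.Header.Set("Instance", token)
        apiResp, err := http.DefaultClient.Do(req)
        if err != nil {
            panic(err)
        }
        defer apiResp.Body.Close()
        apiBody, _ := io.ReadAll(apiResp.Body)
        fmt.Println(gjson.GetBytes(apiBody, "meta.total").Int(), "scenes available")
    }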