Skip to content

Commit

Permalink
WetVR Scraper changes (xbapps#1488)
Browse files Browse the repository at this point in the history
  • Loading branch information
toshski authored Nov 13, 2023
1 parent 65a5c70 commit 1371217
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 27 deletions.
30 changes: 30 additions & 0 deletions pkg/migrations/migrations.go
Original file line number Diff line number Diff line change
Expand Up @@ -1803,6 +1803,36 @@ func Migrate() {
return nil
},
},
{
	// 0071-Update-WetVR rewrites every stored WetVR scene so that it uses the
	// "scrape_html" trailer type, points its member URL at the /members/ path
	// and, when no 2700px filename is recorded yet, gains the four derived
	// filenames (2700/2048/1600/960).
	//
	// Any failure to load, encode or save a scene is returned so the migration
	// is not recorded as applied while rows are left in their old state.
	ID: "0071-Update-WetVR",
	Migrate: func(tx *gorm.DB) error {
		var scenes []models.Scene
		if err := tx.Where("site = ?", "WetVR").Find(&scenes).Error; err != nil {
			return err
		}

		// Index the slice instead of ranging by value to avoid copying each scene.
		for i := range scenes {
			scene := &scenes[i]

			scene.TrailerType = "scrape_html"
			scene.TrailerSource = `{"scene_url":"` + scene.SceneURL + `","html_element":"deo-video source","extract_regex":"","content_base_url":"","record_path":"","content_path":"src","encoding_path":"","quality_path":"quality"}`
			scene.MemberURL = strings.Replace(scene.SceneURL, "https://wetvr.com/", "https://wetvr.com/members/", 1)

			// Only extend the filename list when the 2700px variant is missing;
			// its presence is taken to mean the derived names were already added.
			if !strings.Contains(scene.FilenamesArr, "2700.mp4") {
				var filenames []string
				// Best effort: an empty or malformed FilenamesArr must not abort
				// the migration, so a decode error is deliberately ignored and
				// whatever was decoded (possibly nothing) is kept.
				_ = json.Unmarshal([]byte(scene.FilenamesArr), &filenames)

				baseFilename := strings.TrimPrefix(scene.SceneURL, "https://wetvr.com/video/")
				for _, res := range []string{"2700", "2048", "1600", "960"} {
					filenames = append(filenames, "wetvr-"+baseFilename+"-"+res+".mp4")
				}

				encoded, err := json.Marshal(filenames)
				if err != nil {
					return err
				}
				scene.FilenamesArr = string(encoded)
			}

			if err := tx.Save(scene).Error; err != nil {
				return err
			}
		}
		return nil
	},
},
})

if err := m.Migrate(); err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/models/model_scene.go
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,8 @@ func queryScenes(db *gorm.DB, r RequestSceneList) (*gorm.DB, *gorm.DB) {
tx = tx.Order("updated_at desc")
case "script_published_desc":
tx = tx.Order("script_published desc")
case "scene_id_desc":
tx = tx.Order("scene_id desc")
case "random":
if dbConn.Driver == "mysql" {
tx = tx.Order("rand()")
Expand Down
57 changes: 30 additions & 27 deletions pkg/scrape/wetvr.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ package scrape

import (
"encoding/json"
"regexp"
"strconv"
"strings"
"sync"
"time"
Expand All @@ -24,89 +22,94 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
sceneCollector := createCollector("wetvr.com")
siteCollector := createCollector("wetvr.com")

// RegEx Patterns
durationRegEx := regexp.MustCompile(`(?i)DURATION:\W(\d+)`)

sceneCollector.OnHTML(`div#t2019`, func(e *colly.HTMLElement) {
sceneCollector.OnHTML(`div#trailer_player`, func(e *colly.HTMLElement) {
sc := models.ScrapedScene{}
sc.ScraperID = scraperID
sc.SceneType = "VR"
sc.Studio = "WetVR"
sc.Site = siteID
sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
sc.MembersUrl = strings.Replace(sc.HomepageURL, "https://wetvr.com/", "https://members.wetvr.com/", 1)
sc.MembersUrl = strings.Replace(sc.HomepageURL, "https://wetvr.com/", "https://wetvr.com/members/", 1)

// Scene ID - get from previous page
sc.SiteID = e.Request.Ctx.GetAny("scene-id").(string)
sc.SceneID = slugify.Slugify(sc.Site + "-" + sc.SiteID)

// Title
sc.Title = strings.TrimSpace(e.ChildText(`h1.t2019-stitle`))
sc.Title = strings.TrimSpace(e.ChildText(`div.scene-info h1`))

// Date
scenedate := e.Request.Ctx.GetAny("scene-date").(string)
if scenedate != "" {
tmpDate, _ := goment.New(scenedate, "MMMM DD, YYYY")
tmpDate, _ := goment.New(scenedate, "MM/DD/YYYY")
sc.Released = tmpDate.Format("YYYY-MM-DD")
}

// Duration
tmpDuration := durationRegEx.FindStringSubmatch(e.ChildText(`div#t2019-stime`))[1]
sc.Duration, _ = strconv.Atoi(tmpDuration)

// Cover URLs
coverSrc := e.ChildAttr(`div#t2019-video deo-video`, "cover-image")
coverSrc := e.ChildAttr(`div[id="player-wrapper"] deo-video`, "cover-image")
if coverSrc == "" {
coverSrc = e.ChildAttr(`div#t2019-video img#no-player-image`, "src")
coverSrc = strings.Split(e.ChildAttr(`div[id="no-player-wrapper"] div.bg-cover`, "style"), "background-image: url(")[1]
coverSrc = strings.TrimPrefix(coverSrc, "'")
coverSrc = strings.TrimSuffix(coverSrc, "')")
}
if coverSrc != "" {
sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(coverSrc))
}

// Gallery
e.ForEach(`div.t2019-thumbs img`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.items-center a[href="/join" ] img`, func(id int, e *colly.HTMLElement) {
if id > 0 {
sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("src")))
}
})

// Synopsis
sc.Synopsis = strings.TrimSpace(e.ChildText(`div#t2019-description`))
sc.Synopsis = strings.TrimSpace(e.ChildText(`div.items-start span`))

// trailer details
sc.TrailerType = "deovr"
sc.TrailerSrc = strings.Replace(sc.HomepageURL, "/video/", "/deovr/", 1)
sc.TrailerType = "scrape_html"
params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "deo-video source", ContentPath: "src", QualityPath: "quality"}
strParams, _ := json.Marshal(params)
sc.TrailerSrc = string(strParams)

// Cast
e.ForEach(`div#t2019-models a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`a[href^="/models/"]`, func(id int, e *colly.HTMLElement) {
sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
})

// Tags
// no tags on this site

// Filenames
// NOTE: no way to guess filename
baseFilename := strings.TrimPrefix(sc.HomepageURL, "https://wetvr.com/video/")
sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-2700.mp4")
sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-2048.mp4")
sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-1600.mp4")
sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-960.mp4")

out <- sc
})

siteCollector.OnHTML(`ul.pagination a.page-link`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`ul a.page-link`, func(e *colly.HTMLElement) {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
})

siteCollector.OnHTML(`div.card`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`div:has(p:contains("Latest")) div[id^="r-"]`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))

// If scene exist in database, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {

// SceneID and release date are only available here on div.card
ctx := colly.NewContext()
ctx.Put("scene-id", e.Attr("data-video-id"))
ctx.Put("scene-date", e.Attr("data-date"))

ctx.Put("scene-id", strings.TrimPrefix(e.Attr("id"), "r-"))
// get the date if it exists
pDate := e.DOM.Find(`div.video-thumbnail-footer div>span`)
if pDate.Length() > 0 {
ctx.Put("scene-date", pDate.Text())
} else {
ctx.Put("scene-date", "")
}
sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
}
})
Expand Down
1 change: 1 addition & 0 deletions ui/src/views/scenes/Filters.vue
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
<option value="last_opened_desc">↓ {{ $t("Last viewed date") }}</option>
<option value="last_opened_asc">↑ {{ $t("Last viewed date") }}</option>
<option value="script_published_desc">↓ {{ $t("Published Script Added") }}</option>
<option value="scene_id_desc">↓ {{ $t("Scene Id") }}</option>
<option value="random">↯ {{ $t("Random") }}</option>
</select>
</div>
Expand Down

0 comments on commit 1371217

Please sign in to comment.