From 1371217219ec182e03f71b39ac9dafa25bc91118 Mon Sep 17 00:00:00 2001
From: toshski <104477758+toshski@users.noreply.github.com>
Date: Tue, 14 Nov 2023 09:17:29 +1300
Subject: [PATCH] WetVR Scraper changes (#1488)

---
Review fixups folded into this patch (text between "---" and the first
"diff --git" line is commentary and is not applied):
  * migrations.go: return the tx.Save error instead of dropping it, and make
    the ignored json.Unmarshal result explicit.
  * wetvr.go: guard the "background-image: url(" split so a page without
    that inline style cannot index past the end of the slice.
The "index" blob ids below still belong to the original commit.

 pkg/migrations/migrations.go    | 36 ++++++++++++++++++++
 pkg/models/model_scene.go       |  2 ++
 pkg/scrape/wetvr.go             | 62 ++++++++++++++++++++---------------
 ui/src/views/scenes/Filters.vue |  1 +
 4 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/pkg/migrations/migrations.go b/pkg/migrations/migrations.go
index ef087c9aa..afa05ed4c 100644
--- a/pkg/migrations/migrations.go
+++ b/pkg/migrations/migrations.go
@@ -1803,6 +1803,42 @@ func Migrate() {
 				return nil
 			},
 		},
+		{
+			ID: "0071-Update-WetVR",
+			Migrate: func(tx *gorm.DB) error {
+				var scenes []models.Scene
+
+				err := tx.Where("site = ?", "WetVR").Find(&scenes).Error
+				if err != nil {
+					return err
+				}
+				for _, scene := range scenes {
+					scene.TrailerType = "scrape_html"
+					scene.TrailerSource = `{"scene_url":"` + scene.SceneURL + `","html_element":"deo-video source","extract_regex":"","content_base_url":"","record_path":"","content_path":"src","encoding_path":"","quality_path":"quality"}`
+					scene.MemberURL = strings.Replace(scene.SceneURL, "https://wetvr.com/", "https://wetvr.com/members/", 1)
+
+					// Best effort: an empty or malformed FilenamesArr simply yields
+					// an empty list, so the decode error is deliberately ignored.
+					var filenames []string
+					_ = json.Unmarshal([]byte(scene.FilenamesArr), &filenames)
+					baseFilename := strings.TrimPrefix(scene.SceneURL, "https://wetvr.com/video/")
+					if !strings.Contains(scene.FilenamesArr, "2700.mp4") {
+						filenames = append(filenames, "wetvr-"+baseFilename+"-2700.mp4")
+						filenames = append(filenames, "wetvr-"+baseFilename+"-2048.mp4")
+						filenames = append(filenames, "wetvr-"+baseFilename+"-1600.mp4")
+						filenames = append(filenames, "wetvr-"+baseFilename+"-960.mp4")
+						tmp, _ := json.Marshal(filenames)
+						scene.FilenamesArr = string(tmp)
+					}
+					// Surface a failed write to the caller instead of silently
+					// reporting the migration as successful.
+					if err = tx.Save(&scene).Error; err != nil {
+						return err
+					}
+				}
+				return nil
+			},
+		},
 	})
 
 	if err := m.Migrate(); err != nil {
diff --git a/pkg/models/model_scene.go b/pkg/models/model_scene.go
index 62d8b225e..980607413 100644
--- a/pkg/models/model_scene.go
+++ b/pkg/models/model_scene.go
@@ -1047,6 +1047,8 @@ func queryScenes(db *gorm.DB, r RequestSceneList) (*gorm.DB, *gorm.DB) {
 		tx = tx.Order("updated_at desc")
 	case "script_published_desc":
 		tx = tx.Order("script_published desc")
+	case "scene_id_desc":
+		tx = tx.Order("scene_id desc")
 	case "random":
 		if dbConn.Driver == "mysql" {
 			tx = tx.Order("rand()")
diff --git a/pkg/scrape/wetvr.go b/pkg/scrape/wetvr.go
index 8af34c8c2..e6d864520 100644
--- a/pkg/scrape/wetvr.go
+++ b/pkg/scrape/wetvr.go
@@ -2,8 +2,6 @@ package scrape
 
 import (
 	"encoding/json"
-	"regexp"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -24,61 +22,62 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
 	sceneCollector := createCollector("wetvr.com")
 	siteCollector := createCollector("wetvr.com")
 
-	// RegEx Patterns
-	durationRegEx := regexp.MustCompile(`(?i)DURATION:\W(\d+)`)
-
-	sceneCollector.OnHTML(`div#t2019`, func(e *colly.HTMLElement) {
+	sceneCollector.OnHTML(`div#trailer_player`, func(e *colly.HTMLElement) {
 		sc := models.ScrapedScene{}
 		sc.ScraperID = scraperID
 		sc.SceneType = "VR"
 		sc.Studio = "WetVR"
 		sc.Site = siteID
 		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
-		sc.MembersUrl = strings.Replace(sc.HomepageURL, "https://wetvr.com/", "https://members.wetvr.com/", 1)
+		sc.MembersUrl = strings.Replace(sc.HomepageURL, "https://wetvr.com/", "https://wetvr.com/members/", 1)
 
 		// Scene ID - get from previous page
 		sc.SiteID = e.Request.Ctx.GetAny("scene-id").(string)
 		sc.SceneID = slugify.Slugify(sc.Site + "-" + sc.SiteID)
 
 		// Title
-		sc.Title = strings.TrimSpace(e.ChildText(`h1.t2019-stitle`))
+		sc.Title = strings.TrimSpace(e.ChildText(`div.scene-info h1`))
 
-		// Date
 		scenedate := e.Request.Ctx.GetAny("scene-date").(string)
 		if scenedate != "" {
-			tmpDate, _ := goment.New(scenedate, "MMMM DD, YYYY")
+			tmpDate, _ := goment.New(scenedate, "MM/DD/YYYY")
 			sc.Released = tmpDate.Format("YYYY-MM-DD")
 		}
 
-		// Duration
-		tmpDuration := durationRegEx.FindStringSubmatch(e.ChildText(`div#t2019-stime`))[1]
-		sc.Duration, _ = strconv.Atoi(tmpDuration)
-
 		// Cover URLs
-		coverSrc := e.ChildAttr(`div#t2019-video deo-video`, "cover-image")
+		coverSrc := e.ChildAttr(`div[id="player-wrapper"] deo-video`, "cover-image")
 		if coverSrc == "" {
-			coverSrc = e.ChildAttr(`div#t2019-video img#no-player-image`, "src")
+			// Without a player the cover is only present as an inline CSS
+			// background; guard the split so a missing style attribute
+			// leaves the cover empty instead of panicking on index 1.
+			style := e.ChildAttr(`div[id="no-player-wrapper"] div.bg-cover`, "style")
+			if parts := strings.Split(style, "background-image: url("); len(parts) > 1 {
+				coverSrc = strings.TrimPrefix(parts[1], "'")
+				coverSrc = strings.TrimSuffix(coverSrc, "')")
+			}
 		}
 		if coverSrc != "" {
 			sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(coverSrc))
 		}
 
 		// Gallery
-		e.ForEach(`div.t2019-thumbs img`, func(id int, e *colly.HTMLElement) {
+		e.ForEach(`div.items-center a[href="/join" ] img`, func(id int, e *colly.HTMLElement) {
 			if id > 0 {
 				sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("src")))
 			}
 		})
 
 		// Synopsis
-		sc.Synopsis = strings.TrimSpace(e.ChildText(`div#t2019-description`))
+		sc.Synopsis = strings.TrimSpace(e.ChildText(`div.items-start span`))
 
 		// trailer details
-		sc.TrailerType = "deovr"
-		sc.TrailerSrc = strings.Replace(sc.HomepageURL, "/video/", "/deovr/", 1)
+		sc.TrailerType = "scrape_html"
+		params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "deo-video source", ContentPath: "src", QualityPath: "quality"}
+		strParams, _ := json.Marshal(params)
+		sc.TrailerSrc = string(strParams)
 
 		// Cast
-		e.ForEach(`div#t2019-models a`, func(id int, e *colly.HTMLElement) {
+		e.ForEach(`a[href^="/models/"]`, func(id int, e *colly.HTMLElement) {
 			sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
 		})
 
@@ -86,17 +85,21 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
 		// no tags on this site
 
 		// Filenames
-		// NOTE: no way to guess filename
+		baseFilename := strings.TrimPrefix(sc.HomepageURL, "https://wetvr.com/video/")
+		sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-2700.mp4")
+		sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-2048.mp4")
+		sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-1600.mp4")
+		sc.Filenames = append(sc.Filenames, "wetvr-"+baseFilename+"-960.mp4")
 
 		out <- sc
 	})
 
-	siteCollector.OnHTML(`ul.pagination a.page-link`, func(e *colly.HTMLElement) {
+	siteCollector.OnHTML(`ul a.page-link`, func(e *colly.HTMLElement) {
 		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
 		siteCollector.Visit(pageURL)
 	})
 
-	siteCollector.OnHTML(`div.card`, func(e *colly.HTMLElement) {
+	siteCollector.OnHTML(`div:has(p:contains("Latest")) div[id^="r-"]`, func(e *colly.HTMLElement) {
 		sceneURL := e.Request.AbsoluteURL(e.ChildAttr("a", "href"))
 
 		// If scene exist in database, there's no need to scrape
@@ -104,9 +107,14 @@ func WetVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
 
 			// SceneID and release date are only available here on div.card
 			ctx := colly.NewContext()
-			ctx.Put("scene-id", e.Attr("data-video-id"))
-			ctx.Put("scene-date", e.Attr("data-date"))
-
+			ctx.Put("scene-id", strings.TrimPrefix(e.Attr("id"), "r-"))
+			// get the date if it exists
+			pDate := e.DOM.Find(`div.video-thumbnail-footer div>span`)
+			if pDate.Length() > 0 {
+				ctx.Put("scene-date", pDate.Text())
+			} else {
+				ctx.Put("scene-date", "")
+			}
 			sceneCollector.Request("GET", sceneURL, nil, ctx, nil)
 		}
 	})
diff --git a/ui/src/views/scenes/Filters.vue b/ui/src/views/scenes/Filters.vue
index 04b3e6538..a6738904b 100644
--- a/ui/src/views/scenes/Filters.vue
+++ b/ui/src/views/scenes/Filters.vue
@@ -56,6 +56,7 @@
+