Skip to content

Commit

Permalink
Added VRHush scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
cld9x committed May 27, 2019
1 parent 866a7d1 commit 5bf9c8b
Show file tree
Hide file tree
Showing 2 changed files with 155 additions and 0 deletions.
152 changes: 152 additions & 0 deletions pkg/scrape/vrhush.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
package scrape

import (
"log"
"net/url"
"strings"

"github.com/gocolly/colly"
"github.com/mozillazg/go-slugify"
"github.com/nleeper/goment"
"github.com/thoas/go-funk"
)

// ScrapeVRHush crawls the VRHush scene listing and appends one ScrapedScene to
// out for every scene page whose URL is not already listed in knownScenes.
//
// Three collectors cooperate:
//   - siteCollector walks https://vrhush.com/scenes and its pagination links
//     and hands unknown scene URLs to sceneCollector;
//   - sceneCollector extracts the scene metadata from a scene page;
//   - castCollector visits each linked model page and adds the model's name to
//     the scene's cast when the page lists the gender as "Female".
//
// The returned error is the one reported by the initial listing request.
// Problems on individual pages (unparsable dates, malformed stream URLs,
// failed cast requests) are logged and the affected field is skipped, so a
// single bad page does not abort the crawl.
func ScrapeVRHush(knownScenes []string, out *[]ScrapedScene) error {
	siteCollector := colly.NewCollector(
		colly.AllowedDomains("vrhush.com"),
		colly.CacheDir(siteCacheDir),
		colly.UserAgent(userAgent),
	)

	sceneCollector := colly.NewCollector(
		colly.AllowedDomains("vrhush.com"),
		colly.CacheDir(sceneCacheDir),
		colly.UserAgent(userAgent),
	)

	// Model pages are shared between scenes, so revisiting must be allowed.
	castCollector := colly.NewCollector(
		colly.AllowedDomains("vrhush.com"),
		colly.CacheDir(sceneCacheDir),
		colly.UserAgent(userAgent),
		colly.AllowURLRevisit(),
	)

	logVisit := func(r *colly.Request) {
		log.Println("visiting", r.URL.String())
	}
	siteCollector.OnRequest(logVisit)
	sceneCollector.OnRequest(logVisit)
	castCollector.OnRequest(logVisit)

	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		sc := ScrapedScene{}
		sc.SceneType = "VR"
		sc.Studio = "VRHush"
		sc.Site = "VRHush"
		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]

		// Scene ID - taken from the last URL path segment, up to the first
		// underscore, with the "vrh" marker removed.
		tmp := strings.Split(sc.HomepageURL, "/")
		tmp2 := strings.Split(tmp[len(tmp)-1], "_")[0]
		sc.SiteID = strings.Replace(tmp2, "vrh", "", -1)
		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID

		// Title
		e.ForEach(`h1.latest-scene-title`, func(id int, e *colly.HTMLElement) {
			sc.Title = strings.TrimSpace(e.Text)
		})

		// Cover URLs
		e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
			sc.Covers = append(sc.Covers, e.Request.AbsoluteURL(e.Attr("poster")))
		})

		// Gallery
		e.ForEach(`div.owl-carousel img.img-responsive`, func(id int, e *colly.HTMLElement) {
			sc.Gallery = append(sc.Gallery, e.Request.AbsoluteURL(e.Attr("src")))
		})

		// Synopsis
		e.ForEach(`span.full-description`, func(id int, e *colly.HTMLElement) {
			sc.Synopsis = strings.TrimSpace(e.Text)
		})

		// Tags
		e.ForEach(`p.tag-container a.label-tag`, func(id int, e *colly.HTMLElement) {
			sc.Tags = append(sc.Tags, strings.TrimSpace(e.Text))
		})

		// Cast - collect the model page links; they are resolved against the
		// scene URL so that relative hrefs can be requested as well.
		var tmpCast []string
		e.ForEach(`h5.latest-scene-subtitle a`, func(id int, e *colly.HTMLElement) {
			castURL := e.Request.AbsoluteURL(e.Attr("href"))
			if castURL == "" {
				return
			}
			tmpCast = append(tmpCast, castURL)
		})

		// Date
		e.ForEach(`div.latest-scene-meta-1 div.text-left`, func(id int, e *colly.HTMLElement) {
			tmpDate, err := goment.New(e.Text, "MMM DD, YYYY")
			if err != nil {
				// Without this check a failed parse would dereference a nil
				// *Goment below.
				log.Printf("vrhush: cannot parse release date %q: %v", e.Text, err)
				return
			}
			sc.Released = tmpDate.Format("YYYY-MM-DD")
		})

		// Duration
		sc.Duration = 0

		// Filenames - read from the "name" query parameter of each stream URL.
		e.ForEach(`input.stream-input-box`, func(id int, e *colly.HTMLElement) {
			rawURL := e.Attr("value")
			origURL, err := url.Parse(rawURL)
			if err != nil {
				log.Printf("vrhush: cannot parse stream URL %q: %v", rawURL, err)
				return
			}
			sc.Filenames = append(sc.Filenames, origURL.Query().Get("name"))
		})

		ctx := colly.NewContext()
		ctx.Put("scene", &sc)

		// NOTE(review): the collectors are not created with colly.Async, so
		// these requests are expected to finish (and fill sc.Cast through the
		// pointer stored in ctx) before sc is copied into out - confirm.
		for _, castURL := range tmpCast {
			if err := castCollector.Request("GET", castURL, nil, ctx, nil); err != nil {
				log.Printf("vrhush: cannot fetch cast page %q: %v", castURL, err)
			}
		}

		*out = append(*out, sc)
	})

	castCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
		sc, ok := e.Request.Ctx.GetAny("scene").(*ScrapedScene)
		if !ok || sc == nil {
			// No scene attached to this request; nothing to update.
			return
		}

		var name string
		e.ForEach(`h1#model-name`, func(id int, e *colly.HTMLElement) {
			name = strings.TrimSpace(e.Text)
		})

		var gender string
		e.ForEach(`ul.model-attributes li`, func(id int, e *colly.HTMLElement) {
			parts := strings.Split(e.Text, " ")
			// Guard the length: an attribute reading just "Gender" has no
			// value part to index.
			if len(parts) > 1 && parts[0] == "Gender" {
				gender = parts[1]
			}
		})

		if gender == "Female" {
			sc.Cast = append(sc.Cast, name)
		}
	})

	siteCollector.OnHTML(`ul.pagination a`, func(e *colly.HTMLElement) {
		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
		// Error deliberately ignored: pagination links repeat on every page,
		// so "already visited" results are expected here.
		_ = siteCollector.Visit(pageURL)
	})

	siteCollector.OnHTML(`div.row div.col-md-4 p.desc a`, func(e *colly.HTMLElement) {
		sceneURL := e.Request.AbsoluteURL(e.Attr("href"))

		// If the scene exists in the database, there's no need to scrape it.
		if !funk.ContainsString(knownScenes, sceneURL) {
			// Error deliberately ignored: the same scene may be linked more
			// than once, which yields an "already visited" result.
			_ = sceneCollector.Visit(sceneURL)
		}
	})

	return siteCollector.Visit("https://vrhush.com/scenes")
}
3 changes: 3 additions & 0 deletions pkg/xbvr/task_content.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ func Scrape() {
tlog.Infof("Scraping VirtualRealPorn")
scrape.ScrapeVirtualRealPorn(knownScenes, &collectedScenes)

tlog.Infof("Scraping VRHush")
scrape.ScrapeVRHush(knownScenes, &collectedScenes)

if len(collectedScenes) > 0 {
tlog.Infof("Scraped %v new scenes", len(collectedScenes))

Expand Down

0 comments on commit 5bf9c8b

Please sign in to comment.