Skip to content

Commit

Permalink
feat: Add JAVDatabase actor scraping (xbapps#1548)
Browse files Browse the repository at this point in the history
* Added first version of generic scraping for javdatabase

* Added DOMNextText as a post-processing function

* Implemented JAVDatabase actor scraper.

* Added Native function for JAVDatabase to decide the gender.

* Fixed image scraping for JAVDatabase

* Removed ConstantValue post processing function

* Reverted some old local changes.

* Removed Native function call for JAVDatabase gender.

* Added post-processing functions 'SetWhenValueContains' and 'SetWhenValueNotContains'. Gender is now correctly written to the database depending on the tags of the actress.

* Fixed selector

* Made the selectors more restrictive.
  • Loading branch information
pl33x authored Jan 2, 2024
1 parent 65ea8c0 commit ce68185
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
39 changes: 39 additions & 0 deletions pkg/models/model_external_reference.go
Original file line number Diff line number Diff line change
Expand Up @@ -974,6 +974,45 @@ func (scrapeRules ActorScraperConfig) buildGenericActorScraperRules() {
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "nationality", Selector: `.about-me-mobile .stars-params-title:contains("Nationality:") + .stars-params-value`, PostProcessing: []PostProcessing{{Function: "Lookup Country"}}})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "hair_color", Selector: `.about-me-mobile .stars-params-title:contains("Hair Color:") + .stars-params-value`})
scrapeRules.GenericActorScrapingConfig["vrspy scrape"] = siteDetails

siteDetails = GenericScraperRuleSet{}
siteDetails.Domain = "www.javdatabase.com"
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "image_url", Selector: `img[data-src^="https://www.javdatabase.com/idolimages/full/"]`, ResultType: "attr", Attribute: "data-src", PostProcessing: []PostProcessing{
{Function: "AbsoluteUrl"},
}})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "images", Selector: `a[href^="https://pics.dmm.co.jp/digital/video/"]:not([href^="https://pics.dmm.co.jp/digital/video/mdj010/"])`, ResultType: "attr", Attribute: "href", PostProcessing: []PostProcessing{
{Function: "AbsoluteUrl"},
}})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "biography", Selector: `div[id="biography"] > div`, ResultType: "text"})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "hair_color", Selector: `div > b:contains("Hair Color(s):") + a`, ResultType: "text"})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "birth_date", Selector: `div > b:contains("DOB:") + a`, ResultType: "text"})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "height", Selector: `div > b:contains("Height:") + a`, ResultType: "text", PostProcessing: []PostProcessing{
{Function: "RegexString", Params: []string{`\d+`, "0"}},
}})

siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "band_size", Selector: `div > b:contains("Measurements:")`, ResultType: "text", PostProcessing: []PostProcessing{
{Function: "DOMNextText"},
{Function: "RegexString", Params: []string{`(\d+)-(\d+)-(\d+)`, "1"}},
}})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "cup_size", Selector: `div > b:contains("Cup:") + a`, ResultType: "text"})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "waist_size", Selector: `div > b:contains("Measurements:")`, ResultType: "text", PostProcessing: []PostProcessing{
{Function: "DOMNextText"},
{Function: "RegexString", Params: []string{`(\d+)-(\d+)-(\d+)`, "2"}},
}})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "hip_size", Selector: `div > b:contains("Measurements:")`, ResultType: "text", PostProcessing: []PostProcessing{
{Function: "DOMNextText"},
{Function: "RegexString", Params: []string{`(\d+)-(\d+)-(\d+)`, "3"}},
}})
siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "aliases", Selector: `div > p > b:contains("Alt:")`, ResultType: "text", PostProcessing: []PostProcessing{
{Function: "DOMNextText"},
}})

siteDetails.SiteRules = append(siteDetails.SiteRules, GenericActorScraperRule{XbvrField: "gender", Selector: `div > p:contains("Tags")`, ResultType: "text", PostProcessing: []PostProcessing{
{Function: "SetWhenValueContains", Params: []string{"Trans", "Transgender Female"}},
{Function: "SetWhenValueNotContains", Params: []string{"Trans", "Female"}},
}})

scrapeRules.GenericActorScrapingConfig["javdatabase scrape"] = siteDetails
}

// Loads custom rules from actor_scrapers_examples.json
Expand Down
37 changes: 37 additions & 0 deletions pkg/scrape/genericactorscraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/xbapps/xbvr/pkg/config"
"github.com/xbapps/xbvr/pkg/externalreference"
"github.com/xbapps/xbvr/pkg/models"
nethtml "golang.org/x/net/html"
)

type outputList struct {
Expand Down Expand Up @@ -525,13 +526,49 @@ func postProcessing(rule models.GenericActorScraperRule, value string, htmlEleme
value = getSubRuleResult(postprocessing.SubRule, htmlElement)
case "DOMNext":
value = strings.TrimSpace(htmlElement.DOM.Next().Text())
case "DOMNextText":
node := htmlElement.DOM.Get(0)
textNodeType := nethtml.TextNode
nextSibling := node.NextSibling

if nextSibling != nil && nextSibling.Type == textNodeType {
value = strings.TrimSpace(nextSibling.Data)
}
case "SetWhenValueContains":
searchValue := postprocessing.Params[0]
newValue := postprocessing.Params[1]

if strings.Contains(value, searchValue) {
value = newValue
}
case "SetWhenValueNotContains":
searchValue := postprocessing.Params[0]
newValue := postprocessing.Params[1]

if !strings.Contains(value, searchValue) {
value = newValue
}
case "UnescapeString":
value = html.UnescapeString(value)
}
}
return value
}

// substr returns the portion of s between rune positions start (inclusive)
// and end (exclusive). Positions count runes, not bytes, so multi-byte UTF-8
// characters are never split.
//
// Out-of-range arguments are handled as follows:
//   - a negative start is treated as 0;
//   - a negative end means "no upper bound" (the slice runs to the end of s);
//   - an end past the last rune is clamped to the end of s;
//   - a start at or past the last rune, or a non-negative end that is not
//     greater than start, yields the empty string.
func substr(s string, start, end int) string {
	if start < 0 {
		start = 0
	}
	// An empty or inverted window selects nothing. Without this guard the loop
	// would reach end before start and return a prefix of s instead.
	if end >= 0 && end <= start {
		return ""
	}

	// Default to len(s) so that a start beyond the last rune produces "" rather
	// than the whole string.
	startIdx := len(s)
	counter := 0
	for i := range s {
		if counter == start {
			startIdx = i
		}
		if counter == end {
			return s[startIdx:i]
		}
		counter++
	}
	return s[startIdx:]
}

func getCountryCode(countryName string) string {
switch strings.ToLower(countryName) {
case "united states", "american":
Expand Down

0 comments on commit ce68185

Please sign in to comment.