feat: Ability to Limit the Number of Scrapers Running Concurrently (#1664)

* Ability to Limit the Number of Scrapers Running Concurrently

* Remove debugging log message
toshski authored Apr 2, 2024
1 parent a21431a commit 7596564
Showing 4 changed files with 43 additions and 15 deletions.
29 changes: 15 additions & 14 deletions README.md
@@ -136,17 +136,18 @@ left arrow - cycles backwards in gallery
 right arrow - cycles forward in gallery
 esc - closes details pane

-#### using Command Line Arguments
-| Command line parameter | Type    | Description |
-|------------------------|---------|-------------|
-| `--enableLocalStorage` | boolean | Use local folder to store application data |
-| `--app_dir`            | String  | path to the application directory |
-| `--cache_dir`          | String  | path to the temporary scraper cache directory |
-| `--imgproxy_dir`       | String  | path to the imageproxy directory |
-| `--search_dir`         | String  | path to the Search Index directory |
-| `--preview_dir`        | String  | path to the Scraper Cache directory |
-| `--scriptsheatmap_dir` | String  | path to the scripts_heatmap directory |
-| `--myfiles_dir`        | String  | path to the myfiles directory for serving users' own content (e.g. images) |
-| `--databaseurl`        | String  | override default database path |
-| `--web_port`           | Int     | override default Web Page port 9999 |
-| `--ws_addr`            | String  | override default Websocket address from the default 0.0.0.0:9998 |
+#### using Command Line Arguments/Environment Variables
+| Command line parameter  | Environment Variable  | Type    | Description |
+|--------------------------|-----------------------|---------|-------------|
+| `--enableLocalStorage`   |                       | boolean | Use local folder to store application data |
+| `--app_dir`              | XBVR_APPDIR           | String  | path to the application directory |
+| `--cache_dir`            | XBVR_CACHEDIR         | String  | path to the temporary scraper cache directory |
+| `--imgproxy_dir`         | XBVR_IMAGEPROXYDIR    | String  | path to the imageproxy directory |
+| `--search_dir`           | XBVR_SEARCHDIR        | String  | path to the Search Index directory |
+| `--preview_dir`          | XBVR_VIDEOPREVIEWDIR  | String  | path to the Scraper Cache directory |
+| `--scriptsheatmap_dir`   | XBVR_SCRIPTHEATMAPDIR | String  | path to the scripts_heatmap directory |
+| `--myfiles_dir`          | XBVR_MYFILESDIR       | String  | path to the myfiles directory for serving users' own content (e.g. images) |
+| `--databaseurl`          | DATABASE_URL          | String  | override default database path |
+| `--web_port`             | XBVR_WEB_PORT         | Int     | override default Web Page port 9999 |
+| `--ws_addr`              | XBVR_WS_ADDR          | String  | override default Websocket address from the default 0.0.0.0:9998 |
+| `--concurrent_scrapers`  | CONCURRENT_SCRAPERS   | Int     | set the number of scrapers that run concurrently (default 9999) |
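
For example, to allow only two scrapers to run at once, you could start the server with `--concurrent_scrapers 2`, or export `CONCURRENT_SCRAPERS=2` in the environment before launching. (The invocation `./xbvr --concurrent_scrapers 2` assumes the binary is named `xbvr`; adjust for your install.)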
1 change: 1 addition & 0 deletions pkg/common/common.go
@@ -22,6 +22,7 @@ type EnvConfigSpec struct {
 	WsAddr               string `envconfig:"XBVR_WS_ADDR" required:"false" default:""`
 	WebPort              int    `envconfig:"XBVR_WEB_PORT" required:"false" default:"0"`
 	DBConnectionPoolSize int    `envconfig:"DB_CONNECTION_POOL_SIZE" required:"false" default:"0"`
+	ConcurrentScrapers   int    `envconfig:"CONCURRENT_SCRAPERS" required:"false" default:"9999"`
 }
 
 var EnvConfig EnvConfigSpec
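The `envconfig:"..."` struct tags above follow the convention of the github.com/kelseyhightower/envconfig package. Assuming that is the library wired up elsewhere in the codebase, a minimal sketch of how a field such as `ConcurrentScrapers` gets populated from the environment:

```go
package main

import (
	"fmt"
	"log"

	"github.com/kelseyhightower/envconfig"
)

// envSpec mirrors the relevant field of EnvConfigSpec above.
type envSpec struct {
	ConcurrentScrapers int `envconfig:"CONCURRENT_SCRAPERS" required:"false" default:"9999"`
}

func main() {
	var cfg envSpec
	// Process reads CONCURRENT_SCRAPERS from the environment and
	// falls back to the default tag (9999) when it is unset.
	if err := envconfig.Process("", &cfg); err != nil {
		log.Fatal(err)
	}
	fmt.Println("concurrent scrapers:", cfg.ConcurrentScrapers)
}
```

With `CONCURRENT_SCRAPERS` unset, the `default:"9999"` tag applies, which matches the default documented in the README table.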
7 changes: 7 additions & 0 deletions pkg/common/paths.go
@@ -24,6 +24,7 @@ var MyFilesDir string
 var DownloadDir string
 var WebPort int
 var DBConnectionPoolSize int
+var ConcurrentScrapers int

 func DirSize(path string) (int64, error) {
 	var size int64
@@ -53,6 +54,7 @@ func InitPaths() {
 	web_port := flag.Int("web_port", 0, "Optional: override default Web Page port 9999")
 	ws_addr := flag.String("ws_addr", "", "Optional: override default Websocket address from the default 0.0.0.0:9998")
 	db_connection_pool_size := flag.Int("db_connection_pool_size", 0, "Optional: sets a limit to the number of db connections while scraping")
+	concurrentScrapers := flag.Int("concurrent_scrapers", 0, "Optional: sets a limit to the number of concurrent scrapers")

 	flag.Parse()

@@ -120,6 +122,11 @@ func InitPaths() {
 	} else {
 		DBConnectionPoolSize = EnvConfig.DBConnectionPoolSize
 	}
+	// The command-line flag takes precedence; otherwise use the environment-derived value.
+	if *concurrentScrapers != 0 {
+		ConcurrentScrapers = *concurrentScrapers
+	} else {
+		ConcurrentScrapers = EnvConfig.ConcurrentScrapers
+	}

 	_ = os.MkdirAll(AppDir, os.ModePerm)
 	_ = os.MkdirAll(ImgDir, os.ModePerm)
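The assignments above implement a simple precedence rule: a non-zero command-line flag beats the environment-derived value, which in turn beats the built-in default. A condensed, hypothetical Go illustration of the same rule, independent of the XBVR codebase:

```go
package main

import (
	"flag"
	"fmt"
	"os"
	"strconv"
)

func main() {
	// Hypothetical stand-ins for EnvConfig and the flag defined in InitPaths.
	concurrentScrapers := flag.Int("concurrent_scrapers", 0, "limit on concurrent scrapers")
	flag.Parse()

	limit := 9999 // built-in default, mirroring the envconfig default tag
	if v, err := strconv.Atoi(os.Getenv("CONCURRENT_SCRAPERS")); err == nil && v != 0 {
		limit = v // environment variable overrides the default
	}
	if *concurrentScrapers != 0 {
		limit = *concurrentScrapers // the flag overrides everything
	}
	fmt.Println("effective limit:", limit)
}
```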
21 changes: 20 additions & 1 deletion pkg/tasks/content.go
@@ -117,12 +117,31 @@ func runScrapers(knownScenes []string, toScrape string, updateSite bool, collect

 	var wg sync.WaitGroup
 
+	sitecnt := 1
+	concurrent_scrapers := common.ConcurrentScrapers
+	if concurrent_scrapers == 0 {
+		// 0 means no limit was configured; fall back to an effectively unbounded batch size.
+		concurrent_scrapers = 99999
+	}
 	if len(sites) > 0 {
 		for _, site := range sites {
 			for _, scraper := range scrapers {
 				if site.ID == scraper.ID {
 					wg.Add(1)
-					go scraper.Scrape(&wg, updateSite, knownScenes, collectedScenes, singleSceneURL, singeScrapeAdditionalInfo, site.LimitScraping)
+					go func(scraper models.Scraper) {
+						scraper.Scrape(&wg, updateSite, knownScenes, collectedScenes, singleSceneURL, singeScrapeAdditionalInfo, site.LimitScraping)
+						// Reload the site record by ID (shadowing the loop variable)
+						// so the save below works from fresh database state.
+						var site models.Site
+						err := site.GetIfExist(scraper.ID)
+						if err != nil {
+							log.Error(err)
+							return
+						}
+						site.Save()
+					}(scraper)
+
+					if sitecnt%concurrent_scrapers == 0 { // wait for the current batch of scrapers before launching more
+						wg.Wait()
+					}
+					sitecnt++
 				}
 			}
 		}
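The commit throttles by launching `concurrent_scrapers` goroutines and then calling `wg.Wait()` for the whole batch, so one slow site can leave the rest of its batch idle. A common alternative is a buffered-channel semaphore, which keeps exactly N scrapers busy at all times. This is a generic, self-contained sketch with illustrative names, not code from this repository:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	sites := []string{"site-a", "site-b", "site-c", "site-d", "site-e"}
	concurrent := 2 // cap on simultaneously running scrapers

	var wg sync.WaitGroup
	sem := make(chan struct{}, concurrent) // buffered channel used as a counting semaphore

	for _, site := range sites {
		wg.Add(1)
		sem <- struct{}{} // blocks once `concurrent` slots are taken
		go func(site string) {
			defer wg.Done()
			defer func() { <-sem }() // free the slot when this scraper finishes
			fmt.Println("scraping", site)
		}(site) // pass the loop variable explicitly
	}
	wg.Wait()
}
```

Passing `site` into the goroutine as a parameter also avoids capturing the loop variable, which before Go 1.22 could race when goroutines outlive the iteration that spawned them.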
