From 7596564bd1138ada4478559747d2b47fb2724a59 Mon Sep 17 00:00:00 2001
From: toshski <104477758+toshski@users.noreply.github.com>
Date: Wed, 3 Apr 2024 04:10:19 +1300
Subject: [PATCH] feat: Ability to Limit the Number of Scrapers Running
 Concurrently (#1664)

* Ability to Limit the Number of Scrapers Running Concurrently

* Remove debugging log message
---
 README.md            | 29 +++++++++++++++--------------
 pkg/common/common.go |  1 +
 pkg/common/paths.go  |  7 +++++++
 pkg/tasks/content.go | 21 ++++++++++++++++++++-
 4 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 706cf84da..c3b2b701e 100644
--- a/README.md
+++ b/README.md
@@ -136,17 +136,18 @@ left arrow - cycles backwards in gallery
 right arrow - cycles forward in gallery
 esc - closes details pane
 
-#### using Command Line Arguments
-| Command line parameter | Type | Description |
-|------------------------|-------|--------------
-| `--enableLocalStorage` | boolean |Use local folder to store application data|
-| `--app_dir` | String|path to the application directory|
-| `--cache_dir` | String|path to the tempoarary scraper cache directory|
-| `--imgproxy_dir` | String|path to the imageproxy directory|
-| `--search_dir` | String| path to the Search Index directory|
-| `--preview_dir` | String| path to the Scraper Cache directory|
-| `--scriptsheatmap_dir` | String| path to the scripts_heatmap directory|
-| `--myfiles_dir` | String| path to the myfiles directory for serving users own content (eg images|
-| `--databaseurl` | String|override default database path|
-| `--web_port` | Int| override default Web Page port 9999|
-| `--ws_addr` | String| override default Websocket address from the default 0.0.0.0:9998|
+#### using Command Line Arguments/Environment Variables
+| Command line parameter | Environment Variable | Type | Description |
+|------------------------|----------------------|------|-------------|
+| `--enableLocalStorage` | | Boolean | use a local folder to store application data |
+| `--app_dir` | XBVR_APPDIR | String | path to the application directory |
+| `--cache_dir` | XBVR_CACHEDIR | String | path to the temporary scraper cache directory |
+| `--imgproxy_dir` | XBVR_IMAGEPROXYDIR | String | path to the imageproxy directory |
+| `--search_dir` | XBVR_SEARCHDIR | String | path to the search index directory |
+| `--preview_dir` | XBVR_VIDEOPREVIEWDIR | String | path to the video preview directory |
+| `--scriptsheatmap_dir` | XBVR_SCRIPTHEATMAPDIR | String | path to the scripts_heatmap directory |
+| `--myfiles_dir` | XBVR_MYFILESDIR | String | path to the myfiles directory for serving users' own content (e.g. images) |
+| `--databaseurl` | DATABASE_URL | String | override the default database path |
+| `--web_port` | XBVR_WEB_PORT | Int | override the default web page port (9999) |
+| `--ws_addr` | XBVR_WS_ADDR | String | override the default websocket address (0.0.0.0:9998) |
+| `--concurrent_scrapers` | CONCURRENT_SCRAPERS | Int | limit the number of scrapers that run concurrently (default 9999) |
diff --git a/pkg/common/common.go b/pkg/common/common.go
index 693b91486..6fc1d2b1b 100644
--- a/pkg/common/common.go
+++ b/pkg/common/common.go
@@ -22,6 +22,7 @@ type EnvConfigSpec struct {
 	WsAddr               string `envconfig:"XBVR_WS_ADDR" required:"false" default:""`
 	WebPort              int    `envconfig:"XBVR_WEB_PORT" required:"false" default:"0"`
 	DBConnectionPoolSize int    `envconfig:"DB_CONNECTION_POOL_SIZE" required:"false" default:"0"`
+	ConcurrentScrapers   int    `envconfig:"CONCURRENT_SCRAPERS" required:"false" default:"9999"`
 }
 
 var EnvConfig EnvConfigSpec
diff --git a/pkg/common/paths.go b/pkg/common/paths.go
index 7e5b258b0..c3bd1145e 100644
--- a/pkg/common/paths.go
+++ b/pkg/common/paths.go
@@ -24,6 +24,7 @@ var MyFilesDir string
 var DownloadDir string
 var WebPort int
 var DBConnectionPoolSize int
+var ConcurrentScrapers int
 
 func DirSize(path string) (int64, error) {
 	var size int64
@@ -53,6 +54,7 @@ func InitPaths() {
 	web_port := flag.Int("web_port", 0, "Optional: override default Web Page port 9999")
 	ws_addr := flag.String("ws_addr", "", "Optional: override default Websocket address from the default 0.0.0.0:9998")
 	db_connection_pool_size := flag.Int("db_connection_pool_size", 0, "Optional: sets a limit to the number of db connections while scraping")
+	concurrent_scrapers := flag.Int("concurrent_scrapers", 0, "Optional: sets a limit to the number of concurrent scrapers")
 
 	flag.Parse()
 
@@ -120,6 +122,11 @@ func InitPaths() {
 	} else {
 		DBConnectionPoolSize = EnvConfig.DBConnectionPoolSize
 	}
+	if *concurrent_scrapers != 0 {
+		ConcurrentScrapers = *concurrent_scrapers
+	} else {
+		ConcurrentScrapers = EnvConfig.ConcurrentScrapers
+	}
 
 	_ = os.MkdirAll(AppDir, os.ModePerm)
 	_ = os.MkdirAll(ImgDir, os.ModePerm)
diff --git a/pkg/tasks/content.go b/pkg/tasks/content.go
index 0355a7d92..59d358d2f 100644
--- a/pkg/tasks/content.go
+++ b/pkg/tasks/content.go
@@ -117,12 +117,31 @@ func runScrapers(knownScenes []string, toScrape string, updateSite bool, collect
 
 	var wg sync.WaitGroup
 
+	sitecnt := 1
+	concurrent_scrapers := common.ConcurrentScrapers
+	if concurrent_scrapers == 0 {
+		concurrent_scrapers = 99999 // 0 means no limit was configured, so make the batch effectively unbounded
+	}
 	if len(sites) > 0 {
 		for _, site := range sites {
 			for _, scraper := range scrapers {
 				if site.ID == scraper.ID {
 					wg.Add(1)
-					go scraper.Scrape(&wg, updateSite, knownScenes, collectedScenes, singleSceneURL, singeScrapeAdditionalInfo, site.LimitScraping)
+					go func(scraper models.Scraper, limitScraping bool) { // loop values are passed as parameters so the goroutine does not race on the loop variables
+						scraper.Scrape(&wg, updateSite, knownScenes, collectedScenes, singleSceneURL, singeScrapeAdditionalInfo, limitScraping)
+						var site models.Site
+						err := site.GetIfExist(scraper.ID)
+						if err != nil {
+							log.Error(err)
+							return
+						}
+						site.Save()
+					}(scraper, site.LimitScraping)
+
+					if sitecnt%concurrent_scrapers == 0 { // wait for the current batch of scrapers to finish before starting the next
+						wg.Wait()
+					}
+					sitecnt++
 				}
 			}
 		}
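
Note on the throttling pattern: the loop above launches scrapers in batches of
concurrent_scrapers and calls wg.Wait() once per batch, so a new batch only
starts after the previous one has fully drained. A buffered-channel semaphore
is a common alternative that keeps up to N scrapers in flight at all times
instead of draining between batches. A minimal, self-contained sketch of that
pattern follows; every name in it is illustrative and not part of this
codebase.

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const concurrentScrapers = 3 // cap on simultaneously running workers

	var wg sync.WaitGroup
	sem := make(chan struct{}, concurrentScrapers) // counting semaphore

	for i := 0; i < 10; i++ {
		sem <- struct{}{} // blocks once concurrentScrapers workers are in flight
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			defer func() { <-sem }() // free the slot when this worker finishes
			fmt.Printf("scraper %d running\n", id)
			time.Sleep(100 * time.Millisecond) // stand-in for real scraping work
		}(i)
	}
	wg.Wait() // wait for the workers still in flight
}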