From 9366e34b6f5680e0076e53f17a345ae438e64624 Mon Sep 17 00:00:00 2001
From: Bread White <32078281+breadrock1@users.noreply.github.com>
Date: Fri, 15 Nov 2024 11:49:17 +0300
Subject: [PATCH] Improve: moved topics config to db (#11)

* fix(compose): docker compose fixes
* chore(topics): implemented loading topics config and sources from db
* fix(rss): fixed parsing of rss item content both with and without crawler-llm
* chore(test): updated tests after all changes

---------

Co-authored-by: Bread White
---
 config/development.toml     | 15 ---------------
 config/production.toml      |  8 +-------
 src/bin/main.rs             |  7 ++-----
 src/config.rs               |  2 --
 src/feeds/rss_feeds/mod.rs  | 22 +++++++++++++++-------
 tests/test_publish_feeds.rs | 10 +++++++++-
 6 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/config/development.toml b/config/development.toml
index 206b038..3de4d9f 100644
--- a/config/development.toml
+++ b/config/development.toml
@@ -39,18 +39,3 @@ max_pool_size = 10
 [crawler.llm]
 api_key = "sk-no-key-required"
 base_url = "http://localhost:8081/v1"
-
-[topics.rss]
-max_retries = 3
-timeout = 100
-interval_secs = 3600
-source_name = "NDTV World News"
-target_url = "https://feeds.feedburner.com/ndtvnews-world-news"
-
-# Available rss news sources:
-#target_url = "https://feeds.skynews.com/feeds/rss/world.xml"
-#target_url = "https://media.rss.com/ukrainewatch/feed.xml"
-#target_url = "https://feeds.feedburner.com/pri/theworld"
-#target_url = "https://www.mnnonline.org/rss/countries/ukr.xml"
-#target_url = "https://www.wdiy.org/podcast/the-jennings-report/rss.xml"
-#target_url = "http://feeds.feedburner.com/NewsHourHeadlinesPodcast"
diff --git a/config/production.toml b/config/production.toml
index 82375a8..ad1eec0 100644
--- a/config/production.toml
+++ b/config/production.toml
@@ -11,6 +11,7 @@ expired_secs = 10368000
 address = "redis://redis:6379"
 username = "redis"
 password = "redis"
+expired_secs = 10368000
 
 [publish.rmq]
 address = "amqp://rabbitmq:5672"
@@ -38,10 +39,3 @@ max_pool_size = 10
 [crawler.llm]
 api_key = "sk-no-key-required"
 base_url = "http://llm:8081/v1"
-
-[topics.rss]
-max_retries = 3
-timeout = 100
-interval_secs = 3600
-source_name = "NDTV World News"
-target_url = "https://feeds.feedburner.com/ndtvnews-world-news"
diff --git a/src/bin/main.rs b/src/bin/main.rs
index 75caa97..c553463 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -43,10 +43,9 @@ async fn main() -> Result<(), anyhow::Error> {
     #[cfg(feature = "crawler-llm")]
     let crawler = build_llm_crawler(&config).await?;
 
-    let rss_config = config.topics().rss();
     let pgsql_config = config.storage().pgsql();
     let storage = PgsqlTopicStorage::connect(pgsql_config).await?;
-    let rss_config = load_topics_from_pgsql(&rss_config, &storage).await?;
+    let rss_config = load_topics_from_pgsql(&storage).await?;
     let pg_storage = Arc::new(storage);
 
     let rss_workers = rss_config
@@ -132,10 +131,9 @@ pub async fn build_llm_crawler(config: &ServiceConfig) -> Result
 }
 
 pub async fn load_topics_from_pgsql(
-    rss_config: &RssConfig,
     storage: &PgsqlTopicStorage,
 ) -> Result<Vec<RssConfig>, anyhow::Error> {
-    let mut topics = storage
+    let topics = storage
         .load_at_launch()
         .await
         .map_err(|err| {
@@ -148,7 +146,6 @@
         .map(|it: RssConfig| (it.target_url().to_owned(), it))
         .collect::<HashMap<String, RssConfig>>();
-    topics.insert(rss_config.target_url().to_owned(), rss_config.to_owned());
 
     let topics = topics.into_values().collect();
     Ok(topics)
 }
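The HashMap built in load_topics_from_pgsql is what deduplicates the feed sources loaded from the database: rows are keyed by target_url, so later rows with the same URL replace earlier ones. A minimal standalone sketch of that idea, using a simplified Topic type whose fields are hypothetical rather than the crate's real model:

    use std::collections::HashMap;

    #[derive(Clone, Debug)]
    struct Topic {
        target_url: String,
        source_name: String,
    }

    // Keep one Topic per target_url; when two rows share a URL, the
    // later one wins, exactly as HashMap insertion resolves duplicate keys.
    fn dedupe_by_url(rows: Vec<Topic>) -> Vec<Topic> {
        rows.into_iter()
            .map(|t| (t.target_url.clone(), t))
            .collect::<HashMap<_, _>>()
            .into_values()
            .collect()
    }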
diff --git a/src/config.rs b/src/config.rs
index 4644c57..6e42171 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -1,6 +1,5 @@
 use crate::cache::config::CacheConfig;
 use crate::crawler::config::CrawlerConfig;
-use crate::feeds::config::TopicsConfig;
 use crate::logger::LoggerConfig;
 use crate::publish::config::PublishConfig;
 use crate::server::config::ServerConfig;
@@ -20,7 +19,6 @@ pub struct ServiceConfig {
     server: ServerConfig,
     cache: CacheConfig,
     publish: PublishConfig,
-    topics: TopicsConfig,
     crawler: CrawlerConfig,
     storage: StorageConfig,
 }
diff --git a/src/feeds/rss_feeds/mod.rs b/src/feeds/rss_feeds/mod.rs
index 7dd5f2e..d21a6f1 100644
--- a/src/feeds/rss_feeds/mod.rs
+++ b/src/feeds/rss_feeds/mod.rs
@@ -149,7 +149,7 @@ where
     async fn extract_item(&self, item: &rss::Item) -> Result {
         let guid = item.guid().ok_or(anyhow::Error::msg("empty guid"))?;
         let title = item.title().ok_or(anyhow::Error::msg("empty title"))?;
-        let link = item.link().ok_or(anyhow::Error::msg("empty link"))?;
+        let link = item.link().unwrap_or(guid.value());
 
         let source = Url::parse(link)
             .map(|it| it.domain().map(|t| t.to_string()))
@@ -163,15 +163,12 @@ where
             Some(data) => self.clear_html_tags(data)?,
             None => {
                 #[allow(unused_variables)]
-                let data = description;
+                let data = description.to_string();
 
                 #[cfg(feature = "crawler-llm")]
-                let data = link;
+                let data = self.scrape(link).await?;
 
-                self.crawler()
-                    .scrape_by_url(data)
-                    .await
-                    .map_err(|err| anyhow::Error::msg(err.to_string()))?
+                data
             }
         };
 
@@ -204,4 +201,15 @@ where
         let result_text = regex.replace_all(content, "").to_string();
         Ok(result_text)
     }
+
+    #[cfg(feature = "crawler-llm")]
+    async fn scrape(&self, link: &str) -> Result<String, anyhow::Error> {
+        let result = self
+            .crawler()
+            .scrape_by_url(link)
+            .await
+            .map_err(|err| anyhow::Error::msg(err.to_string()))?;
+
+        Ok(result)
+    }
 }
diff --git a/tests/test_publish_feeds.rs b/tests/test_publish_feeds.rs
index 3b37406..4d4ef8f 100644
--- a/tests/test_publish_feeds.rs
+++ b/tests/test_publish_feeds.rs
@@ -3,6 +3,7 @@ mod tests_helper;
 
 use mocks::mock_rmq_publish::MockRabbitPublisher;
 use news_rss::config::ServiceConfig;
+use news_rss::feeds::rss_feeds::config::RssConfig;
 use news_rss::feeds::rss_feeds::RssFeeds;
 use news_rss::feeds::FetchTopic;
 use news_rss::server::RssWorker;
@@ -35,7 +36,14 @@ async fn test_rss_feeds() -> Result<(), anyhow::Error> {
     #[cfg(feature = "crawler-llm")]
     let crawler = tests_helper::build_llm_crawler(&config).await?;
 
-    let rss_config = vec![config.topics().rss()];
+    let rss_config = vec![RssConfig::builder()
+        .source_name("NDTV World News".to_owned())
+        .target_url("https://feeds.feedburner.com/ndtvnews-world-news".to_owned())
+        .max_retries(3)
+        .timeout(10)
+        .interval_secs(5)
+        .build()?];
+
     let _ = rss_config
         .into_iter()
         .filter_map(|it| RssFeeds::new(it, publish.clone(), cache.clone(), crawler.clone()).ok())
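A note on the extract_item change above: in the rss crate, Item::link() returns Option<&str>, so an item without a <link> element previously aborted extraction with an "empty link" error; the patch now falls back to the GUID value, which many feeds set to the article URL. The same fallback in isolation, written as an Option-returning helper (the helper name is ours, not part of the crate):

    use rss::Item;

    // Prefer the item's <link>; otherwise use its <guid> value.
    // Returns None only when the item carries neither field.
    fn resolve_link(item: &Item) -> Option<&str> {
        item.link().or_else(|| item.guid().map(|g| g.value()))
    }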