From 9366e34b6f5680e0076e53f17a345ae438e64624 Mon Sep 17 00:00:00 2001
From: Bread White <32078281+breadrock1@users.noreply.github.com>
Date: Fri, 15 Nov 2024 11:49:17 +0300
Subject: [PATCH] Improve: moved topics config to db (#11)

* fix(compose): docker compose fixes
* chore(topics): implemented loading topics config and sources from db
* fix(rss): fixed parsing of rss item content both with and without crawler-llm
* chore(test): updated tests after all changes

---------

Co-authored-by: Bread White
---
 config/development.toml     | 15 ---------------
 config/production.toml      |  8 +-------
 src/bin/main.rs             |  7 ++-----
 src/config.rs               |  2 --
 src/feeds/rss_feeds/mod.rs  | 22 +++++++++++++++-------
 tests/test_publish_feeds.rs | 10 +++++++++-
 6 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/config/development.toml b/config/development.toml
index 206b038..3de4d9f 100644
--- a/config/development.toml
+++ b/config/development.toml
@@ -39,18 +39,3 @@ max_pool_size = 10
 [crawler.llm]
 api_key = "sk-no-key-required"
 base_url = "http://localhost:8081/v1"
-
-[topics.rss]
-max_retries = 3
-timeout = 100
-interval_secs = 3600
-source_name = "NDTV World News"
-target_url = "https://feeds.feedburner.com/ndtvnews-world-news"
-
-# Available rss news sources:
-#target_url = "https://feeds.skynews.com/feeds/rss/world.xml"
-#target_url = "https://media.rss.com/ukrainewatch/feed.xml"
-#target_url = "https://feeds.feedburner.com/pri/theworld"
-#target_url = "https://www.mnnonline.org/rss/countries/ukr.xml"
-#target_url = "https://www.wdiy.org/podcast/the-jennings-report/rss.xml"
-#target_url = "http://feeds.feedburner.com/NewsHourHeadlinesPodcast"
diff --git a/config/production.toml b/config/production.toml
index 82375a8..ad1eec0 100644
--- a/config/production.toml
+++ b/config/production.toml
@@ -11,6 +11,7 @@ expired_secs = 10368000
 address = "redis://redis:6379"
 username = "redis"
 password = "redis"
+expired_secs = 10368000
 
 [publish.rmq]
 address = "amqp://rabbitmq:5672"
@@ -38,10 +39,3 @@ max_pool_size = 10
 [crawler.llm]
 api_key = "sk-no-key-required"
 base_url = "http://llm:8081/v1"
-
-[topics.rss]
-max_retries = 3
-timeout = 100
-interval_secs = 3600
-source_name = "NDTV World News"
-target_url = "https://feeds.feedburner.com/ndtvnews-world-news"
diff --git a/src/bin/main.rs b/src/bin/main.rs
index 75caa97..c553463 100644
--- a/src/bin/main.rs
+++ b/src/bin/main.rs
@@ -43,10 +43,9 @@ async fn main() -> Result<(), anyhow::Error> {
     #[cfg(feature = "crawler-llm")]
     let crawler = build_llm_crawler(&config).await?;
 
-    let rss_config = config.topics().rss();
     let pgsql_config = config.storage().pgsql();
     let storage = PgsqlTopicStorage::connect(pgsql_config).await?;
-    let rss_config = load_topics_from_pgsql(&rss_config, &storage).await?;
+    let rss_config = load_topics_from_pgsql(&storage).await?;
     let pg_storage = Arc::new(storage);
 
     let rss_workers = rss_config
@@ -132,10 +131,9 @@ pub async fn build_llm_crawler(config: &ServiceConfig) -> Result
 }
 
 pub async fn load_topics_from_pgsql(
-    rss_config: &RssConfig,
     storage: &PgsqlTopicStorage,
 ) -> Result<Vec<RssConfig>, anyhow::Error> {
-    let mut topics = storage
+    let topics = storage
         .load_at_launch()
         .await
         .map_err(|err| {
@@ -148,7 +146,6 @@
         .map(|it: RssConfig| (it.target_url().to_owned(), it))
         .collect::<HashMap<String, RssConfig>>();
-    topics.insert(rss_config.target_url().to_owned(), rss_config.to_owned());
 
     let topics = topics.into_values().collect();
     Ok(topics)
 }
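The HashMap built in load_topics_from_pgsql is what deduplicates the feed sources loaded from the database: rows are keyed by target_url, so later rows with the same URL replace earlier ones. A minimal standalone sketch of that idea, using a simplified Topic type whose fields are hypothetical rather than the crate's real model:

    use std::collections::HashMap;

    #[derive(Clone, Debug)]
    struct Topic {
        target_url: String,
        source_name: String,
    }

    // Keep one Topic per target_url; when two rows share a URL, the
    // later one wins, exactly as HashMap insertion resolves duplicate keys.
    fn dedupe_by_url(rows: Vec<Topic>) -> Vec<Topic> {
        rows.into_iter()
            .map(|t| (t.target_url.clone(), t))
            .collect::<HashMap<_, _>>()
            .into_values()
            .collect()
    }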
diff --git a/src/config.rs b/src/config.rs
index 4644c57..6e42171 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -1,6 +1,5 @@
 use crate::cache::config::CacheConfig;
 use crate::crawler::config::CrawlerConfig;
-use crate::feeds::config::TopicsConfig;
 use crate::logger::LoggerConfig;
 use crate::publish::config::PublishConfig;
 use crate::server::config::ServerConfig;
@@ -20,7 +19,6 @@ pub struct ServiceConfig {
     server: ServerConfig,
     cache: CacheConfig,
     publish: PublishConfig,
-    topics: TopicsConfig,
     crawler: CrawlerConfig,
     storage: StorageConfig,
 }
diff --git a/src/feeds/rss_feeds/mod.rs b/src/feeds/rss_feeds/mod.rs
index 7dd5f2e..d21a6f1 100644
--- a/src/feeds/rss_feeds/mod.rs
+++ b/src/feeds/rss_feeds/mod.rs
@@ -149,7 +149,7 @@ where
     async fn extract_item(&self, item: &rss::Item) -> Result {
         let guid = item.guid().ok_or(anyhow::Error::msg("empty guid"))?;
         let title = item.title().ok_or(anyhow::Error::msg("empty title"))?;
-        let link = item.link().ok_or(anyhow::Error::msg("empty link"))?;
+        let link = item.link().unwrap_or(guid.value());
 
         let source = Url::parse(link)
             .map(|it| it.domain().map(|t| t.to_string()))
@@ -163,15 +163,12 @@ where
             Some(data) => self.clear_html_tags(data)?,
             None => {
                 #[allow(unused_variables)]
-                let data = description;
+                let data = description.to_string();
 
                 #[cfg(feature = "crawler-llm")]
-                let data = link;
+                let data = self.scrape(link).await?;
 
-                self.crawler()
-                    .scrape_by_url(data)
-                    .await
-                    .map_err(|err| anyhow::Error::msg(err.to_string()))?
+                data
             }
         };
 
@@ -204,4 +201,15 @@ where
         let result_text = regex.replace_all(content, "").to_string();
         Ok(result_text)
     }
+
+    #[cfg(feature = "crawler-llm")]
+    async fn scrape(&self, link: &str) -> Result<String, anyhow::Error> {
+        let result = self
+            .crawler()
+            .scrape_by_url(link)
+            .await
+            .map_err(|err| anyhow::Error::msg(err.to_string()))?;
+
+        Ok(result)
+    }
 }
diff --git a/tests/test_publish_feeds.rs b/tests/test_publish_feeds.rs
index 3b37406..4d4ef8f 100644
--- a/tests/test_publish_feeds.rs
+++ b/tests/test_publish_feeds.rs
@@ -3,6 +3,7 @@ mod tests_helper;
 
 use mocks::mock_rmq_publish::MockRabbitPublisher;
 use news_rss::config::ServiceConfig;
+use news_rss::feeds::rss_feeds::config::RssConfig;
 use news_rss::feeds::rss_feeds::RssFeeds;
 use news_rss::feeds::FetchTopic;
 use news_rss::server::RssWorker;
@@ -35,7 +36,14 @@ async fn test_rss_feeds() -> Result<(), anyhow::Error> {
     #[cfg(feature = "crawler-llm")]
     let crawler = tests_helper::build_llm_crawler(&config).await?;
 
-    let rss_config = vec![config.topics().rss()];
+    let rss_config = vec![RssConfig::builder()
+        .source_name("NDTV World News".to_owned())
+        .target_url("https://feeds.feedburner.com/ndtvnews-world-news".to_owned())
+        .max_retries(3)
+        .timeout(10)
+        .interval_secs(5)
+        .build()?];
+
     let _ = rss_config
         .into_iter()
         .filter_map(|it| RssFeeds::new(it, publish.clone(), cache.clone(), crawler.clone()).ok())
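A note on the extract_item change above: in the rss crate, Item::link() returns Option<&str>, so an item without a <link> element previously aborted extraction with an "empty link" error; the patch now falls back to the GUID value, which many feeds set to the article URL. The same fallback in isolation, written as an Option-returning helper (the helper name is ours, not part of the crate):

    use rss::Item;

    // Prefer the item's <link>; otherwise use its <guid> value.
    // Returns None only when the item carries neither field.
    fn resolve_link(item: &Item) -> Option<&str> {
        item.link().or_else(|| item.guid().map(|g| g.value()))
    }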