Merge branch 'merge/merge-local-changes' into 'master'
Merged: merged local changes

See merge request data-lake/news-rss!7
Bread White committed Nov 15, 2024
2 parents b9fd80f + d3752b7 commit cd3e516
Showing 6 changed files with 27 additions and 37 deletions.
15 changes: 0 additions & 15 deletions config/development.toml
@@ -39,18 +39,3 @@ max_pool_size = 10
 [crawler.llm]
 api_key = "sk-no-key-required"
 base_url = "http://localhost:8081/v1"
-
-[topics.rss]
-max_retries = 3
-timeout = 100
-interval_secs = 3600
-source_name = "NDTV World News"
-target_url = "https://feeds.feedburner.com/ndtvnews-world-news"
-
-# Available rss news sources:
-#target_url = "https://feeds.skynews.com/feeds/rss/world.xml"
-#target_url = "https://media.rss.com/ukrainewatch/feed.xml"
-#target_url = "https://feeds.feedburner.com/pri/theworld"
-#target_url = "https://www.mnnonline.org/rss/countries/ukr.xml"
-#target_url = "https://www.wdiy.org/podcast/the-jennings-report/rss.xml"
-#target_url = "http://feeds.feedburner.com/NewsHourHeadlinesPodcast"
8 changes: 1 addition & 7 deletions config/production.toml
@@ -11,6 +11,7 @@ expired_secs = 10368000
 address = "redis://redis:6379"
 username = "redis"
 password = "redis"
+expired_secs = 10368000
 
 [publish.rmq]
 address = "amqp://rabbitmq:5672"
@@ -38,10 +39,3 @@ max_pool_size = 10
 [crawler.llm]
 api_key = "sk-no-key-required"
 base_url = "http://llm:8081/v1"
-
-[topics.rss]
-max_retries = 3
-timeout = 100
-interval_secs = 3600
-source_name = "NDTV World News"
-target_url = "https://feeds.feedburner.com/ndtvnews-world-news"
7 changes: 2 additions & 5 deletions src/bin/main.rs
@@ -43,10 +43,9 @@ async fn main() -> Result<(), anyhow::Error> {
     #[cfg(feature = "crawler-llm")]
     let crawler = build_llm_crawler(&config).await?;
 
-    let rss_config = config.topics().rss();
     let pgsql_config = config.storage().pgsql();
     let storage = PgsqlTopicStorage::connect(pgsql_config).await?;
-    let rss_config = load_topics_from_pgsql(&rss_config, &storage).await?;
+    let rss_config = load_topics_from_pgsql(&storage).await?;
     let pg_storage = Arc::new(storage);
 
     let rss_workers = rss_config
@@ -132,10 +131,9 @@ pub async fn build_llm_crawler(config: &ServiceConfig) -> Result<Arc<LlmCrawler>
 }
 
 pub async fn load_topics_from_pgsql(
-    rss_config: &RssConfig,
     storage: &PgsqlTopicStorage,
 ) -> Result<Vec<RssConfig>, anyhow::Error> {
-    let mut topics = storage
+    let topics = storage
         .load_at_launch()
         .await
         .map_err(|err| {
@@ -148,7 +146,6 @@ pub async fn load_topics_from_pgsql(
         .map(|it: RssConfig| (it.target_url().to_owned(), it))
         .collect::<HashMap<String, RssConfig>>();
 
-    topics.insert(rss_config.target_url().to_owned(), rss_config.to_owned());
    let topics = topics.into_values().collect();
    Ok(topics)
}
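
Net result of the two hunks above, shown as a sketch: load_topics_from_pgsql no longer takes a seed RssConfig and no longer merges the locally configured topic; it returns whatever is in PostgreSQL, deduplicated by target URL. The logging inside map_err and the iterator bridge are collapsed in the diff, so they are reduced or assumed here.

// Sketch of the post-merge function; see hedges above.
pub async fn load_topics_from_pgsql(
    storage: &PgsqlTopicStorage,
) -> Result<Vec<RssConfig>, anyhow::Error> {
    let topics = storage
        .load_at_launch()
        .await
        .map_err(|err| anyhow::Error::msg(err.to_string()))? // collapsed logging elided
        .into_iter() // assumed bridge; this line is collapsed in the diff
        .map(|it: RssConfig| (it.target_url().to_owned(), it))
        .collect::<HashMap<String, RssConfig>>();

    let topics = topics.into_values().collect();
    Ok(topics)
}

Keying the intermediate HashMap by target_url means duplicate feed URLs in the table collapse to a single worker.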
2 changes: 0 additions & 2 deletions src/config.rs
@@ -1,6 +1,5 @@
 use crate::cache::config::CacheConfig;
 use crate::crawler::config::CrawlerConfig;
-use crate::feeds::config::TopicsConfig;
 use crate::logger::LoggerConfig;
 use crate::publish::config::PublishConfig;
 use crate::server::config::ServerConfig;
@@ -20,7 +19,6 @@ pub struct ServiceConfig {
     server: ServerConfig,
     cache: CacheConfig,
     publish: PublishConfig,
-    topics: TopicsConfig,
     crawler: CrawlerConfig,
     storage: StorageConfig,
 }
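
Assembled from the hunks above, the struct now reads (derive attributes collapsed in the diff are omitted):

pub struct ServiceConfig {
    server: ServerConfig,
    cache: CacheConfig,
    publish: PublishConfig,
    crawler: CrawlerConfig,
    storage: StorageConfig,
}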
22 changes: 15 additions & 7 deletions src/feeds/rss_feeds/mod.rs
@@ -149,7 +149,7 @@ where
     async fn extract_item(&self, item: &rss::Item) -> Result<RssResponse, anyhow::Error> {
         let guid = item.guid().ok_or(anyhow::Error::msg("empty guid"))?;
         let title = item.title().ok_or(anyhow::Error::msg("empty title"))?;
-        let link = item.link().ok_or(anyhow::Error::msg("empty link"))?;
+        let link = item.link().unwrap_or(guid.value());
 
         let source = Url::parse(link)
             .map(|it| it.domain().map(|t| t.to_string()))
@@ -163,15 +163,12 @@ where
             Some(data) => self.clear_html_tags(data)?,
             None => {
                 #[allow(unused_variables)]
-                let data = description;
+                let data = description.to_string();
 
                 #[cfg(feature = "crawler-llm")]
-                let data = link;
+                let data = self.scrape(link).await?;
 
-                self.crawler()
-                    .scrape_by_url(data)
-                    .await
-                    .map_err(|err| anyhow::Error::msg(err.to_string()))?
+                data
             }
         };
 
@@ -204,4 +201,15 @@ where
         let result_text = regex.replace_all(content, "").to_string();
         Ok(result_text)
     }
+
+    #[cfg(feature = "crawler-llm")]
+    async fn scrape(&self, link: &str) -> Result<String, anyhow::Error> {
+        let result = self
+            .crawler()
+            .scrape_by_url(link)
+            .await
+            .map_err(|err| anyhow::Error::msg(err.to_string()))?;
+
+        Ok(result)
+    }
 }
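
Two things to note in extract_item. First, a missing <link> now falls back to the item's GUID value via item.link().unwrap_or(guid.value()), which assumes permalink-style GUIDs. Second, the None arm uses feature-gated shadowing, now routed through the new scrape() helper. A minimal standalone illustration of that idiom follows; the function name and the string body are hypothetical stand-ins, not crate code:

// Hypothetical sketch of the cfg-shadowing idiom, not crate code.
// Feature off: the first binding is returned. Feature on: the rebinding
// shadows it, and #[allow(unused_variables)] silences the warning about
// the now-unused first binding.
fn body_text(description: &str) -> String {
    #[allow(unused_variables)]
    let data = description.to_string();

    #[cfg(feature = "crawler-llm")] // feature must be declared in Cargo.toml
    let data = "scraped article text".to_string(); // stand-in for self.scrape(link).await?

    data
}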
10 changes: 9 additions & 1 deletion tests/test_publish_feeds.rs
@@ -3,6 +3,7 @@ mod tests_helper
 
 use mocks::mock_rmq_publish::MockRabbitPublisher;
 use news_rss::config::ServiceConfig;
+use news_rss::feeds::rss_feeds::config::RssConfig;
 use news_rss::feeds::rss_feeds::RssFeeds;
 use news_rss::feeds::FetchTopic;
 use news_rss::server::RssWorker;
@@ -35,7 +36,14 @@ async fn test_rss_feeds() -> Result<(), anyhow::Error> {
     #[cfg(feature = "crawler-llm")]
     let crawler = tests_helper::build_llm_crawler(&config).await?;
 
-    let rss_config = vec![config.topics().rss()];
+    let rss_config = vec![RssConfig::builder()
+        .source_name("NDTV World News".to_owned())
+        .target_url("https://feeds.feedburner.com/ndtvnews-world-news".to_owned())
+        .max_retries(3)
+        .timeout(10)
+        .interval_secs(5)
+        .build()?];
+
     let _ = rss_config
         .into_iter()
         .filter_map(|it| RssFeeds::new(it, publish.clone(), cache.clone(), crawler.clone()).ok())
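
With [topics.rss] removed from both config files, the test now builds its RssConfig inline via the builder; note the tighter timeout(10) and interval_secs(5) versus the removed development defaults of 100 and 3600.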
