From 5af5657be0f77f71b9b38787f18872aafbc20c30 Mon Sep 17 00:00:00 2001
From: Francesco
Date: Tue, 6 Feb 2024 21:25:50 +0100
Subject: [PATCH] Skip non UTF8 files

---
 README.md                    |  1 +
 search/src/engine/builder.rs | 29 +++++++++++++++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index ae257c9..0a312ec 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,7 @@ example
 ```
 
 The builder will walk recursively down the input folder, skipping hidden ones.
+The indexer will skip and show an error for non UTF-8 files.
 
 **Load a document collection**
 
diff --git a/search/src/engine/builder.rs b/search/src/engine/builder.rs
index 507e984..4b67705 100644
--- a/search/src/engine/builder.rs
+++ b/search/src/engine/builder.rs
@@ -12,8 +12,9 @@ use indicatif::{ParallelProgressIterator, ProgressIterator, ProgressStyle};
 use rayon::prelude::*;
 use std::{
     collections::{hash_map::Entry, BTreeMap},
-    fs,
+    fs::{self},
 };
+use walkdir::DirEntry;
 
 const PROGRESS_STYLE: &str =
     "Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]";
@@ -56,13 +57,7 @@ fn build_in_memory(
     let processed_documents: Vec<(String, Vec<String>)> = files
         .into_par_iter()
         .progress_with_style(iterator_style.clone())
-        .map(|d| {
-            let file_content = fs::read_to_string(d.path()).expect("error while reading file");
-            (
-                d.path().to_str().unwrap().to_string(),
-                preprocessor.tokenize_and_stem(&file_content),
-            )
-        })
+        .filter_map(|d| process_document(d, preprocessor))
         .collect();
 
     println!("- Indexing phase");
@@ -133,3 +128,21 @@ fn build_in_memory(
         documents,
     }
 }
+
+fn process_document(
+    dir_entry: DirEntry,
+    preprocessor: &Preprocessor,
+) -> Option<(String, Vec<String>)> {
+    let file_path = dir_entry.path();
+    match fs::read_to_string(file_path) {
+        Ok(file_content) => Some((
+            dir_entry.path().to_str().unwrap().to_string(),
+            preprocessor.tokenize_and_stem(&file_content),
+        )),
+        Err(err) => {
+            // Print an error message including the file path
+            eprintln!("Error reading file {:?}: {}", file_path, err);
+            None
+        }
+    }
+}