Skip to content

Commit

Permalink
Changed Exclude String filters in favour of Callback Functions. (#204)
Browse files Browse the repository at this point in the history
* Added logic to ommit files or directories based on regex

* Fixed error for dir loader

* Fixed failing tests

* Changed dir loader to use functional filters for flexibility

* Changed dirfilter to pathfilter and passed a path object instead of a string

* Fixed compilation errors
  • Loading branch information
aliumujib authored Aug 21, 2024
1 parent 54fbe26 commit b0befa0
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ Cargo.lock
.DS_Store
.fastembed_cache
.vscode
.idea
66 changes: 47 additions & 19 deletions src/document_loaders/dir_loader.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,46 @@
use async_recursion::async_recursion;
use std::{path::Path, pin::Pin};
use std::sync::Arc;
use std::{fmt, path::Path, pin::Pin};
use tokio::fs;

use super::LoaderError;

pub struct PathFilter(Arc<dyn Fn(&Path) -> bool + Send + Sync>);

impl PathFilter {
pub fn new<F>(f: F) -> Self
where
F: Fn(&Path) -> bool + Send + Sync + 'static,
{
PathFilter(Arc::new(f))
}
}

impl fmt::Debug for PathFilter {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Filter")
}
}

impl Clone for PathFilter {
fn clone(&self) -> Self {
PathFilter(Arc::clone(&self.0))
}
}

#[derive(Debug, Clone, Default)]
pub struct DirLoaderOptions {
pub glob: Option<String>,
pub suffixes: Option<Vec<String>>,
pub exclude: Option<Vec<String>>,
pub path_filter: Option<PathFilter>,
}

/// Recursively list all files in a directory
#[async_recursion]
pub async fn list_files_in_path(
dir_path: &Path,
files: &mut Vec<String>,
opts: &DirLoaderOptions,
) -> Result<Pin<Box<()>>, LoaderError> {
if dir_path.is_file() {
files.push(dir_path.to_string_lossy().to_string());
Expand All @@ -30,11 +55,18 @@ pub async fn list_files_in_path(
let mut reader = fs::read_dir(dir_path).await.unwrap();
while let Some(entry) = reader.next_entry().await.unwrap() {
let path = entry.path();

if path.is_file() {
files.push(path.to_string_lossy().to_string());
} else if path.is_dir() {
list_files_in_path(&path, files).await.unwrap();
if opts
.path_filter
.as_ref()
.map_or(false, |f| f.0(path.as_path()))
{
continue;
}

list_files_in_path(&path, files, opts).await.unwrap();
}
}
Ok(Box::pin(()))
Expand All @@ -44,14 +76,14 @@ pub async fn list_files_in_path(
pub async fn find_files_with_extension(folder_path: &str, opts: &DirLoaderOptions) -> Vec<String> {
let mut matching_files = Vec::new();
let folder_path = Path::new(folder_path);

let mut all_files: Vec<String> = Vec::new();
list_files_in_path(folder_path, &mut all_files)

list_files_in_path(folder_path, &mut all_files, &opts.clone())
.await
.unwrap();

for file_name in all_files {
let path_str = file_name;
let path_str = file_name.clone();

// check if the file has the required extension
if let Some(suffixes) = &opts.suffixes {
Expand All @@ -67,18 +99,14 @@ pub async fn find_files_with_extension(folder_path: &str, opts: &DirLoaderOption
}
}

if let Some(exclude_list) = &opts.exclude {
let mut is_excluded = false;
for exclude_pattern in exclude_list {
if path_str.contains(exclude_pattern) {
is_excluded = true;
break;
}
}
if is_excluded {
continue;
}
if opts
.path_filter
.as_ref()
.map_or(false, |f| f.0(&Path::new(&file_name)))
{
continue; // Skip this path if the filter returns true
}

// check if the file matches the glob pattern
if let Some(glob_pattern) = &opts.glob {
let glob = glob::Pattern::new(glob_pattern).unwrap();
Expand Down Expand Up @@ -132,7 +160,7 @@ mod tests {
&DirLoaderOptions {
glob: None,
suffixes: Some(vec![".txt".to_string()]),
exclude: None,
path_filter: None,
},
)
.await
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ mod tests {
.with_dir_loader_options(DirLoaderOptions {
glob: None,
suffixes: Some(vec!["rs".to_string()]),
exclude: None,
path_filter: None,
});

let stream = loader_with_dir.load().await.unwrap();
Expand Down

0 comments on commit b0befa0

Please sign in to comment.