diff --git a/autorag/VERSION b/autorag/VERSION
index 7c97c9b17..c44fec2a0 100644
--- a/autorag/VERSION
+++ b/autorag/VERSION
@@ -1 +1 @@
-0.3.11rc2
+0.3.11rc3
diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
index 849b2cbc3..a289e35ba 100644
--- a/autorag/data/parse/run.py
+++ b/autorag/data/parse/run.py
@@ -53,12 +53,22 @@ def run_parser(
     )
     set_file_types = set([module["file_type"] for module in module_params])
 
+    # Calculate the set difference once
+    file_types_to_remove = set_file_types - file_types
+
+    # Drop the modules whose file type has no file in the data path.
+    # Filter (module, param) pairs together: filtering module_params first
+    # and zipping afterwards would pair modules with the wrong params.
+    kept_pairs = [
+        (module, param)
+        for module, param in zip(modules, module_params)
+        if param["file_type"] not in file_types_to_remove
+    ]
+    modules = [module for module, _ in kept_pairs]
+    module_params = [param for _, param in kept_pairs]
+
     # create a list of only those file_types that are in file_types but not in set_file_types
     missing_file_types = list(file_types - set_file_types)
-    if list(set_file_types - file_types):
-        raise ValueError(
-            f"File types {list(set_file_types - file_types)} are not in the data path."
-        )
 
     if missing_file_types:
         add_modules_list = []
diff --git a/tests/autorag/data/parse/test_parse_run.py b/tests/autorag/data/parse/test_parse_run.py
index 9c97a6d03..bf0c2f76f 100644
--- a/tests/autorag/data/parse/test_parse_run.py
+++ b/tests/autorag/data/parse/test_parse_run.py
@@ -24,3 +24,25 @@ def test_run_parser():
             "file_type": "pdf",
         }
         assert os.path.exists(os.path.join(temp_dir, "pdf.parquet"))
+
+
+def test_run_parser_two():
+    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
+        modules = [langchain_parse, langchain_parse]
+        module_params = [
+            {"parse_method": "pdfminer", "file_type": "pdf"},
+            {"parse_method": "csv", "file_type": "csv"},
+        ]
+        data_path_glob = eng_text_glob
+        summary_df = run_parser(
+            modules, module_params, data_path_glob, temp_dir, all_files=False
+        )
+        assert os.path.exists(os.path.join(temp_dir, "summary.csv"))
+        expect_columns = {"filename", "module_name", "module_params", "execution_time"}
+        assert set(summary_df.columns) == expect_columns
+        assert len(summary_df) == 1
+        assert summary_df["module_params"][0] == {
+            "parse_method": "pdfminer",
+            "file_type": "pdf",
+        }
+        assert os.path.exists(os.path.join(temp_dir, "pdf.parquet"))