diff --git a/src/fuzz_introspector/frontends/frontend_c.py b/src/fuzz_introspector/frontends/frontend_c.py index 335a5ad4d..f440a9669 100644 --- a/src/fuzz_introspector/frontends/frontend_c.py +++ b/src/fuzz_introspector/frontends/frontend_c.py @@ -16,7 +16,6 @@ """Fuzz Introspector Light frontend""" import os -import pathlib import logging @@ -643,38 +642,23 @@ def get_linenumber(self, bytepos): return -1 -def capture_source_files_in_tree(directory_tree, language): - """Captures source code files in a given directory.""" - language_extensions = {'c': ['.c', '.h']} - language_files = [] - paths_to_avoid = [ - '/src/aflplusplus', '/src/honggfuzz', '/src/libfuzzer', '/src/fuzztest' - ] +def load_treesitter_trees(source_files: list[str], + is_log: bool = True) -> list[SourceCodeFile]: + """Creates treesitter trees for all files in a given list of source files.""" + results = [] - for dirpath, _dirnames, filenames in os.walk(directory_tree): - if any([x for x in paths_to_avoid if dirpath.startswith(x)]): + for code_file in source_files: + if not os.path.isfile(code_file): continue - for filename in filenames: - for extensions in language_extensions[language]: - if pathlib.Path(filename).suffix in extensions: - language_files.append(os.path.join(dirpath, filename)) - return language_files + source_cls = SourceCodeFile(code_file, 'c') -def load_treesitter_trees(source_files, log_harnesses=True): - """Creates treesitter trees for all files in a given list of source files.""" - results = [] + if is_log: + if source_cls.has_libfuzzer_harness(): + logger.info('harness: %s', code_file) + + results.append(source_cls) - for language in source_files: - if language == 'c': - for code_file in source_files[language]: - if not os.path.isfile(code_file): - continue - source_cls = SourceCodeFile(code_file, language) - if log_harnesses: - if source_cls.has_libfuzzer_harness(): - logger.info('harness: %s', code_file) - results.append(source_cls) return results @@ -684,13 +668,3 @@ def analyse_source_code(source_content: str) -> SourceCodeFile: language='c', source_content=source_content.encode()) return source_code - - -def analyse_folder(folder_path: str, language: str = 'c') -> Project: - """Constructs a project based on the source code in a folder.""" - source_files = {} - source_files[language] = capture_source_files_in_tree( - folder_path, language) - source_codes = load_treesitter_trees(source_files) - project = Project(source_codes) - return project diff --git a/src/fuzz_introspector/frontends/frontend_cpp.py b/src/fuzz_introspector/frontends/frontend_cpp.py index d9c1c8773..a17688391 100644 --- a/src/fuzz_introspector/frontends/frontend_cpp.py +++ b/src/fuzz_introspector/frontends/frontend_cpp.py @@ -17,7 +17,6 @@ from typing import Any, Optional, Set, List import os -import pathlib import logging from tree_sitter import Language, Parser, Node @@ -898,30 +897,6 @@ def _recursive_function_depth(function: FunctionDefinition) -> int: return func_depth -def capture_source_files_in_tree(directory_tree): - """Captures source code files in a given directory.""" - language_files = [] - language_extensions = [ - '.c', '.cpp', '.cc', '.c++', '.cxx', '.h', '.hpp', '.hh', '.hxx', - '.inl' - ] - exclude_directories = [ - 'build', 'target', 'node_modules', 'aflplusplus', 'honggfuzz', - 'inspector', 'libfuzzer', 'fuzztest' - ] - - for dirpath, _, filenames in os.walk(directory_tree): - # Skip some non project directories - if any(exclude in dirpath for exclude in exclude_directories): - continue - - for filename in filenames: - if pathlib.Path(filename).suffix.lower() in language_extensions: - language_files.append(os.path.join(dirpath, filename)) - - return language_files - - def load_treesitter_trees(source_files, is_log=True): """Creates treesitter trees for all files in a given list of source files.""" results = [] diff --git a/src/fuzz_introspector/frontends/frontend_go.py b/src/fuzz_introspector/frontends/frontend_go.py index 424f773ad..5b1d9cd06 100644 --- a/src/fuzz_introspector/frontends/frontend_go.py +++ b/src/fuzz_introspector/frontends/frontend_go.py @@ -18,7 +18,6 @@ from typing import Optional import os -import pathlib import logging from tree_sitter import Language, Parser, Node @@ -786,17 +785,6 @@ def extract_callsites(self, all_funcs_meths: dict[str, 'FunctionMethod']): self.detailed_callsites.append({'Src': src_loc, 'Dst': dst}) -def capture_source_files_in_tree(directory_tree: str) -> list[str]: - """Captures source code files in a given directory.""" - language_extensions = ['.go', '.cgo'] - language_files = [] - for dirpath, _dirnames, filenames in os.walk(directory_tree): - for filename in filenames: - if pathlib.Path(filename).suffix in language_extensions: - language_files.append(os.path.join(dirpath, filename)) - return language_files - - def load_treesitter_trees(source_files: list[str], is_log: bool = True) -> list[SourceCodeFile]: """Creates treesitter trees for all files in a given list of source files.""" diff --git a/src/fuzz_introspector/frontends/frontend_jvm.py b/src/fuzz_introspector/frontends/frontend_jvm.py index 2eac8149e..63b698e92 100644 --- a/src/fuzz_introspector/frontends/frontend_jvm.py +++ b/src/fuzz_introspector/frontends/frontend_jvm.py @@ -18,7 +18,6 @@ from typing import Optional import os -import pathlib import logging from tree_sitter import Language, Parser, Node @@ -1278,25 +1277,6 @@ def get_reachable_methods( return visited_methods -def capture_source_files_in_tree(directory_tree: str) -> list[str]: - """Captures source code files in a given directory.""" - exclude_directories = [ - 'target', 'node_modules', 'aflplusplus', 'honggfuzz', 'inspector', - 'libfuzzer' - ] - language_extensions = ['.java'] - language_files = [] - for dirpath, _, filenames in os.walk(directory_tree): - # Skip some non project directories - if any(exclude in dirpath for exclude in exclude_directories): - continue - - for filename in filenames: - if pathlib.Path(filename).suffix in language_extensions: - language_files.append(os.path.join(dirpath, filename)) - return language_files - - def load_treesitter_trees(source_files: list[str], entrypoint: str, is_log: bool = True) -> list[SourceCodeFile]: diff --git a/src/fuzz_introspector/frontends/frontend_rust.py b/src/fuzz_introspector/frontends/frontend_rust.py index 6003ec28e..a537be494 100644 --- a/src/fuzz_introspector/frontends/frontend_rust.py +++ b/src/fuzz_introspector/frontends/frontend_rust.py @@ -18,7 +18,6 @@ from typing import Any, Optional import os -import pathlib import logging from tree_sitter import Language, Parser, Node @@ -849,25 +848,6 @@ def get_reachable_functions( return visited_funcs -def capture_source_files_in_tree(directory_tree: str) -> list[str]: - """Captures source code files in a given directory.""" - exclude_directories = [ - 'tests', 'examples', 'benches', 'node_modules', 'aflplusplus', - 'honggfuzz', 'inspector', 'libfuzzer' - ] - language_extensions = ['.rs'] - language_files = [] - for dirpath, _, filenames in os.walk(directory_tree): - # Skip some non project directories - if any(exclude in dirpath for exclude in exclude_directories): - continue - - for filename in filenames: - if pathlib.Path(filename).suffix in language_extensions: - language_files.append(os.path.join(dirpath, filename)) - return language_files - - def load_treesitter_trees(source_files: list[str], is_log: bool = True) -> list[SourceCodeFile]: """Creates treesitter trees for all files in a given list of source files.""" diff --git a/src/fuzz_introspector/frontends/oss_fuzz.py b/src/fuzz_introspector/frontends/oss_fuzz.py index c15d7319d..aa1098843 100644 --- a/src/fuzz_introspector/frontends/oss_fuzz.py +++ b/src/fuzz_introspector/frontends/oss_fuzz.py @@ -16,6 +16,7 @@ import os import argparse +import pathlib import logging from typing import Any @@ -29,6 +30,22 @@ logger = logging.getLogger(name=__name__) LOG_FMT = '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s' +LANGUAGE_EXTENSION_MAP = { + 'c': ['.c', '.h'], + 'c++': + ['.c', '.cpp', '.cc', '.c++', '.cxx', '.h', '.hpp', '.hh', '.hxx', '.inl'], + 'cpp': + ['.c', '.cpp', '.cc', '.c++', '.cxx', '.h', '.hpp', '.hh', '.hxx', '.inl'], + 'go': ['.go', '.cgo'], + 'jvm': ['.java'], + 'rust': ['.rs'], +} + +EXCLUDE_DIRECTORIES = [ + 'node_modules', 'aflplusplus', 'honggfuzz', 'inspector', 'libfuzzer', + 'fuzztest', 'target', 'build' +] + def setup_logging(): """Initializes logging""" @@ -54,22 +71,38 @@ def parse_args(): return parser.parse_args() +def capture_source_files_in_tree(directory_tree: str, + language: str) -> list[str]: + """Captures source code files in a given directory.""" + language_files = [] + language_extensions = LANGUAGE_EXTENSION_MAP.get(language.lower(), []) + + for dirpath, _, filenames in os.walk(directory_tree): + # Skip some non project directories + if any(exclude in dirpath for exclude in EXCLUDE_DIRECTORIES): + continue + + for filename in filenames: + if pathlib.Path(filename).suffix in language_extensions: + language_files.append(os.path.join(dirpath, filename)) + return language_files + + def process_c_project(target_dir: str, entrypoint: str, out: str, + source_files: list[str], module_only: bool = False, dump_output=True) -> frontend_c.Project: """Process a project in C language""" + calltrees = [] + # Default entrypoint if not entrypoint: entrypoint = 'LLVMFuzzerTestOneInput' - calltrees = [] - source_files = {} - source_files['c'] = frontend_c.capture_source_files_in_tree( - target_dir, 'c') - logger.info('Found %d files to include in analysis', - len(source_files['c'])) + logger.info('Going C route') + logger.info('Found %d files to include in analysis', len(source_files)) logger.info('Loading tree-sitter trees') source_codes = frontend_c.load_treesitter_trees(source_files) @@ -120,22 +153,19 @@ def process_c_project(target_dir: str, return project -def process_cpp_project(target_dir: str, - entrypoint: str, +def process_cpp_project(entrypoint: str, out: str, + source_files: list[str], dump_output=True) -> frontend_cpp.Project: """Process a project in CPP language""" + calltrees = [] + # Default entrypoint if not entrypoint: entrypoint = 'LLVMFuzzerTestOneInput' - # Extract c++ source files - logger.info('Going C++ route') - calltrees = [] - source_files = [] - source_files = frontend_cpp.capture_source_files_in_tree(target_dir) - # Process tree sitter for c++ source files + logger.info('Going C++ route') logger.info('Found %d files to include in analysis', len(source_files)) logger.info('Loading tree-sitter trees') source_codes = frontend_cpp.load_treesitter_trees(source_files) @@ -170,17 +200,14 @@ def process_cpp_project(target_dir: str, return project -def process_go_project(target_dir: str, - out: str, +def process_go_project(out: str, + source_files: list[str], dump_output=True) -> frontend_go.Project: """Process a project in Go language""" - # Extract go source files - logger.info('Going Go route') calltrees = [] - source_files = [] - source_files = frontend_go.capture_source_files_in_tree(target_dir) # Process tree sitter for go source files + logger.info('Going Go route') logger.info('Found %d files to include in analysis', len(source_files)) logger.info('Loading tree-sitter trees') source_codes = frontend_go.load_treesitter_trees(source_files) @@ -207,22 +234,19 @@ def process_go_project(target_dir: str, return project -def process_jvm_project(target_dir: str, - entrypoint: str, +def process_jvm_project(entrypoint: str, out: str, + source_files: list[str], dump_output=True) -> frontend_jvm.Project: """Process a project in JVM based language""" + calltrees = [] + # Default entrypoint if not entrypoint: entrypoint = 'fuzzerTestOneInput' - # Extract java source files - logger.info('Going JVM route') - calltrees = [] - source_files = [] - source_files = frontend_jvm.capture_source_files_in_tree(target_dir) - # Process tree sitter for go source files + logger.info('Going JVM route') logger.info('Found %d files to include in analysis', len(source_files)) logger.info('Loading tree-sitter trees') source_codes = frontend_jvm.load_treesitter_trees(source_files, entrypoint) @@ -251,17 +275,14 @@ def process_jvm_project(target_dir: str, return project -def process_rust_project(target_dir: str, - out: str, +def process_rust_project(out: str, + source_files: list[str], dump_output=True) -> frontend_rust.Project: """Process a project in Rust based language""" - # Extract rust source files - logger.info('Going Rust route') calltrees = [] - source_files = [] - source_files = frontend_rust.capture_source_files_in_tree(target_dir) # Process tree sitter for rust source files + logger.info('Going Rust route') logger.info('Found %d files to include in analysis', len(source_files)) logger.info('Loading tree-sitter trees') source_codes = frontend_rust.load_treesitter_trees(source_files) @@ -298,26 +319,30 @@ def analyse_folder(language: str = '', dump_output=True) -> Any: """Runs a full frontend analysis on a given directory""" + # Extract source files for target language + source_files = capture_source_files_in_tree(directory, language) + if language == 'c': return process_c_project(directory, entrypoint, out, + source_files, module_only, dump_output=dump_output) elif language.lower() in ['cpp', 'c++']: - return process_cpp_project(directory, - entrypoint, + return process_cpp_project(entrypoint, out, + source_files, dump_output=dump_output) elif language == 'go': - return process_go_project(directory, out, dump_output=dump_output) + return process_go_project(out, source_files, dump_output=dump_output) elif language == 'jvm': - return process_jvm_project(directory, - entrypoint, + return process_jvm_project(entrypoint, out, + source_files, dump_output=dump_output) elif language == 'rust': - return process_rust_project(directory, out, dump_output=dump_output) + return process_rust_project(out, source_files, dump_output=dump_output) return [], None