From fe43b0c2bfba4a885b619b56e1296857258f841c Mon Sep 17 00:00:00 2001 From: Arthur Chan Date: Sat, 7 Dec 2024 19:01:25 +0000 Subject: [PATCH] [JVM] Add retry logic for no coverage gain (#742) This PR relocates the calculation of coverage total and coverage diff into the checking and retry loop. The fix can then use the freshly calculated coverage diff to determine whether a harness that builds and runs successfully increases the project coverage or not. If it does not increase the project coverage, the new retry logic prompts are used to ask the LLM to help fix the harness. Fix attempts for errors in a generated harness and for a harness with no coverage increase are counted together. This fix increases the rate of generated harnesses that build and run successfully, and makes sure more of them help improve project coverage. *Remark: this coverage feedback approach currently works exclusively for JVM projects.* --------- Signed-off-by: Arthur Chan --- data_prep/introspector.py | 6 +- experiment/evaluator.py | 106 ++++++++++-------- llm_toolkit/code_fixer.py | 6 +- llm_toolkit/prompt_builder.py | 39 ++++++- .../jvm_requirement_coverage_fixing.txt | 94 ++++++++++++++++ .../jvm_requirement_error_fixing.txt | 2 +- 6 files changed, 199 insertions(+), 54 deletions(-) create mode 100644 prompts/template_xml/jvm_requirement_coverage_fixing.txt diff --git a/data_prep/introspector.py b/data_prep/introspector.py index fa29c41da..6d7a04285 100755 --- a/data_prep/introspector.py +++ b/data_prep/introspector.py @@ -343,8 +343,10 @@ def query_introspector_public_classes(project: str) -> list[str]: return _get_data(resp, 'classes', []) -def query_introspector_source_code(project: str, filepath: str, begin_line: int, - end_line: int) -> str: +def query_introspector_source_code(project: str, + filepath: str, + begin_line: int = 0, + end_line: int = 10000) -> str: """Queries FuzzIntrospector API for source code of a file |filepath| between |begin_line| and |end_line|.""" diff 
--git a/experiment/evaluator.py b/experiment/evaluator.py index 6b5d77416..5c1fb249b 100644 --- a/experiment/evaluator.py +++ b/experiment/evaluator.py @@ -283,18 +283,23 @@ def _fix_generated_fuzz_target(self, ai_binary: str, run_result: Optional[RunResult], dual_logger: _Logger, language: str): """Fixes the generated fuzz target.""" - if build_result.succeeded and not language == 'jvm': - if run_result: - error_desc, errors = run_result.semantic_check.get_error_info() + jvm_coverage_fix = False + error_desc, errors = '', [] + if build_result.succeeded: + if language == 'jvm': + jvm_coverage_fix = True else: - dual_logger.log(f'Warning: Build succeed but no run_result in ' - f'{generated_oss_fuzz_project}.') - error_desc, errors = '', [] + if run_result: + error_desc, errors = run_result.semantic_check.get_error_info() + else: + dual_logger.log(f'Warning: Build succeed but no run_result in ' + f'{generated_oss_fuzz_project}.') else: error_desc, errors = None, build_result.errors + code_fixer.llm_fix(ai_binary, target_path, self.benchmark, iteration, error_desc, errors, self.builder_runner.fixer_model_name, - language) + language, jvm_coverage_fix) shutil.copyfile( target_path, os.path.join(oss_fuzz_checkout.OSS_FUZZ_DIR, 'projects', @@ -388,9 +393,57 @@ def check_target(self, ai_binary, target_path: str) -> Result: build_result = BuildResult() run_result = None + # 2. Calculate coverage percentage and coverage diff + coverage_summary = None + total_lines = 0 + coverage_percent = 0.0 + coverage_diff = 0.0 + if run_result: + # Gets line coverage (diff) details. + coverage_summary = self._load_existing_coverage_summary() + + if self.benchmark.language in ['python', 'jvm'] and run_result.coverage: + # The Jacoco.xml coverage report used to generate summary.json on + # OSS-Fuzz for JVM projects does not trace the source file location. + # Thus the conversion may miss some classes because they are not + # present during coverage report generation. 
This fix gets the total + # line calculation from the jacoco.xml report of the current run + # directly and compares it with the total_lines retrieved from + # summary.json. Then the larger total_lines is used which is assumed + # to be more accurate. This is the same case for python project which + # the total line is determined from the all_cov.json file. + total_lines = run_result.coverage.total_lines + elif coverage_summary: + total_lines = compute_total_lines_without_fuzz_targets( + coverage_summary, generated_target_name) + else: + total_lines = 0 + + if run_result.total_pcs: + coverage_percent = run_result.cov_pcs / run_result.total_pcs + else: + dual_logger.log( + f'Warning: total_pcs == 0 in {generated_oss_fuzz_project}.') + coverage_percent = 0.0 + + existing_textcov = self.load_existing_textcov() + if run_result.coverage: + run_result.coverage.subtract_covered_lines(existing_textcov) + + if total_lines and run_result.coverage: + coverage_diff = run_result.coverage.covered_lines / total_lines + else: + dual_logger.log( + f'Warning: total_lines == 0 in {generated_oss_fuzz_project}.') + coverage_diff = 0.0 + if self.benchmark.language == 'jvm': - # Unexpected exceptions that crash JVM fuzzers does not need to be fixed. + # For JVM, the generation is considered a success if either is true + # 1) Build success and run crashed (expected for exceptions) + # 2) Build success, run success and coverage diff > 0 gen_succ = build_result.succeeded and run_result + if gen_succ and run_result and run_result.succeeded: + gen_succ = gen_succ and (coverage_diff > 0) else: gen_succ = build_result.succeeded and run_result and run_result.succeeded @@ -465,43 +518,6 @@ def check_target(self, ai_binary, target_path: str) -> Result: run_result.coverage_report_path, run_result.reproducer_path, True, run_result.semantic_check.type, run_result.triage)) - # Gets line coverage (diff) details. 
- coverage_summary = self._load_existing_coverage_summary() - - if self.benchmark.language in ['python', 'jvm']: - # The Jacoco.xml coverage report used to generate summary.json on OSS-Fuzz - # for JVM projects does not trace the source file location. Thus the - # conversion may miss some classes because they are not present during - # coverage report generation. This fix gets the total line calculation - # from the jacoco.xml report of the current run directly and compares it - # with the total_lines retrieved from summary.json. Then the larger - # total_lines is used which is assumed to be more accurate. - # This is the same case for python project which the total line - # is determined from the all_cov.json file. - total_lines = run_result.coverage.total_lines - elif coverage_summary: - total_lines = compute_total_lines_without_fuzz_targets( - coverage_summary, generated_target_name) - else: - total_lines = 0 - - if run_result.total_pcs: - coverage_percent = run_result.cov_pcs / run_result.total_pcs - else: - dual_logger.log( - f'Warning: total_pcs == 0 in {generated_oss_fuzz_project}.') - coverage_percent = 0.0 - - existing_textcov = self.load_existing_textcov() - run_result.coverage.subtract_covered_lines(existing_textcov) - - if total_lines: - coverage_diff = run_result.coverage.covered_lines / total_lines - else: - dual_logger.log( - f'Warning: total_lines == 0 in {generated_oss_fuzz_project}.') - coverage_diff = 0.0 - dual_logger.log( f'Result for {generated_oss_fuzz_project}: ' f'crashes={run_result.crashes}, coverage={coverage_percent} ' diff --git a/llm_toolkit/code_fixer.py b/llm_toolkit/code_fixer.py index e40fb8712..bb6bea80a 100755 --- a/llm_toolkit/code_fixer.py +++ b/llm_toolkit/code_fixer.py @@ -368,7 +368,7 @@ def group_error_messages(error_lines: list[str]) -> list[str]: def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark, llm_fix_id: int, error_desc: Optional[str], errors: list[str], - fixer_model_name: str, 
language: str) -> None: + fixer_model_name: str, language: str, jvm_cov_fix: bool) -> None: """Reads and fixes |target_path| in place with LLM based on |error_log|.""" fuzz_target_source_code = parser.parse_code(target_path) @@ -385,6 +385,7 @@ def llm_fix(ai_binary: str, target_path: str, benchmark: benchmarklib.Benchmark, prompt_path, response_dir, language, + jvm_cov_fix, fixer_model_name, temperature=0.5 - llm_fix_id * 0.04) @@ -427,6 +428,7 @@ def apply_llm_fix(ai_binary: str, prompt_path: str, response_dir: str, language: str, + jvm_cov_fix: bool, fixer_model_name: str = models.DefaultModel.name, temperature: float = 0.4): """Queries LLM to fix the code.""" @@ -440,7 +442,7 @@ def apply_llm_fix(ai_binary: str, if language == 'jvm': builder = prompt_builder.JvmErrorFixingBuilder(fixer_model, benchmark, fuzz_target_source_code, - errors) + errors, jvm_cov_fix) prompt = builder.build([], None, None) prompt.save(prompt_path) else: diff --git a/llm_toolkit/prompt_builder.py b/llm_toolkit/prompt_builder.py index 814b0e210..a42aebb99 100644 --- a/llm_toolkit/prompt_builder.py +++ b/llm_toolkit/prompt_builder.py @@ -1062,16 +1062,22 @@ def __init__(self, benchmark: Benchmark, generated_harness: str, errors: list[str], + jvm_cov_fix: bool, template_dir: str = DEFAULT_TEMPLATE_DIR): super().__init__(model) self._template_dir = template_dir self.benchmark = benchmark self.generated_harness = generated_harness self.error_str = '\n'.join(errors) + self.jvm_cov_fix = jvm_cov_fix # Load templates. 
- self.template_file = self._find_template( - template_dir, 'jvm_requirement_error_fixing.txt') + if self.jvm_cov_fix: + self.template_file = self._find_template( + template_dir, 'jvm_requirement_coverage_fixing.txt') + else: + self.template_file = self._find_template( + template_dir, 'jvm_requirement_error_fixing.txt') def _find_template(self, template_dir: str, template_name: str) -> str: """Finds template file based on |template_dir|.""" @@ -1099,15 +1105,40 @@ def build(self, with open(self.template_file, 'r') as f: prompt_text = f.read() + proj = self.benchmark.project + # Format the repository target_repository = oss_fuzz_checkout.get_project_repository( self.benchmark.project) prompt_text = prompt_text.replace('{TARGET_REPO}', target_repository) + prompt_text = prompt_text.replace('{HARNESS_NAME}', + self.benchmark.target_name) - # Add the generated harness and error string to prompt + # Add the generated harness to prompt prompt_text = prompt_text.replace('{GENERATED_HARNESS}', self.generated_harness) - prompt_text = prompt_text.replace('{ERRORS}', self.error_str) + + if self.jvm_cov_fix: + # Add source code of all existing harnesses to prompt + source_list = [] + harnesses = introspector.query_introspector_for_harness_intrinsics(proj) + for pair in harnesses: + path = pair.get('source', '') + if path: + source = introspector.query_introspector_source_code(proj, path) + if source: + source_list.append(source) + + prompt_text = prompt_text.replace('{EXISTING_HARNESS}', + '\n---\n'.join(source_list)) + + # Add all public candidates to prompt + methods = introspector.query_introspector_jvm_all_public_candidates(proj) + name = [method['function_name'] for method in methods] + prompt_text = prompt_text.replace('{PUBLIC_METHODS}', ','.join(name)) + else: + # Add the error string to prompt + prompt_text = prompt_text.replace('{ERRORS}', self.error_str) self._prompt.add_priming(prompt_text) return self._prompt diff --git 
a/prompts/template_xml/jvm_requirement_coverage_fixing.txt b/prompts/template_xml/jvm_requirement_coverage_fixing.txt new file mode 100644 index 000000000..a135096a1 --- /dev/null +++ b/prompts/template_xml/jvm_requirement_coverage_fixing.txt @@ -0,0 +1,94 @@ +I'm a security engineer looking to write good fuzzing harnesses. I want you help me improve my fuzzing harness so it could covers more part of the code. + +The target library is {TARGET_REPO}. + +The target project is implemented in the Java programming language; therefore, the harness should also be written in Java. +The fuzzing harness must be executable within the Jazzer fuzzing framework. + +Below is the source code of the target fuzzing harness that I would like to improve: + +{GENERATED_HARNESS} + + +For reference, the source code for all existing harnesses of the project is provided below, separated by `---`: + +{EXISTING_HARNESS} + + +Additionally, a list of all public methods and constructors of the project is included for your reference, you should try to expand the fuzzing harness that calls these targets to improve the overall fuzzing coverage: +{PUBLIC_METHODS} + +Your task is to improve the target fuzzing harness provided above to increase code coverage for additional parts of the project that are not covered by the existing fuzzing harnesses. Please ensure that the changes made are minimal. +In your response, include ONLY the code for the harness, nothing more. You should wrap the code in tags. + +Here is an additional list of requirements that you MUST follow. + +NEVER use any methods from the java.lang.Random class in the generated code. +NEVER use any classes or methods in the java.lang.reflect package in the generated code. +NEVER use the @FuzzTest annotation for specifying the fuzzing method. +NEVER use any assert, printing and logging statements in the generated harness. +NEVER use any multithreading or multi-processing approach. 
+You MUST create the object before calling the target method. +Please use {HARNESS_NAME} as the Java class name. +You MUST invoke the close method of any resource class objects that implements the java.lang.AutoCloseable interface in the finally block after the target method is invoked. +Always create the fuzzing harness from the following templates: + +import com.code_intelligence.jazzer.api.FuzzedDataProvider; +// Other imports + +public class {HARNESS_NAME} { + public static void fuzzerInitialize() { + // Initializing objects for fuzzing + } + + public static void fuzzerTearDown() { + // Tear down objects after fuzzing + } + + public static void fuzzerTestOneInput(FuzzedDataProvider data) { + // Use the FuzzedDataProvider object to generate random data for fuzzing + + // Fuzz by invoking the target method with random parameters / objects generated above. + } +} + + +You MUST ONLY use any of the following methods from the FuzzedDataProvider of the Jazzer framework for generating random data for fuzzing. +If the needed return value is not found in the table, try use constructors or methods to create the needed random object. But you MUST try your best to randomise the random object with the methods in the table. 
+ +| Method | Return Value | +|---------------------------------------------|---------------------------------------| +| `consumeBytes(int length)` | `byte[]` | +| `consumeRemainingAsBytes()` | `byte[]` | +| `consumeString(int length)` | `String` | +| `consumeRemainingAsString()` | `String` | +| `consumeBoolean()` | `boolean` | +| `consumeInt(int min, int max)` | `int` | +| `consumeInt()` | `int` | +| `consumeLong(long min, long max)` | `long` | +| `consumeLong()` | `long` | +| `consumeFloat(float min, float max)` | `float` | +| `consumeFloat()` | `float` | +| `consumeDouble(double min, double max)` | `double` | +| `consumeDouble()` | `double` | +| `consumeChar()` | `char` | +| `consumeChar(char min, char max)` | `char` | +| `consumeShort(short min, short max)` | `short` | +| `consumeShort()` | `short` | +| `consumeRemainingAsCharSequence()` | `CharSequence` | +| `consumeBytestring()` | `byte[]` | +| `consumeBigInteger(int minNumBits)` | `BigInteger` | +| `consumeEnum(Class enumType)` | `E` (Enum type) | +| `consumeProbabilityDouble()` | `double` | +| `consumeFraction()` | `double` | +| `pickValue(T... values)` | `T` (Type of value) | +| `pickValue(List values)` | `T` (Type of value) | +| `consumeByte()` | `byte` | +| `consumeIntList(int length)` | `List` | +| `consumeLongList(int length)` | `List` | +| `consumeFloatList(int length)` | `List` | +| `consumeDoubleList(int length)` | `List` | +| `consumeCharList(int length)` | `List` | + + + diff --git a/prompts/template_xml/jvm_requirement_error_fixing.txt b/prompts/template_xml/jvm_requirement_error_fixing.txt index 00c3d1e85..1a3194f0f 100644 --- a/prompts/template_xml/jvm_requirement_error_fixing.txt +++ b/prompts/template_xml/jvm_requirement_error_fixing.txt @@ -1,4 +1,4 @@ -I'm a security engineer looking to convert unit tests into fuzzing harnesses. I got some compilation errors and want you to help fix them. +I'm a security engineer looking to write a good fuzzing harnesses. 
I got some compilation errors and want you to help fix them. The target library is {TARGET_REPO}.