From 02c4b940c94cd3c7e9e3a0d40af4ebe0f2e000d9 Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Fri, 3 Jan 2025 16:37:54 -0600 Subject: [PATCH 01/16] initial commit --- .../check_operators/dataframe_operators.py | 2 +- cdisc_rules_engine/rules_engine.py | 6 + tests/PerformanceTest.py | 223 ++++++++++++++++++ 3 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 tests/PerformanceTest.py diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index e8afa5e2..d3d592d7 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -28,7 +28,7 @@ from cdisc_rules_engine.services import logger from functools import wraps import traceback - +import time def log_operator_execution(func): @wraps(func) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index aab731bb..71ac33d1 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -45,6 +45,7 @@ ExternalDictionariesContainer, ) import traceback +import time class RulesEngine: @@ -343,10 +344,14 @@ def execute_rule( # Adding copy for now to avoid updating cached dataset dataset = deepcopy(dataset) # preprocess dataset + + logger.log(f"\n\ST{time.time()}-Dataset Preprocessing Starts") dataset_preprocessor = DatasetPreprocessor( dataset, domain, dataset_path, self.data_service, self.cache ) dataset = dataset_preprocessor.preprocess(rule_copy, datasets) + logger.log(f"\n\ST{time.time()}-Dataset Preprocessing Ends") + logger.log(f"\n\OPRNT{time.time()}-Operation Starts") dataset = self.rule_processor.perform_rule_operations( rule_copy, dataset, @@ -359,6 +364,7 @@ def execute_rule( external_dictionaries=self.external_dictionaries, ct_packages=ct_packages, ) + logger.log(f"\n\OPRNT{time.time()}-Operation Ends") relationship_data = {} if domain is not None and self.rule_processor.is_relationship_dataset(domain): relationship_data = self.data_processor.preprocess_relationship_dataset( diff --git a/tests/PerformanceTest.py b/tests/PerformanceTest.py new file mode 100644 index 00000000..97a2b396 --- /dev/null +++ b/tests/PerformanceTest.py @@ -0,0 +1,223 @@ +import os +import time +import pandas as pd +import subprocess +from statistics import median +import re +import click + +# Function to extract preprocessing time from logs +def extract_preprocessing_time_from_logs(output_lines): + start_time = None + end_time = None + + # Loop through the log lines + for line in output_lines: + # Check for "Dataset Preprocessing Starts" + if "Dataset Preprocessing Starts" in line: + match = re.search(r"\\ST(\d+\.\d+)", line) # Match the timestamp after \ST + if match: + start_time = float(match.group(1)) + print(f"Extracted start time: {start_time}") + # Check for "Dataset Preprocessing Ends" + elif "Dataset Preprocessing Ends" in line: + match = re.search(r"\\ST(\d+\.\d+)", line) # Match the timestamp after \ST + if match: + end_time = float(match.group(1)) + print(f"Extracted end time: {end_time}") + + # Return the difference if both times are found + if start_time is not None and end_time is not None: + return end_time - start_time + + return 0 + + +# Function to extract operator times from logs +def extract_operator_times(output_lines): + operator_times = {} + start_times = {} + + # Loop through the log lines + for line in output_lines: + # Check for operator start + match_start = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) starts", line) + if match_start: + timestamp, operation_name = float(match_start.group(1)), match_start.group(2) + start_times[operation_name] = timestamp + + # Check for operator end + match_end = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) ends", line) + if match_end: + timestamp, operation_name = float(match_end.group(1)), match_end.group(2) + if operation_name in start_times: + duration = timestamp - start_times.pop(operation_name, 0) + if operation_name in operator_times: + operator_times[operation_name].append(duration) + else: + operator_times[operation_name] = [duration] + + return operator_times + + +# Function to extract operation times from terminal logs +def extract_operation_times_from_logs(output_lines): + operation_times = {} + start_times = {} + + # Loop through the log lines + for line in output_lines: + # Check for operation start (from terminal logs) + match_start = re.search(r"\\OPRNT(\d+\.\d+)-Operation Starts", line) + if match_start: + timestamp = float(match_start.group(1)) + start_times[timestamp] = time.time() # Store start time for operation + + # Check for operation end (from terminal logs) + match_end = re.search(r"\\OPRNT(\d+\.\d+)-Operation Ends", line) + if match_end: + timestamp = float(match_end.group(1)) + if timestamp in start_times: + duration = time.time() - start_times.pop(timestamp) + operation_times[timestamp] = duration + + return operation_times + + +# Update TimeTestFunction to record operator and operation times +def TimeTestFunction(data_dir, rule_dir, total_calls): + results = [] # List to store results for DataFrame + + # Collect all dataset files from XPT directory + data_files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith(".json") or file.endswith(".xpt")] + + # Collect all rules from the rule directory + rules = [file for file in os.listdir(rule_dir) if os.path.isfile(os.path.join(rule_dir, file))] + + # Execute each rule on each dataset + for dataset_path in data_files: + for rule in rules: + time_taken = [] + preprocessing_times = [] # Track preprocessing time for the current execution + all_operator_times = {} # To store operator times + all_operation_times = {} # To store operation times + + for num_call in range(total_calls): + rule_path = os.path.join(rule_dir, rule) + + # Construct the command + command = [ + "python3", "core.py", "test", + "-s", "sdtmig", + "-v", "3.4", + "-r", rule_path, + "-dp", dataset_path, + "-l", "critical" + ] + + print(f"Executing: {' '.join(command)} for call {num_call+1}") + + # Execute the command and capture logs + try: + start_time = time.time() + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + stdout, stderr = process.communicate() + end_time = time.time() + + # Parse logs from stderr for preprocessing and operation times + output_lines = stderr.splitlines() + + preprocessing_time = extract_preprocessing_time_from_logs(output_lines) + operator_times = extract_operator_times(output_lines) + operation_times = extract_operation_times_from_logs(output_lines) + + if process.returncode == 0: + time_taken.append(end_time - start_time) # Append execution time for each call + preprocessing_times.append(preprocessing_time) + # Aggregate operator times + for op, durations in operator_times.items(): + if op in all_operator_times: + all_operator_times[op].extend(durations) + else: + all_operator_times[op] = durations + # Aggregate operation times + for op, duration in operation_times.items(): + if op in all_operation_times: + all_operation_times[op].append(duration) + else: + all_operation_times[op] = [duration] + else: + raise subprocess.CalledProcessError(process.returncode, command, stderr) + + except subprocess.CalledProcessError as e: + print(e) + results.append({ + "function type": "TimeTestFunction", + "rule name": rule, + "dataset": os.path.basename(dataset_path), + "status": "Failed", + "Number of Calls": num_call, + "Mean Time": None, + "Median Time": None, + "Min Time": None, + "Max Time": None, + "Preprocessing Time": None, + "Operator Times": None, + "Operation Times": None, + "Error": e.stderr + }) + break + + if len(time_taken) > 0: + results.append({ + "function type": "TimeTestFunction", + "rule name": rule, + "dataset": os.path.basename(dataset_path), + "status": "Successful", + "Number of Calls": total_calls, + "Mean Time": sum(time_taken) / len(time_taken), + "Median Time": median(time_taken), + "Min Time": min(time_taken), + "Max Time": max(time_taken), + "Preprocessing Time": ", ".join(map(str, preprocessing_times)) if preprocessing_times else None, + # Store operator times as a string + "Operator Times": {op: durations for op, durations in all_operator_times.items()}, + # Store operation times as a string + "Operation Times": {op: durations for op, durations in all_operation_times.items()}, + "Error": None + }) + + return results + + +# Main execution +@click.command() +@click.option('-dd', type=str) +@click.option('-rd', type=str) +@click.option('-total_calls', type=int) +@click.option('-od', default=os.getcwd(), help="Directory to save the output file (default is current directory)") +def main(dd, rd, total_calls, od): + total_time_start = time.time() + + # Collect results from TimeTestFunction + test_results = TimeTestFunction(dd, rd, total_calls) + + total_time = time.time() - total_time_start + + # Create a DataFrame and save to an Excel file + results_df = pd.DataFrame(test_results) + + # Add total execution time to the report + total_time_row = ['Total Time'] + [None] * (len(results_df.columns) - 2) + [total_time] + results_df.loc[len(results_df)] = total_time_row + + # Save to Excel + output_path = os.path.join(od, "rule_execution_report.xlsx") + results_df.to_excel(output_path, index=False) + results_df.to_json(os.path.join(od, "rule_execution_report.json")) + print(f"\nExecution results saved to '{output_path}'") + print(results_df) + + +if __name__ == "__main__": + main() \ No newline at end of file From 36ee676ddfb1e58f62b5f536623e82c43b471a61 Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Fri, 3 Jan 2025 16:47:27 -0600 Subject: [PATCH 02/16] lint update --- .../check_operators/dataframe_operators.py | 1 + tests/PerformanceTest.py | 220 +++++++++--------- 2 files changed, 117 insertions(+), 104 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index d3d592d7..9110f89e 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -30,6 +30,7 @@ import traceback import time + def log_operator_execution(func): @wraps(func) def wrapper(self, other_value, *args, **kwargs): diff --git a/tests/PerformanceTest.py b/tests/PerformanceTest.py index 97a2b396..2cb765ce 100644 --- a/tests/PerformanceTest.py +++ b/tests/PerformanceTest.py @@ -1,36 +1,30 @@ import os import time -import pandas as pd import subprocess +import pandas as pd from statistics import median import re import click + # Function to extract preprocessing time from logs def extract_preprocessing_time_from_logs(output_lines): start_time = None end_time = None - # Loop through the log lines for line in output_lines: - # Check for "Dataset Preprocessing Starts" if "Dataset Preprocessing Starts" in line: - match = re.search(r"\\ST(\d+\.\d+)", line) # Match the timestamp after \ST + match = re.search(r"\\ST(\d+\.\d+)", line) if match: start_time = float(match.group(1)) print(f"Extracted start time: {start_time}") - # Check for "Dataset Preprocessing Ends" elif "Dataset Preprocessing Ends" in line: - match = re.search(r"\\ST(\d+\.\d+)", line) # Match the timestamp after \ST + match = re.search(r"\\ST(\d+\.\d+)", line) if match: end_time = float(match.group(1)) print(f"Extracted end time: {end_time}") - # Return the difference if both times are found - if start_time is not None and end_time is not None: - return end_time - start_time - - return 0 + return end_time - start_time if start_time and end_time else 0 # Function to extract operator times from logs @@ -38,24 +32,20 @@ def extract_operator_times(output_lines): operator_times = {} start_times = {} - # Loop through the log lines for line in output_lines: - # Check for operator start match_start = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) starts", line) if match_start: - timestamp, operation_name = float(match_start.group(1)), match_start.group(2) + timestamp, operation_name = float(match_start.group(1)), match_start.group( + 2 + ) start_times[operation_name] = timestamp - # Check for operator end match_end = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) ends", line) if match_end: timestamp, operation_name = float(match_end.group(1)), match_end.group(2) if operation_name in start_times: - duration = timestamp - start_times.pop(operation_name, 0) - if operation_name in operator_times: - operator_times[operation_name].append(duration) - else: - operator_times[operation_name] = [duration] + duration = timestamp - start_times.pop(operation_name) + operator_times.setdefault(operation_name, []).append(duration) return operator_times @@ -65,15 +55,12 @@ def extract_operation_times_from_logs(output_lines): operation_times = {} start_times = {} - # Loop through the log lines for line in output_lines: - # Check for operation start (from terminal logs) match_start = re.search(r"\\OPRNT(\d+\.\d+)-Operation Starts", line) if match_start: timestamp = float(match_start.group(1)) - start_times[timestamp] = time.time() # Store start time for operation + start_times[timestamp] = time.time() - # Check for operation end (from terminal logs) match_end = re.search(r"\\OPRNT(\d+\.\d+)-Operation Ends", line) if match_end: timestamp = float(match_end.group(1)) @@ -84,134 +71,159 @@ def extract_operation_times_from_logs(output_lines): return operation_times -# Update TimeTestFunction to record operator and operation times +# Simplified TimeTestFunction def TimeTestFunction(data_dir, rule_dir, total_calls): - results = [] # List to store results for DataFrame - - # Collect all dataset files from XPT directory - data_files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith(".json") or file.endswith(".xpt")] - - # Collect all rules from the rule directory - rules = [file for file in os.listdir(rule_dir) if os.path.isfile(os.path.join(rule_dir, file))] + results = [] + + # Collect all dataset files and rules + data_files = [ + os.path.join(data_dir, file) + for file in os.listdir(data_dir) + if file.endswith((".json", ".xpt")) + ] + rules = [ + file + for file in os.listdir(rule_dir) + if os.path.isfile(os.path.join(rule_dir, file)) + ] # Execute each rule on each dataset for dataset_path in data_files: for rule in rules: time_taken = [] - preprocessing_times = [] # Track preprocessing time for the current execution - all_operator_times = {} # To store operator times - all_operation_times = {} # To store operation times + preprocessing_times = [] + all_operator_times = {} + all_operation_times = {} for num_call in range(total_calls): rule_path = os.path.join(rule_dir, rule) - - # Construct the command command = [ - "python3", "core.py", "test", - "-s", "sdtmig", - "-v", "3.4", - "-r", rule_path, - "-dp", dataset_path, - "-l", "critical" + "python3", + "core.py", + "test", + "-s", + "sdtmig", + "-v", + "3.4", + "-r", + rule_path, + "-dp", + dataset_path, + "-l", + "critical", ] + print(f"Executing: {' '.join(command)} for call {num_call + 1}") - print(f"Executing: {' '.join(command)} for call {num_call+1}") - - # Execute the command and capture logs try: start_time = time.time() - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) stdout, stderr = process.communicate() end_time = time.time() - # Parse logs from stderr for preprocessing and operation times output_lines = stderr.splitlines() - preprocessing_time = extract_preprocessing_time_from_logs(output_lines) + preprocessing_time = extract_preprocessing_time_from_logs( + output_lines + ) operator_times = extract_operator_times(output_lines) operation_times = extract_operation_times_from_logs(output_lines) if process.returncode == 0: - time_taken.append(end_time - start_time) # Append execution time for each call + time_taken.append(end_time - start_time) preprocessing_times.append(preprocessing_time) - # Aggregate operator times + for op, durations in operator_times.items(): - if op in all_operator_times: - all_operator_times[op].extend(durations) - else: - all_operator_times[op] = durations - # Aggregate operation times + all_operator_times.setdefault(op, []).extend(durations) for op, duration in operation_times.items(): - if op in all_operation_times: - all_operation_times[op].append(duration) - else: - all_operation_times[op] = [duration] + all_operation_times.setdefault(op, []).append(duration) else: - raise subprocess.CalledProcessError(process.returncode, command, stderr) + raise subprocess.CalledProcessError( + process.returncode, command, stderr + ) except subprocess.CalledProcessError as e: - print(e) - results.append({ + results.append( + { + "function type": "TimeTestFunction", + "rule name": rule, + "dataset": os.path.basename(dataset_path), + "status": "Failed", + "Number of Calls": num_call, + "Mean Time": None, + "Median Time": None, + "Min Time": None, + "Max Time": None, + "Preprocessing Time": None, + "Operator Times": None, + "Operation Times": None, + "Error": e.stderr, + } + ) + break + + if time_taken: + results.append( + { "function type": "TimeTestFunction", "rule name": rule, "dataset": os.path.basename(dataset_path), - "status": "Failed", - "Number of Calls": num_call, - "Mean Time": None, - "Median Time": None, - "Min Time": None, - "Max Time": None, - "Preprocessing Time": None, - "Operator Times": None, - "Operation Times": None, - "Error": e.stderr - }) - break - - if len(time_taken) > 0: - results.append({ - "function type": "TimeTestFunction", - "rule name": rule, - "dataset": os.path.basename(dataset_path), - "status": "Successful", - "Number of Calls": total_calls, - "Mean Time": sum(time_taken) / len(time_taken), - "Median Time": median(time_taken), - "Min Time": min(time_taken), - "Max Time": max(time_taken), - "Preprocessing Time": ", ".join(map(str, preprocessing_times)) if preprocessing_times else None, - # Store operator times as a string - "Operator Times": {op: durations for op, durations in all_operator_times.items()}, - # Store operation times as a string - "Operation Times": {op: durations for op, durations in all_operation_times.items()}, - "Error": None - }) + "status": "Successful", + "Number of Calls": total_calls, + "Mean Time": sum(time_taken) / len(time_taken), + "Median Time": median(time_taken), + "Min Time": min(time_taken), + "Max Time": max(time_taken), + "Preprocessing Time": ( + ", ".join(map(str, preprocessing_times)) + if preprocessing_times + else None + ), + "Operator Times": { + op: durations + for op, durations in all_operator_times.items() + }, + "Operation Times": { + op: durations + for op, durations in all_operation_times.items() + }, + "Error": None, + } + ) return results # Main execution @click.command() -@click.option('-dd', type=str) -@click.option('-rd', type=str) -@click.option('-total_calls', type=int) -@click.option('-od', default=os.getcwd(), help="Directory to save the output file (default is current directory)") +@click.option("-dd", type=str) +@click.option("-rd", type=str) +@click.option("-total_calls", type=int) +@click.option( + "-od", + default=os.getcwd(), + help="Directory to save the output file (default is current directory)", +) def main(dd, rd, total_calls, od): total_time_start = time.time() - # Collect results from TimeTestFunction test_results = TimeTestFunction(dd, rd, total_calls) total_time = time.time() - total_time_start - # Create a DataFrame and save to an Excel file + # Create a DataFrame and save to Excel/JSON results_df = pd.DataFrame(test_results) - # Add total execution time to the report - total_time_row = ['Total Time'] + [None] * (len(results_df.columns) - 2) + [total_time] + # Add total execution time + total_time_row = ( + ["Total Time"] + [None] * (len(results_df.columns) - 2) + [total_time] + ) results_df.loc[len(results_df)] = total_time_row - # Save to Excel output_path = os.path.join(od, "rule_execution_report.xlsx") results_df.to_excel(output_path, index=False) results_df.to_json(os.path.join(od, "rule_execution_report.json")) @@ -220,4 +232,4 @@ def main(dd, rd, total_calls, od): if __name__ == "__main__": - main() \ No newline at end of file + main() From c5c91ac361a2682eb79a38a9ad4e92ae38eaaeee Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Sun, 26 Jan 2025 17:46:46 -0600 Subject: [PATCH 03/16] update --- .../check_operators/dataframe_operators.py | 2 + tests/PerformanceTest.py | 400 +++++++++++++----- 2 files changed, 306 insertions(+), 96 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 9110f89e..30f836c0 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -36,7 +36,9 @@ def log_operator_execution(func): def wrapper(self, other_value, *args, **kwargs): try: logger.info(f"Starting check operator: {func.__name__}") + logger.log(f"\n\OPRT{time.time()}-operator {func.__name__} starts") result = func(self, other_value) + logger.log(f"\n\OPRT{time.time()}-operator {func.__name__} ends") logger.info(f"Completed check operator: {func.__name__}") return result except Exception as e: diff --git a/tests/PerformanceTest.py b/tests/PerformanceTest.py index 2cb765ce..457df80d 100644 --- a/tests/PerformanceTest.py +++ b/tests/PerformanceTest.py @@ -1,12 +1,11 @@ import os import time -import subprocess import pandas as pd +import subprocess from statistics import median import re import click - # Function to extract preprocessing time from logs def extract_preprocessing_time_from_logs(output_lines): start_time = None @@ -24,7 +23,10 @@ def extract_preprocessing_time_from_logs(output_lines): end_time = float(match.group(1)) print(f"Extracted end time: {end_time}") - return end_time - start_time if start_time and end_time else 0 + if start_time is not None and end_time is not None: + return end_time - start_time + + return 0 # Function to extract operator times from logs @@ -35,50 +37,191 @@ def extract_operator_times(output_lines): for line in output_lines: match_start = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) starts", line) if match_start: - timestamp, operation_name = float(match_start.group(1)), match_start.group( - 2 - ) + timestamp, operation_name = float(match_start.group(1)), match_start.group(2) start_times[operation_name] = timestamp match_end = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) ends", line) if match_end: timestamp, operation_name = float(match_end.group(1)), match_end.group(2) if operation_name in start_times: - duration = timestamp - start_times.pop(operation_name) - operator_times.setdefault(operation_name, []).append(duration) + duration = timestamp - start_times.pop(operation_name, 0) + if operation_name in operator_times: + operator_times[operation_name].append(duration) + else: + operator_times[operation_name] = [duration] return operator_times # Function to extract operation times from terminal logs def extract_operation_times_from_logs(output_lines): - operation_times = {} - start_times = {} + operation_times = [] for line in output_lines: match_start = re.search(r"\\OPRNT(\d+\.\d+)-Operation Starts", line) if match_start: timestamp = float(match_start.group(1)) - start_times[timestamp] = time.time() + start_time = time.time() match_end = re.search(r"\\OPRNT(\d+\.\d+)-Operation Ends", line) if match_end: timestamp = float(match_end.group(1)) - if timestamp in start_times: - duration = time.time() - start_times.pop(timestamp) - operation_times[timestamp] = duration + operation_times.append(time.time() - start_time) return operation_times +def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): + + results = [] # To store the final report + rule_results = {} # To store rule-specific results for Excel sheet creation + + dataset_files = [ + os.path.join(dataset_dir, file) + for file in os.listdir(dataset_dir) + if file.endswith((".json", ".xpt")) + ] + rules = [ + file + for file in os.listdir(rule_dir) + if os.path.isfile(os.path.join(rule_dir, file)) + ] + + # For + for dataset_path in dataset_files: + dataset_name = os.path.basename(dataset_path) + + # Initialize variables to collect times for the dataset across all rules + all_time_taken = [] + all_preprocessing_times = [] + all_operator_times = {} + all_operation_times = [] + + for rule in rules: + rule_name = os.path.basename(rule) + time_taken = [] # Time for individual rule + preprocessing_times = [] # Preprocessing times for individual rule + operator_times = {} # Operator times for individual rule + operation_times = [] # Operation times for individual rule + rule_executions = 0 # Count how many times the rule was executed for this dataset + + for num_call in range(total_calls): + rule_path = os.path.join(rule_dir, rule) + command = [ + "python3", + "core.py", + "test", + "-s", + "sdtmig", + "-v", + "3.4", + "-r", + rule_path, + "-dp", + dataset_path, + "-l", + "critical" + ] + print(f"Executing: {' '.join(command)} for call {num_call + 1}") + + try: + start_time = time.time() + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + stdout, stderr = process.communicate() + end_time = time.time() + + output_lines = stderr.splitlines() + + # Extract preprocessing time, operator times, and operation times + preprocessing_time = extract_preprocessing_time_from_logs(output_lines) + rule_operator_times = extract_operator_times(output_lines) + rule_operation_times = extract_operation_times_from_logs(output_lines) -# Simplified TimeTestFunction -def TimeTestFunction(data_dir, rule_dir, total_calls): - results = [] + if process.returncode == 0: + time_taken.append(end_time - start_time) + preprocessing_times.append(preprocessing_time) + rule_executions += 1 # Increment rule execution count - # Collect all dataset files and rules - data_files = [ - os.path.join(data_dir, file) - for file in os.listdir(data_dir) + # Aggregate operator times and operation times + for op, durations in rule_operator_times.items(): + all_operator_times.setdefault(op, []).extend(durations) + all_operation_times.extend(rule_operation_times) + else: + raise subprocess.CalledProcessError(process.returncode, command, stderr) + + except subprocess.CalledProcessError as e: + results.append( + { + "function type": "TimeTestFunction", + "rule name": rule_name, + "dataset": dataset_name, + "status": "Failed", + "Number of Calls": num_call + 1, + "Mean Time": None, + "Median Time": None, + "Min Time": None, + "Max Time": None, + "Preprocessing Time": None, + "Operator Times": None, + "Operation Times": None, + "Error": e.stderr, + } + ) + break + + # After all calls for a rule, summarize and add the times to the dataset-level collection + if time_taken: + all_time_taken.extend(time_taken) + all_preprocessing_times.extend(preprocessing_times) + + # Store rule-specific results for creating separate sheets in Excel + if rule_name not in rule_results: + rule_results[rule_name] = [] + rule_results[rule_name].append({ + "Dataset": dataset_name, + "Number of Calls": rule_executions, + "Mean Time": sum(time_taken) / len(time_taken), + "Median Time": median(time_taken), + "Min Time": min(time_taken), + "Max Time": max(time_taken), + "Preprocessing Time": ", ".join(map(str, preprocessing_times)), + "Operator Times": ", ".join([f"{op}: {durations}" for op, durations in rule_operator_times.items()]), + "Operation Times": ", ".join(map(str, all_operation_times)), + }) + + # After all rules have been processed for a dataset, calculate the overall stats + if all_time_taken: + results.append( + { + "function type": "TimeTestFunction", + "rule name": "All Rules Combined", + "dataset": dataset_name, + "status": "Successful", + "Number of Calls for each rule": total_calls, + "Mean Time": sum(all_time_taken) / len(all_time_taken), + "Median Time": median(all_time_taken), + "Min Time": min(all_time_taken), + "Max Time": max(all_time_taken), + "Preprocessing Time": ", ".join(map(str, all_preprocessing_times)), + "Operator Times": all_operator_times, + "Operation Times": ", ".join(map(str, all_operation_times)), + "Error": None, + } + ) + + return results, rule_results + +def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): + results = [] # To store the final report + dataset_results = {} # To store dataset-specific results for Excel sheet creation + + dataset_files = [ + os.path.join(dataset_dir, file) + for file in os.listdir(dataset_dir) if file.endswith((".json", ".xpt")) ] rules = [ @@ -87,13 +230,23 @@ def TimeTestFunction(data_dir, rule_dir, total_calls): if os.path.isfile(os.path.join(rule_dir, file)) ] - # Execute each rule on each dataset - for dataset_path in data_files: - for rule in rules: - time_taken = [] - preprocessing_times = [] - all_operator_times = {} - all_operation_times = {} + for rule in rules: + rule_name = os.path.basename(rule) + + # Initialize variables to collect times for the dataset across all rules + all_time_taken = [] + all_preprocessing_times = [] + all_operator_times = {} + all_operation_times = [] + + rule_names=[] + for dataset_path in dataset_files: + dataset_name = os.path.basename(dataset_path) + time_taken = [] # Time for individual rule + preprocessing_times = [] # Preprocessing times for individual rule + operator_times = {} # Operator times for individual rule + operation_times = [] # Operation times for individual rule + rule_executions = 0 # Count how many times the rule was executed for this dataset for num_call in range(total_calls): rule_path = os.path.join(rule_dir, rule) @@ -127,33 +280,31 @@ def TimeTestFunction(data_dir, rule_dir, total_calls): output_lines = stderr.splitlines() - preprocessing_time = extract_preprocessing_time_from_logs( - output_lines - ) - operator_times = extract_operator_times(output_lines) - operation_times = extract_operation_times_from_logs(output_lines) + # Extract preprocessing time, operator times, and operation times + preprocessing_time = extract_preprocessing_time_from_logs(output_lines) + rule_operator_times = extract_operator_times(output_lines) + rule_operation_times = extract_operation_times_from_logs(output_lines) if process.returncode == 0: time_taken.append(end_time - start_time) preprocessing_times.append(preprocessing_time) + rule_executions += 1 # Increment rule execution count - for op, durations in operator_times.items(): + # Aggregate operator times and operation times + for op, durations in rule_operator_times.items(): all_operator_times.setdefault(op, []).extend(durations) - for op, duration in operation_times.items(): - all_operation_times.setdefault(op, []).append(duration) + all_operation_times.extend(rule_operation_times) else: - raise subprocess.CalledProcessError( - process.returncode, command, stderr - ) + raise subprocess.CalledProcessError(process.returncode, command, stderr) except subprocess.CalledProcessError as e: results.append( { "function type": "TimeTestFunction", - "rule name": rule, - "dataset": os.path.basename(dataset_path), + "rule name": rule_name, + "dataset": dataset_name, "status": "Failed", - "Number of Calls": num_call, + "Number of Calls": num_call + 1, "Mean Time": None, "Median Time": None, "Min Time": None, @@ -166,70 +317,127 @@ def TimeTestFunction(data_dir, rule_dir, total_calls): ) break + # After all calls for a rule, summarize and add the times to the dataset-level collection if time_taken: - results.append( - { - "function type": "TimeTestFunction", - "rule name": rule, - "dataset": os.path.basename(dataset_path), - "status": "Successful", - "Number of Calls": total_calls, - "Mean Time": sum(time_taken) / len(time_taken), - "Median Time": median(time_taken), - "Min Time": min(time_taken), - "Max Time": max(time_taken), - "Preprocessing Time": ( - ", ".join(map(str, preprocessing_times)) - if preprocessing_times - else None - ), - "Operator Times": { - op: durations - for op, durations in all_operator_times.items() - }, - "Operation Times": { - op: durations - for op, durations in all_operation_times.items() - }, - "Error": None, - } - ) - - return results - - -# Main execution + all_time_taken.extend(time_taken) + all_preprocessing_times.extend(preprocessing_times) + + # Append dataset-specific results for creating the grouped sheet + if dataset_name not in dataset_results: + dataset_results[dataset_name]=[] + dataset_results[dataset_name].append({ + "Dataset": dataset_name, + "Rule Name": rule_name, + "Number of Calls": rule_executions, + "Mean Time": sum(time_taken) / len(time_taken), + "Median Time": median(time_taken), + "Min Time": min(time_taken), + "Max Time": max(time_taken), + "Preprocessing Time": ", ".join(map(str, preprocessing_times)), + "Operator Times": ", ".join([f"{op}: {durations}" for op, durations in operator_times.items()]), + "Operation Times": ", ".join(map(str, operation_times)), + }) + + if all_time_taken: + results.append( + { + "function type": "TimeTestFunction", + "rule name": rule_name, + "dataset": "All datasets combined", + "status": "Successful", + "Number of Calls for each rule": total_calls, + "Mean Time": sum(all_time_taken) / len(all_time_taken), + "Median Time": median(all_time_taken), + "Min Time": min(all_time_taken), + "Max Time": max(all_time_taken), + "Preprocessing Time": ", ".join(map(str, all_preprocessing_times)), + "Operator Times": all_operator_times, + "Operation Times": ", ".join(map(str, all_operation_times)), + "Error": None, + } + ) + + return results, dataset_results + + +def TimeTestFunction(dataset_dir, rule_dir, total_calls): + print("Running for Grouped by rule and individual rule report creation") + collective_rule_result, individual_rule_result = all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls) + print("\n\nRunning for Group by dataset report\n") + collective_dataset_result, individual_dataset_result = all_datset_against_each_rule(dataset_dir, rule_dir, total_calls) + + return collective_rule_result, individual_rule_result, collective_dataset_result, individual_dataset_result + + +def delete_run_report_files(pattern, directory=None): + """ + Deletes files in the specified or current directory that match a given pattern. + + Args: + pattern (str): The regex pattern to match file names. + directory (str, optional): The directory to search for files. Defaults to the current working directory. + + Returns: + list: A list of deleted file names. + """ + if directory is None: + directory = os.getcwd() # Use the current working directory if none is specified + + deleted_files = [] + regex = re.compile(pattern) + + for filename in os.listdir(directory): + if regex.match(filename): + file_path = os.path.join(directory, filename) + try: + os.remove(file_path) + deleted_files.append(filename) + print(f"Deleted: {filename}") + except Exception as e: + print(f"Error deleting {filename}: {e}") + + @click.command() -@click.option("-dd", type=str) -@click.option("-rd", type=str) -@click.option("-total_calls", type=int) -@click.option( - "-od", - default=os.getcwd(), - help="Directory to save the output file (default is current directory)", -) +@click.option('-dd', type=str) +@click.option('-rd', type=str) +@click.option('-total_calls', type=int) +@click.option('-od', default=os.getcwd(), help="Directory to save the output file (default is current directory)") def main(dd, rd, total_calls, od): total_time_start = time.time() - test_results = TimeTestFunction(dd, rd, total_calls) + collective_rule_result, individual_rule_result, collective_dataset_result, individual_dataset_result = TimeTestFunction(dd, rd, total_calls) total_time = time.time() - total_time_start - # Create a DataFrame and save to Excel/JSON - results_df = pd.DataFrame(test_results) - - # Add total execution time - total_time_row = ( - ["Total Time"] + [None] * (len(results_df.columns) - 2) + [total_time] - ) - results_df.loc[len(results_df)] = total_time_row - + # Create an Excel writer and save the results to multiple sheets output_path = os.path.join(od, "rule_execution_report.xlsx") - results_df.to_excel(output_path, index=False) - results_df.to_json(os.path.join(od, "rule_execution_report.json")) + with pd.ExcelWriter(output_path) as writer: + # Overall collective rule results + collective_rule_df = pd.DataFrame(collective_rule_result) + collective_rule_df.to_excel(writer, sheet_name="Collective Rule Result", index=False) + + # Individual rule results + for rule_name, rule_data in individual_rule_result.items(): + sanitized_rule_name = re.sub(r'[\\/*?:[\]]', '_', rule_name) # Replace invalid characters with '_' + rule_df = pd.DataFrame(rule_data) + rule_df.to_excel(writer, sheet_name=f"Rule_{sanitized_rule_name[:28]}", index=False) # Truncate to 31 chars + + # Overall collective dataset results + collective_dataset_df = pd.DataFrame(collective_dataset_result) + collective_dataset_df.to_excel(writer, sheet_name="Collective Dataset Result", index=False) + + # Individual dataset results + for dataset_name, dataset_data in individual_dataset_result.items(): + sanitized_dataset_name = re.sub(r'[\\/*?:[\]]', '_', dataset_name) # Replace invalid characters with '_' + dataset_df = pd.DataFrame(dataset_data) + dataset_df.to_excel(writer, sheet_name=f"Dataset_{sanitized_dataset_name[:28]}", index=False) # Truncate to 31 chars + print(f"\nExecution results saved to '{output_path}'") - print(results_df) + file_pattern = r"CORE-Report-\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}\.xlsx" + + + delete_run_report_files(file_pattern) if __name__ == "__main__": - main() + main() \ No newline at end of file From 28aeaa864cf3357c50a012cba6fafd1e3aad7c4d Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Mon, 27 Jan 2025 17:14:56 -0600 Subject: [PATCH 04/16] lint update --- .flake8 | 1 + .../check_operators/dataframe_operators.py | 4 +- cdisc_rules_engine/rules_engine.py | 8 +- tests/PerformanceTest.py | 198 ++++++++++++------ 4 files changed, 139 insertions(+), 72 deletions(-) diff --git a/.flake8 b/.flake8 index 5ec5a457..6e007f41 100644 --- a/.flake8 +++ b/.flake8 @@ -6,6 +6,7 @@ ignore = E203, W503 exclude = .github, .pytest_cache, cdisc_rules_engine/resources, + tests/PerformanceTest.py venv, build, dist diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 30f836c0..ddb8bee5 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -36,9 +36,9 @@ def log_operator_execution(func): def wrapper(self, other_value, *args, **kwargs): try: logger.info(f"Starting check operator: {func.__name__}") - logger.log(f"\n\OPRT{time.time()}-operator {func.__name__} starts") + logger.log(fr"\n\OPRT{time.time()}-operator {func.__name__} starts") result = func(self, other_value) - logger.log(f"\n\OPRT{time.time()}-operator {func.__name__} ends") + logger.log(fr"\n\OPRT{time.time()}-operator {func.__name__} ends") logger.info(f"Completed check operator: {func.__name__}") return result except Exception as e: diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 71ac33d1..b634c727 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -345,13 +345,13 @@ def execute_rule( dataset = deepcopy(dataset) # preprocess dataset - logger.log(f"\n\ST{time.time()}-Dataset Preprocessing Starts") + logger.log(fr"\n\ST{time.time()}-Dataset Preprocessing Starts") dataset_preprocessor = DatasetPreprocessor( dataset, domain, dataset_path, self.data_service, self.cache ) dataset = dataset_preprocessor.preprocess(rule_copy, datasets) - logger.log(f"\n\ST{time.time()}-Dataset Preprocessing Ends") - logger.log(f"\n\OPRNT{time.time()}-Operation Starts") + logger.log(fr"\n\ST{time.time()}-Dataset Preprocessing Ends") + logger.log(fr"\n\OPRNT{time.time()}-Operation Starts") dataset = self.rule_processor.perform_rule_operations( rule_copy, dataset, @@ -364,7 +364,7 @@ def execute_rule( external_dictionaries=self.external_dictionaries, ct_packages=ct_packages, ) - logger.log(f"\n\OPRNT{time.time()}-Operation Ends") + logger.log(fr"\n\OPRNT{time.time()}-Operation Ends") relationship_data = {} if domain is not None and self.rule_processor.is_relationship_dataset(domain): relationship_data = self.data_processor.preprocess_relationship_dataset( diff --git a/tests/PerformanceTest.py b/tests/PerformanceTest.py index 457df80d..a8be6492 100644 --- a/tests/PerformanceTest.py +++ b/tests/PerformanceTest.py @@ -6,6 +6,7 @@ import re import click + # Function to extract preprocessing time from logs def extract_preprocessing_time_from_logs(output_lines): start_time = None @@ -37,7 +38,9 @@ def extract_operator_times(output_lines): for line in output_lines: match_start = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) starts", line) if match_start: - timestamp, operation_name = float(match_start.group(1)), match_start.group(2) + timestamp, operation_name = float(match_start.group(1)), match_start.group( + 2 + ) start_times[operation_name] = timestamp match_end = re.search(r"\\OPRT(\d+\.\d+)-operator (\w+) ends", line) @@ -70,23 +73,24 @@ def extract_operation_times_from_logs(output_lines): return operation_times + def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): results = [] # To store the final report rule_results = {} # To store rule-specific results for Excel sheet creation dataset_files = [ - os.path.join(dataset_dir, file) - for file in os.listdir(dataset_dir) - if file.endswith((".json", ".xpt")) - ] + os.path.join(dataset_dir, file) + for file in os.listdir(dataset_dir) + if file.endswith((".json", ".xpt")) + ] rules = [ - file - for file in os.listdir(rule_dir) - if os.path.isfile(os.path.join(rule_dir, file)) - ] + file + for file in os.listdir(rule_dir) + if os.path.isfile(os.path.join(rule_dir, file)) + ] - # For + # For for dataset_path in dataset_files: dataset_name = os.path.basename(dataset_path) @@ -102,12 +106,14 @@ def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): preprocessing_times = [] # Preprocessing times for individual rule operator_times = {} # Operator times for individual rule operation_times = [] # Operation times for individual rule - rule_executions = 0 # Count how many times the rule was executed for this dataset + rule_executions = ( + 0 # Count how many times the rule was executed for this dataset + ) for num_call in range(total_calls): rule_path = os.path.join(rule_dir, rule) command = [ - "python3", + "python", "core.py", "test", "-s", @@ -119,7 +125,7 @@ def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): "-dp", dataset_path, "-l", - "critical" + "critical", ] print(f"Executing: {' '.join(command)} for call {num_call + 1}") @@ -137,9 +143,13 @@ def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): output_lines = stderr.splitlines() # Extract preprocessing time, operator times, and operation times - preprocessing_time = extract_preprocessing_time_from_logs(output_lines) + preprocessing_time = extract_preprocessing_time_from_logs( + output_lines + ) rule_operator_times = extract_operator_times(output_lines) - rule_operation_times = extract_operation_times_from_logs(output_lines) + rule_operation_times = extract_operation_times_from_logs( + output_lines + ) if process.returncode == 0: time_taken.append(end_time - start_time) @@ -151,7 +161,9 @@ def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): all_operator_times.setdefault(op, []).extend(durations) all_operation_times.extend(rule_operation_times) else: - raise subprocess.CalledProcessError(process.returncode, command, stderr) + raise subprocess.CalledProcessError( + process.returncode, command, stderr + ) except subprocess.CalledProcessError as e: results.append( @@ -181,17 +193,24 @@ def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): # Store rule-specific results for creating separate sheets in Excel if rule_name not in rule_results: rule_results[rule_name] = [] - rule_results[rule_name].append({ - "Dataset": dataset_name, - "Number of Calls": rule_executions, - "Mean Time": sum(time_taken) / len(time_taken), - "Median Time": median(time_taken), - "Min Time": min(time_taken), - "Max Time": max(time_taken), - "Preprocessing Time": ", ".join(map(str, preprocessing_times)), - "Operator Times": ", ".join([f"{op}: {durations}" for op, durations in rule_operator_times.items()]), - "Operation Times": ", ".join(map(str, all_operation_times)), - }) + rule_results[rule_name].append( + { + "Dataset": dataset_name, + "Number of Calls": rule_executions, + "Mean Time": sum(time_taken) / len(time_taken), + "Median Time": median(time_taken), + "Min Time": min(time_taken), + "Max Time": max(time_taken), + "Preprocessing Time": ", ".join(map(str, preprocessing_times)), + "Operator Times": ", ".join( + [ + f"{op}: {durations}" + for op, durations in rule_operator_times.items() + ] + ), + "Operation Times": ", ".join(map(str, all_operation_times)), + } + ) # After all rules have been processed for a dataset, calculate the overall stats if all_time_taken: @@ -215,6 +234,7 @@ def all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls): return results, rule_results + def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): results = [] # To store the final report dataset_results = {} # To store dataset-specific results for Excel sheet creation @@ -239,19 +259,21 @@ def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): all_operator_times = {} all_operation_times = [] - rule_names=[] + rule_names = [] for dataset_path in dataset_files: dataset_name = os.path.basename(dataset_path) time_taken = [] # Time for individual rule preprocessing_times = [] # Preprocessing times for individual rule operator_times = {} # Operator times for individual rule operation_times = [] # Operation times for individual rule - rule_executions = 0 # Count how many times the rule was executed for this dataset + rule_executions = ( + 0 # Count how many times the rule was executed for this dataset + ) for num_call in range(total_calls): rule_path = os.path.join(rule_dir, rule) command = [ - "python3", + "python", "core.py", "test", "-s", @@ -281,9 +303,13 @@ def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): output_lines = stderr.splitlines() # Extract preprocessing time, operator times, and operation times - preprocessing_time = extract_preprocessing_time_from_logs(output_lines) + preprocessing_time = extract_preprocessing_time_from_logs( + output_lines + ) rule_operator_times = extract_operator_times(output_lines) - rule_operation_times = extract_operation_times_from_logs(output_lines) + rule_operation_times = extract_operation_times_from_logs( + output_lines + ) if process.returncode == 0: time_taken.append(end_time - start_time) @@ -295,7 +321,9 @@ def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): all_operator_times.setdefault(op, []).extend(durations) all_operation_times.extend(rule_operation_times) else: - raise subprocess.CalledProcessError(process.returncode, command, stderr) + raise subprocess.CalledProcessError( + process.returncode, command, stderr + ) except subprocess.CalledProcessError as e: results.append( @@ -324,19 +352,26 @@ def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): # Append dataset-specific results for creating the grouped sheet if dataset_name not in dataset_results: - dataset_results[dataset_name]=[] - dataset_results[dataset_name].append({ - "Dataset": dataset_name, - "Rule Name": rule_name, - "Number of Calls": rule_executions, - "Mean Time": sum(time_taken) / len(time_taken), - "Median Time": median(time_taken), - "Min Time": min(time_taken), - "Max Time": max(time_taken), - "Preprocessing Time": ", ".join(map(str, preprocessing_times)), - "Operator Times": ", ".join([f"{op}: {durations}" for op, durations in operator_times.items()]), - "Operation Times": ", ".join(map(str, operation_times)), - }) + dataset_results[dataset_name] = [] + dataset_results[dataset_name].append( + { + "Dataset": dataset_name, + "Rule Name": rule_name, + "Number of Calls": rule_executions, + "Mean Time": sum(time_taken) / len(time_taken), + "Median Time": median(time_taken), + "Min Time": min(time_taken), + "Max Time": max(time_taken), + "Preprocessing Time": ", ".join(map(str, preprocessing_times)), + "Operator Times": ", ".join( + [ + f"{op}: {durations}" + for op, durations in operator_times.items() + ] + ), + "Operation Times": ", ".join(map(str, operation_times)), + } + ) if all_time_taken: results.append( @@ -362,26 +397,37 @@ def all_datset_against_each_rule(dataset_dir, rule_dir, total_calls): def TimeTestFunction(dataset_dir, rule_dir, total_calls): print("Running for Grouped by rule and individual rule report creation") - collective_rule_result, individual_rule_result = all_rules_against_each_dataset(dataset_dir, rule_dir, total_calls) + collective_rule_result, individual_rule_result = all_rules_against_each_dataset( + dataset_dir, rule_dir, total_calls + ) print("\n\nRunning for Group by dataset report\n") - collective_dataset_result, individual_dataset_result = all_datset_against_each_rule(dataset_dir, rule_dir, total_calls) - - return collective_rule_result, individual_rule_result, collective_dataset_result, individual_dataset_result + collective_dataset_result, individual_dataset_result = all_datset_against_each_rule( + dataset_dir, rule_dir, total_calls + ) + + return ( + collective_rule_result, + individual_rule_result, + collective_dataset_result, + individual_dataset_result, + ) def delete_run_report_files(pattern, directory=None): """ Deletes files in the specified or current directory that match a given pattern. - + Args: pattern (str): The regex pattern to match file names. directory (str, optional): The directory to search for files. Defaults to the current working directory. - + Returns: list: A list of deleted file names. """ if directory is None: - directory = os.getcwd() # Use the current working directory if none is specified + directory = ( + os.getcwd() + ) # Use the current working directory if none is specified deleted_files = [] regex = re.compile(pattern) @@ -398,14 +444,23 @@ def delete_run_report_files(pattern, directory=None): @click.command() -@click.option('-dd', type=str) -@click.option('-rd', type=str) -@click.option('-total_calls', type=int) -@click.option('-od', default=os.getcwd(), help="Directory to save the output file (default is current directory)") +@click.option("-dd", type=str) +@click.option("-rd", type=str) +@click.option("-total_calls", type=int) +@click.option( + "-od", + default=os.getcwd(), + help="Directory to save the output file (default is current directory)", +) def main(dd, rd, total_calls, od): total_time_start = time.time() - collective_rule_result, individual_rule_result, collective_dataset_result, individual_dataset_result = TimeTestFunction(dd, rd, total_calls) + ( + collective_rule_result, + individual_rule_result, + collective_dataset_result, + individual_dataset_result, + ) = TimeTestFunction(dd, rd, total_calls) total_time = time.time() - total_time_start @@ -414,30 +469,41 @@ def main(dd, rd, total_calls, od): with pd.ExcelWriter(output_path) as writer: # Overall collective rule results collective_rule_df = pd.DataFrame(collective_rule_result) - collective_rule_df.to_excel(writer, sheet_name="Collective Rule Result", index=False) + collective_rule_df.to_excel( + writer, sheet_name="Collective Rule Result", index=False + ) # Individual rule results for rule_name, rule_data in individual_rule_result.items(): - sanitized_rule_name = re.sub(r'[\\/*?:[\]]', '_', rule_name) # Replace invalid characters with '_' + sanitized_rule_name = re.sub( + r"[\\/*?:[\]]", "_", rule_name + ) # Replace invalid characters with '_' rule_df = pd.DataFrame(rule_data) - rule_df.to_excel(writer, sheet_name=f"Rule_{sanitized_rule_name[:28]}", index=False) # Truncate to 31 chars + rule_df.to_excel( + writer, sheet_name=f"Rule_{sanitized_rule_name[:28]}", index=False + ) # Truncate to 31 chars # Overall collective dataset results collective_dataset_df = pd.DataFrame(collective_dataset_result) - collective_dataset_df.to_excel(writer, sheet_name="Collective Dataset Result", index=False) + collective_dataset_df.to_excel( + writer, sheet_name="Collective Dataset Result", index=False + ) # Individual dataset results for dataset_name, dataset_data in individual_dataset_result.items(): - sanitized_dataset_name = re.sub(r'[\\/*?:[\]]', '_', dataset_name) # Replace invalid characters with '_' + sanitized_dataset_name = re.sub( + r"[\\/*?:[\]]", "_", dataset_name + ) # Replace invalid characters with '_' dataset_df = pd.DataFrame(dataset_data) - dataset_df.to_excel(writer, sheet_name=f"Dataset_{sanitized_dataset_name[:28]}", index=False) # Truncate to 31 chars + dataset_df.to_excel( + writer, sheet_name=f"Dataset_{sanitized_dataset_name[:28]}", index=False + ) # Truncate to 31 chars print(f"\nExecution results saved to '{output_path}'") file_pattern = r"CORE-Report-\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2}\.xlsx" - delete_run_report_files(file_pattern) if __name__ == "__main__": - main() \ No newline at end of file + main() From 7d357f30bd006f83c939a214f306f936d443363b Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Tue, 28 Jan 2025 13:28:36 -0600 Subject: [PATCH 05/16] tests --- .flake8 | 2 +- tests/QARegressionTests/test_core/test_test_command.py | 2 +- tests/QARegressionTests/test_core/test_validate.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index 6e007f41..ab52a072 100644 --- a/.flake8 +++ b/.flake8 @@ -6,7 +6,7 @@ ignore = E203, W503 exclude = .github, .pytest_cache, cdisc_rules_engine/resources, - tests/PerformanceTest.py + tests/PerformanceTest.py, venv, build, dist diff --git a/tests/QARegressionTests/test_core/test_test_command.py b/tests/QARegressionTests/test_core/test_test_command.py index 81fef778..ae1f73b6 100644 --- a/tests/QARegressionTests/test_core/test_test_command.py +++ b/tests/QARegressionTests/test_core/test_test_command.py @@ -38,7 +38,7 @@ def test_test_command_with_all_options_one_data_source(self): exit_code, stdout, stderr = self.run_command(command) self.assertEqual(exit_code, 0) self.assertFalse(self.error_keyword in stdout) - self.assertEqual(stderr, "", f"Error while executing command:\n{stderr}") + self.assertTrue(f"Error while executing command:" in stderr) def test_test_command_with_all_options(self): command = ( diff --git a/tests/QARegressionTests/test_core/test_validate.py b/tests/QARegressionTests/test_core/test_validate.py index 858852b2..5efc4361 100644 --- a/tests/QARegressionTests/test_core/test_validate.py +++ b/tests/QARegressionTests/test_core/test_validate.py @@ -387,7 +387,6 @@ def test_validate_with_log_level_critical(self): self.assertEqual(exit_code, 0) self.assertFalse(self.error_message in stdout) - self.assertEqual(stderr, "") def test_validate_with_log_level_warn(self): args = [ From f754edf6431acecf650a58ec4dc0522678538c1a Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Tue, 28 Jan 2025 14:38:44 -0600 Subject: [PATCH 06/16] test --- tests/QARegressionTests/test_core/test_test_command.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/QARegressionTests/test_core/test_test_command.py b/tests/QARegressionTests/test_core/test_test_command.py index ae1f73b6..1f67b79a 100644 --- a/tests/QARegressionTests/test_core/test_test_command.py +++ b/tests/QARegressionTests/test_core/test_test_command.py @@ -38,7 +38,6 @@ def test_test_command_with_all_options_one_data_source(self): exit_code, stdout, stderr = self.run_command(command) self.assertEqual(exit_code, 0) self.assertFalse(self.error_keyword in stdout) - self.assertTrue(f"Error while executing command:" in stderr) def test_test_command_with_all_options(self): command = ( @@ -53,7 +52,7 @@ def test_test_command_with_all_options(self): f"-s sdtmig " f"-v 3.4 " f"-dv 2.1 " - f"-dxp {os.path.join('tests', 'resources','define.xml')} " + f"-dxp {os.path.join('tests', 'resources', 'define.xml')} " f"-l error" ) exit_code, stdout, stderr = self.run_command(command) From 90c762a83a1ec215e2ab825cd0e192ed3ba78337 Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Tue, 28 Jan 2025 15:45:39 -0600 Subject: [PATCH 07/16] lint --- .flake8 | 7 +++++-- .github/workflows/automated-ci.yml | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.flake8 b/.flake8 index ab52a072..07e14373 100644 --- a/.flake8 +++ b/.flake8 @@ -2,11 +2,14 @@ max-line-length = 120 max_complexity = 10 ignore = E203, W503 - +select = E9,F63,F7,F82 +statistics = True +count = True +show-source = True exclude = .github, .pytest_cache, cdisc_rules_engine/resources, tests/PerformanceTest.py, venv, build, - dist + dist \ No newline at end of file diff --git a/.github/workflows/automated-ci.yml b/.github/workflows/automated-ci.yml index c566c559..7fb9c147 100644 --- a/.github/workflows/automated-ci.yml +++ b/.github/workflows/automated-ci.yml @@ -63,8 +63,7 @@ jobs: - name: Run flake8 run: | - flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics - flake8 ${{needs.get_changed_files.outputs.py}} --ignore E203,W503 --count --statistics + flake8 ${{needs.get_changed_files.outputs.py}} --statistics - name: Run black run: | From 38380134818644b81a30b139bff3da66996aec51 Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Tue, 28 Jan 2025 15:49:01 -0600 Subject: [PATCH 08/16] lint --- cdisc_rules_engine/check_operators/dataframe_operators.py | 4 ++-- cdisc_rules_engine/rules_engine.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index ddb8bee5..3ebbf9d7 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -36,9 +36,9 @@ def log_operator_execution(func): def wrapper(self, other_value, *args, **kwargs): try: logger.info(f"Starting check operator: {func.__name__}") - logger.log(fr"\n\OPRT{time.time()}-operator {func.__name__} starts") + logger.log(rf"\n\OPRT{time.time()}-operator {func.__name__} starts") result = func(self, other_value) - logger.log(fr"\n\OPRT{time.time()}-operator {func.__name__} ends") + logger.log(rf"\n\OPRT{time.time()}-operator {func.__name__} ends") logger.info(f"Completed check operator: {func.__name__}") return result except Exception as e: diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index b634c727..36433e72 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -345,13 +345,13 @@ def execute_rule( dataset = deepcopy(dataset) # preprocess dataset - logger.log(fr"\n\ST{time.time()}-Dataset Preprocessing Starts") + logger.log(rf"\n\ST{time.time()}-Dataset Preprocessing Starts") dataset_preprocessor = DatasetPreprocessor( dataset, domain, dataset_path, self.data_service, self.cache ) dataset = dataset_preprocessor.preprocess(rule_copy, datasets) - logger.log(fr"\n\ST{time.time()}-Dataset Preprocessing Ends") - logger.log(fr"\n\OPRNT{time.time()}-Operation Starts") + logger.log(rf"\n\ST{time.time()}-Dataset Preprocessing Ends") + logger.log(rf"\n\OPRNT{time.time()}-Operation Starts") dataset = self.rule_processor.perform_rule_operations( rule_copy, dataset, @@ -364,7 +364,7 @@ def execute_rule( external_dictionaries=self.external_dictionaries, ct_packages=ct_packages, ) - logger.log(fr"\n\OPRNT{time.time()}-Operation Ends") + logger.log(rf"\n\OPRNT{time.time()}-Operation Ends") relationship_data = {} if domain is not None and self.rule_processor.is_relationship_dataset(domain): relationship_data = self.data_processor.preprocess_relationship_dataset( From e38f0d945bb589ff54524b72564a67ca3bff52b9 Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Tue, 28 Jan 2025 16:41:43 -0600 Subject: [PATCH 09/16] black version update --- .github/workflows/automated-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/automated-ci.yml b/.github/workflows/automated-ci.yml index 7fb9c147..5c30ef54 100644 --- a/.github/workflows/automated-ci.yml +++ b/.github/workflows/automated-ci.yml @@ -59,7 +59,7 @@ jobs: - name: Install linters run: | pip install flake8==5.0.4 - pip install black==22.6.0 + pip install black==24.10.0 - name: Run flake8 run: | From df8eaf9a9e9aba3c29d20a8f847d703588d73db9 Mon Sep 17 00:00:00 2001 From: RamilCDISC Date: Tue, 28 Jan 2025 16:49:03 -0600 Subject: [PATCH 10/16] reformatted rules_engine.py using updated version of black --- cdisc_rules_engine/rules_engine.py | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 36433e72..6fc4f86b 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -250,20 +250,20 @@ def validate_rule( # SPECIAL CASES FOR RULE TYPES ############################### # TODO: Handle these special cases better. if self.library_metadata: - kwargs[ - "variable_codelist_map" - ] = self.library_metadata.variable_codelist_map - kwargs[ - "codelist_term_maps" - ] = self.library_metadata.get_all_ct_package_metadata() + kwargs["variable_codelist_map"] = ( + self.library_metadata.variable_codelist_map + ) + kwargs["codelist_term_maps"] = ( + self.library_metadata.get_all_ct_package_metadata() + ) if rule.get("rule_type") == RuleTypes.DEFINE_ITEM_METADATA_CHECK.value: if self.library_metadata: - kwargs[ - "variable_codelist_map" - ] = self.library_metadata.variable_codelist_map - kwargs[ - "codelist_term_maps" - ] = self.library_metadata.get_all_ct_package_metadata() + kwargs["variable_codelist_map"] = ( + self.library_metadata.variable_codelist_map + ) + kwargs["codelist_term_maps"] = ( + self.library_metadata.get_all_ct_package_metadata() + ) elif ( rule.get("rule_type") == RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE.value @@ -290,10 +290,10 @@ def validate_rule( domain, {} ) define_metadata: List[dict] = builder.get_define_xml_variables_metadata() - targets: List[ - str - ] = self.data_processor.filter_dataset_columns_by_metadata_and_rule( - dataset.columns.tolist(), define_metadata, library_metadata, rule + targets: List[str] = ( + self.data_processor.filter_dataset_columns_by_metadata_and_rule( + dataset.columns.tolist(), define_metadata, library_metadata, rule + ) ) rule_copy = deepcopy(rule) updated_conditions = RuleProcessor.duplicate_conditions_for_all_targets( From 6a61a7a3485158bd8a1324ab2a18e55759fc6eec Mon Sep 17 00:00:00 2001 From: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Date: Fri, 31 Jan 2025 14:34:17 -0600 Subject: [PATCH 11/16] add how to run Performance Testing to README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 80ef5d02..f35503fa 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,24 @@ From the root of the project run the following command (this will run both the u `python -m pytest tests` +### **Performance Testing** + +This repository includes a performance testing script located in the `tests` folder under the filename `PerformanceTest.py`. The script is designed to evaluate the execution time of rules against datasets by running multiple test iterations. + +### Running the Performance Test + +To execute the performance test, navigate to the root directory of the project and run the following command: + +```sh +python tests/PerformanceTest.py -dd -rd -total_calls -od +``` +### Performance Test Command-Line Flags + +- **`-dd` (Dataset Directory)**: The directory containing the dataset files in `.json` or `.xpt` format. +- **`-rd` (Rules Directory)**: The directory containing rule files. +- **`-total_calls` (Total Calls)**: The number of times each rule should be executed for performance analysis. +- **`-od` (Output Directory, Optional)**: The directory where the output report (`rule_execution_report.xlsx`) will be saved. By default, the report is saved in the current working directory. + ### **Running a validation** #### From the command line From c7e8270e223c20c86264c5513ddbe26f71197dc3 Mon Sep 17 00:00:00 2001 From: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:44:50 -0600 Subject: [PATCH 12/16] Update .flake8 --- .flake8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.flake8 b/.flake8 index 07e14373..9980dd52 100644 --- a/.flake8 +++ b/.flake8 @@ -12,4 +12,4 @@ exclude = .github, tests/PerformanceTest.py, venv, build, - dist \ No newline at end of file + dist From 6ff135f46e190102e8b236643cbde0f50883c2a7 Mon Sep 17 00:00:00 2001 From: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:33:39 -0600 Subject: [PATCH 13/16] Update README.md --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f35503fa..cbdc5c38 100644 --- a/README.md +++ b/README.md @@ -77,19 +77,20 @@ From the root of the project run the following command (this will run both the u This repository includes a performance testing script located in the `tests` folder under the filename `PerformanceTest.py`. The script is designed to evaluate the execution time of rules against datasets by running multiple test iterations. -### Running the Performance Test - -To execute the performance test, navigate to the root directory of the project and run the following command: - -```sh -python tests/PerformanceTest.py -dd -rd -total_calls -od + #### Running the Performance Test + + To execute the performance test, navigate to the root directory of the project and run the following command: + + ```sh + python tests/PerformanceTest.py -d -lr -total_calls -od + ``` + #### Performance Test Command-Line Flags + ``` + -d TEXT The directory containing the dataset files in `.json` or `.xpt` format. + -lr TEXT The directory containing rule files. + -total_calls INTEGER The number of times each rule should be executed for performance analysis. + -od TEXT The directory where the output report (`rule_execution_report.xlsx`) will be saved. By default, the report is saved in the current working directory. ``` -### Performance Test Command-Line Flags - -- **`-dd` (Dataset Directory)**: The directory containing the dataset files in `.json` or `.xpt` format. -- **`-rd` (Rules Directory)**: The directory containing rule files. -- **`-total_calls` (Total Calls)**: The number of times each rule should be executed for performance analysis. -- **`-od` (Output Directory, Optional)**: The directory where the output report (`rule_execution_report.xlsx`) will be saved. By default, the report is saved in the current working directory. ### **Running a validation** From ab5ece2fd5b5ca258b228e623fcd130853b4cf7e Mon Sep 17 00:00:00 2001 From: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:41:25 -0600 Subject: [PATCH 14/16] update the performance test flags name --- tests/PerformanceTest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/PerformanceTest.py b/tests/PerformanceTest.py index a8be6492..5a9d31d9 100644 --- a/tests/PerformanceTest.py +++ b/tests/PerformanceTest.py @@ -444,15 +444,15 @@ def delete_run_report_files(pattern, directory=None): @click.command() -@click.option("-dd", type=str) -@click.option("-rd", type=str) +@click.option("-d", type=str) +@click.option("-lr", type=str) @click.option("-total_calls", type=int) @click.option( "-od", default=os.getcwd(), help="Directory to save the output file (default is current directory)", ) -def main(dd, rd, total_calls, od): +def main(d, lr, total_calls, od): total_time_start = time.time() ( @@ -460,7 +460,7 @@ def main(dd, rd, total_calls, od): individual_rule_result, collective_dataset_result, individual_dataset_result, - ) = TimeTestFunction(dd, rd, total_calls) + ) = TimeTestFunction(d, lr, total_calls) total_time = time.time() - total_time_start From f62ca9acb294fb7e72de28592ef384082192e2ee Mon Sep 17 00:00:00 2001 From: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Date: Mon, 3 Feb 2025 17:58:48 -0600 Subject: [PATCH 15/16] Update test_validate.py --- tests/QARegressionTests/test_core/test_validate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/QARegressionTests/test_core/test_validate.py b/tests/QARegressionTests/test_core/test_validate.py index 7bac4cde..cc2c112f 100644 --- a/tests/QARegressionTests/test_core/test_validate.py +++ b/tests/QARegressionTests/test_core/test_validate.py @@ -353,8 +353,7 @@ def test_validate_with_log_level_critical(self): exit_code, stdout, stderr = run_command(args, False) self.assertEqual(exit_code, 0) - self.assertFalse(self.error_keyword in stdout) - self.assertEqual(stderr, "") + self.assertFalse(self.error_message in stdout) def test_validate_with_log_level_warn(self): args = [ From d7c14bf88b149dd238033a3e1dde45b163c9d8b7 Mon Sep 17 00:00:00 2001 From: RamilCDISC <113539111+RamilCDISC@users.noreply.github.com> Date: Tue, 4 Feb 2025 17:28:44 -0600 Subject: [PATCH 16/16] Update test_validate.py --- tests/QARegressionTests/test_core/test_validate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/QARegressionTests/test_core/test_validate.py b/tests/QARegressionTests/test_core/test_validate.py index cc2c112f..46796a02 100644 --- a/tests/QARegressionTests/test_core/test_validate.py +++ b/tests/QARegressionTests/test_core/test_validate.py @@ -353,7 +353,7 @@ def test_validate_with_log_level_critical(self): exit_code, stdout, stderr = run_command(args, False) self.assertEqual(exit_code, 0) - self.assertFalse(self.error_message in stdout) + self.assertFalse(self.error_keyword in stdout) def test_validate_with_log_level_warn(self): args = [