Semantic profiler and report generation module integration

Added the modules for generating the report based on the syntactic and semantic features present in the code. Signed-off-by: Pankaj Thorat <thorat.pankaj9@gmail.com>
IBM · Nov 25, 2024 · d87f217 · d87f217
1 parent 995bfc6
commit d87f217
Show file tree

Hide file tree

Showing 46 changed files with 69,457 additions and 1,767 deletions.
diff --git a/transforms/code/code_profiler/README.md b/transforms/code/code_profiler/README.md
@@ -61,3 +61,5 @@ The high-level system design is as follows:
 For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.
 
 In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.
+
+
diff --git a/transforms/code/code_profiler/input/data_profiler_params.json b/transforms/code/code_profiler/input/data_profiler_params.json
@@ -1,5 +1,6 @@
 {
     "input": "multi-package.parquet",
-    "contents": "Contents",
-    "language": "Language"
+    "dynamic_schema_mapping": "True",
+    "contents": "contents",
+    "language": "language"
 }
diff --git a/transforms/code/code_profiler/input/multi-package.parquet b/transforms/code/code_profiler/input/multi-package.parquet
diff --git a/transforms/code/code_profiler/input/output_part1.parquet b/transforms/code/code_profiler/input/output_part1.parquet
diff --git a/transforms/code/code_profiler/input/output_part2.parquet b/transforms/code/code_profiler/input/output_part2.parquet
diff --git a/transforms/code/code_profiler/python/Makefile b/transforms/code/code_profiler/python/Makefile
@@ -35,7 +35,7 @@ setup:: .transforms.setup
 set-versions:
 	$(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_PYTHON_VERSION) TOML_VERSION=$(CODE_PROFILER_PYTHON_VERSION) .transforms.set-versions 
 
-build-dist:: .defaults.build-dist 
+build-dist:: .defaults.build-dist
 
 publish-dist:: .defaults.publish-dist
 
@@ -51,5 +51,5 @@ run-local-sample: .transforms.run-local-sample
 
 run-local-python-sample:
 	$(MAKE) RUN_FILE=code_profiler_local_python.py \
-	RUN_ARGS="--content 'Contents' --language 'Language'" \
+	RUN_ARGS="--content 'contents' --language 'language'" \
 	.transforms.run-local-python-sample
diff --git a/transforms/code/code_profiler/python/src/UAST_parser.py b/transforms/code/code_profiler/python/src/UAST_parser.py
@@ -17,11 +17,15 @@
 import sys
 sys.setrecursionlimit(10000)
 
+import sys
+sys.setrecursionlimit(10000)
+
 """
 Initialize the parser with a path for rules and grammar.
 """
 class UASTParser():
     def __init__(self):
+        self.processed_nodes = set()
         self.language : str = None
         self.uast : UAST = None
         self.rules : dict = None
@@ -228,8 +232,14 @@ def _add_user_defined(self, node):
         return
 
     # Traversing through the AST to create nodes recursively.
-    def _dfs(self, AST_node, parent) :
-        if (AST_node.type in self.rules) :
+    def _dfs(self, AST_node, parent):
+        # Use a unique identifier to check if the node is already processed
+        node_identifier = (AST_node.start_point, AST_node.end_point, AST_node.type)
+        if node_identifier in self.processed_nodes:
+            return  # Skip processing if already done
+        self.processed_nodes.add(node_identifier)
+
+        if (AST_node.type in self.rules):
             ast_snippet = AST_node.text.decode("utf8")
             node_type = self.rules[AST_node.type]["uast_node_type"]
             exec_string = self.rules[AST_node.type]["extractor"]
@@ -258,6 +268,11 @@ def _dfs(self, AST_node, parent) :
                 self._dfs(AST_node= child, parent = parent)
             except RecursionError as e:
                 print(f"RecursionError caught: {str(e)}")
+
+            try:
+                self._dfs(AST_node= child, parent = parent)
+            except RecursionError as e:
+                print(f"RecursionError caught: {str(e)}")
 
     def _extract(self, ast_snippet, node_type, exec_string):
         code_snippet = ast_snippet
@@ -269,3 +284,31 @@ def _extract(self, ast_snippet, node_type, exec_string):
             return self.grammar[node_type]["keyword"] + " " + self.extracted        
         except Exception as e:
             print(e)
+
+def uast_read(jsonstring):
+    """
+    Build a UAST class object from its JSON string form.
+
+    Returns the populated UAST, or None when the input is None or the
+    literal string 'null'.
+    """
+    uast = UAST()
+    # Nothing to load for a missing or JSON-null payload.
+    if jsonstring is None or jsonstring == 'null':
+        return None
+    uast.load_from_json_string(jsonstring)
+    return uast
+
+def extract_ccr(uast):
+    """
+    Calculates the code to comment ratio given an UAST object as input.
+
+    The ratio is the 'uast_root' node's "loc_snippet" metadata value divided
+    by the sum of the "loc_original_code" metadata values of all
+    'uast_comment' nodes.
+
+    Returns None when uast is None, when no comment lines are counted, or
+    when the UAST has no 'uast_root' node to supply the snippet length.
+    """
+    if uast is None:
+        return None
+
+    total_comment_loc = 0
+    # Stays None if no root node is seen. Previously the name was left
+    # unbound in that case and the division raised UnboundLocalError.
+    loc_snippet = None
+    for node_idx in uast.nodes:
+        node = uast.get_node(node_idx)
+        if node.node_type == 'uast_comment':
+            total_comment_loc += node.metadata.get("loc_original_code", 0)
+        elif node.node_type == 'uast_root':
+            loc_snippet = node.metadata.get("loc_snippet", 0)
+
+    if loc_snippet is None or total_comment_loc <= 0:
+        return None
+    return loc_snippet / total_comment_loc
diff --git a/transforms/code/code_profiler/python/src/code_profiler_local_python.py b/transforms/code/code_profiler/python/src/code_profiler_local_python.py
@@ -24,8 +24,8 @@
 local_conf = {
     "input_folder": input_folder,
     "output_folder": output_folder,
-    "contents": "Contents",
-    "language": "Language"
+    "contents": "contents",
+    "language": "language"
 }
 params = {
     # Data access. Only required parameters are specified

diff --git a/transforms/code/code_profiler/python/src/code_profiler_transform.py b/transforms/code/code_profiler/python/src/code_profiler_transform.py
@@ -40,10 +40,8 @@
 
 short_name = "CodeProfiler"
 cli_prefix = f"{short_name}_"
-language_key = "language"
-contents_key = "contents"
-language_cli_param = f"{cli_prefix}{language_key}"
-contents_cli_param = f"{cli_prefix}{contents_key}"
+language = "language"
+contents = "contents"
 
 class CodeProfilerTransform(AbstractTableTransform):
     """
@@ -57,8 +55,11 @@ def __init__(self, config: dict[str, Any]):
 
         super().__init__(config)
 
-        self.contents = self.config.get("contents")
-        self.language = self.config.get("language")
+        self.contents = self.config.get("contents", "contents")
+        self.language = self.config.get("language", "language")        
+
+        if not isinstance(self.contents, str):
+           raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")
 
         def ensure_tree_sitter_bindings():
             # Get the directory where the script is located
@@ -148,23 +149,46 @@ def ensure_tree_sitter_bindings():
         self.ikb_file = config.get("ikb_file", "semantic-ruleset/ikb_model.csv")
         self.null_libs_file = config.get("null_libs_file", "semantic-ruleset/null_libs.csv")
 
+        src_file_dir = os.path.abspath(os.path.dirname(__file__))
+        # Check if the file exists; if not, update the default path
+        if not os.path.exists(self.ikb_file):
+            print(f"File not found at {self.ikb_file}. Updating to '../semantic-ruleset/ikb_model.csv'")
+            self.ikb_file = os.path.join(src_file_dir, "semantic-ruleset/ikb_model.csv")
+        # Raise an error if the file still doesn't exist
+        if not os.path.exists(self.ikb_file):
+            raise FileNotFoundError(f"File not found: {self.ikb_file}")
+
+        # Check if the file exists; if not, update the default path
+        if not os.path.exists(self.null_libs_file):
+            print(f"File not found at {self.null_libs_file}. Updating to '../semantic-ruleset/null_libs.csv'")
+            self.null_libs_file = os.path.join(src_file_dir, "semantic-ruleset/null_libs.csv")
+        # Raise an error if the file still doesn't exist
+        if not os.path.exists(self.null_libs_file):
+            raise FileNotFoundError(f"File not found: {self.null_libs_file}")
+
         # Higher order semantic features
-        self.metrics_list = config.get("metrics_list", ["CCR"])
+        self.metrics_list = config.get("metrics_list", ["CCR", "code_snippet_len", "avg_fn_len_in_snippet"])
 
     def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
         """
         Extracts the syntactic constructs
         """
-        print("tranforming the the input dataframe")
+        print("Transforming the the input dataframe")
 
         ts_parser = TSParser()
         uast_parser = UASTParser()
 
         def get_uast_json(code, lang):
-            if lang in self.language_map:
-                ts_parser.set_language(self.language_map[lang])
-                uast_parser.set_language(self.uast_language_map[lang])
-                ast = ts_parser.parse(bytes(code, encoding= "utf8"))
+            # Create case-insensitive mappings
+            language_map_lower = {key.lower(): value for key, value in self.language_map.items()}
+            uast_language_map_lower = {key.lower(): value for key, value in self.uast_language_map.items()}
+
+            # Check for the lowercase version of `lang`
+            lang_lower = lang.lower()
+            if lang_lower in language_map_lower:
+                ts_parser.set_language(language_map_lower[lang_lower])
+                uast_parser.set_language(uast_language_map_lower[lang_lower])
+                ast = ts_parser.parse(bytes(code, encoding="utf8"))
                 uast = uast_parser.parse(ast, code)
                 return uast.get_json()
             return None
@@ -175,8 +199,12 @@ def extract_packages_from_uast(uast_json):
 
             try:
                 uast_data = json.loads(uast_json)
-                nodes = uast_data.get("nodes", {})
-
+                if uast_data is not None:
+                    nodes = uast_data.get("nodes", {})
+                else:
+                    nodes = {}
+                    print("Warning: uast_data is None. Check the data source or initialization process.")  
+                    return              
                 # Iterate through nodes to find nodes with type 'uast_package'
                 for node_id, node_data in nodes.items():
                     if node_data.get("node_type") == "uast_package":
@@ -189,13 +217,14 @@ def extract_packages_from_uast(uast_json):
 
             return ",".join(package_list)  # Return as a comma-separated string
 
-        def get_uast_parquet():
+        def get_uast_parquet(tmp_table):
             # df = pd.read_parquet(f'{db_path}/{filename}', 'pyarrow')
             # df = df.reindex(columns=all_columns)
 
             # Extract language and content arrays from the table using PyArrow
-            lang_array = table.column(self.language)
-            content_array = table.column(self.contents)
+            print(self.language)
+            lang_array = tmp_table.column(self.language)
+            content_array = tmp_table.column(self.contents)
             # Ensure both arrays have the same length
             assert len(lang_array) == len(content_array)
 
@@ -208,68 +237,74 @@ def get_uast_parquet():
             uast_column = pa.array(uasts)
             package_list_column = pa.array(package_lists)
 
-            table_with_uast = table.append_column('UAST', uast_column)
+            tmp_table_with_uast = tmp_table.append_column('UAST', uast_column)
             # Add the uast_package column
-            table_with_package_list = table_with_uast.append_column('UAST_Package_List', package_list_column)
+            table_with_package_list = tmp_table_with_uast.append_column('UAST_Package_List', package_list_column)
             return table_with_package_list
 
-        # Custom cleanup function
-        def safe_rmtree(path):
-            if os.path.exists(path):
-                shutil.rmtree(path)
-
-        table_with_uast = get_uast_parquet()
-        # report statistics
-        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        table_with_uast = get_uast_parquet(table)
+
+        try:
+            # Use an OS command to remove the folder and its contents
+            subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
+            print(f"Successfully deleted: {self.bindings_dir}")
+        except subprocess.CalledProcessError as e:
+            print(f"Error deleting {self.bindings_dir}: {e}")
 
         ## Semantic profiling
-        table = table_with_uast
-        self.logger.debug(f"Semantic profiling of one table with {len(table)} rows")
+        self.logger.debug(f"Semantic profiling of one table with {len(table_with_uast)} rows")
 
         # Load Knowledge Base
+        print(self.ikb_file)
+        print(self.null_libs_file)
         ikb = knowledge_base(self.ikb_file, self.null_libs_file)
         ikb.load_ikb_trie()
 
         # Extract concept from IKB
-        libraries = table.column('UAST_Package_List').to_pylist()
-        language = table.column('Language').to_pylist()
+        libraries = table_with_uast.column('UAST_Package_List').to_pylist()
+        language = table_with_uast.column('language').to_pylist()
         concepts = [concept_extractor(lib, lang, ikb) for lib, lang in zip(libraries, language)]
 
         # Append concepts column to table and record unknown libraries
         new_col = pa.array(concepts)
-        table = table.append_column('Concepts', new_col)
+        table_with_uast = table_with_uast.append_column('Concepts', new_col)
         ikb.write_null_files()
 
         # Higher order syntactic profiler
-        self.logger.debug(f"Transforming one table with {len(table)} rows")
+        self.logger.debug(f"Transforming one table with {len(table_with_uast)} rows")
 
         if self.metrics_list is not None:
-            for metric in self.metrics_list:
-                if metric == "CCR":
-                    self.logger.info(f"Generating {metric} values")
-                    uasts = [uast_read(uast_json) for uast_json in table['UAST'].to_pylist()]
-                    ccrs = [extract_ccr(uast) for uast in uasts]
-                    new_table = table.append_column(metric, pa.array(ccrs))
-
-        self.logger.debug(f"Transformed one table with {len(new_table)} rows")
-        metadata = {"nfiles": 1, "nrows": len(new_table)}
-
+            uasts = [uast_read(uast_json) for uast_json in table_with_uast['UAST'].to_pylist()]
+            ccrs = []
+            code_snippet_len = []    
+            avg_fn_len_in_snippet = []                                       
+
+            for uast in uasts:
+                if "CCR" in self.metrics_list:
+                    ccrs.append(extract_ccr(uast))
+                if "code_snippet_len" in self.metrics_list:
+                    code_snippet_len.append(extract_code_snippet_length(uast))
+                if "avg_fn_len_in_snippet" in self.metrics_list:
+                    avg_fn_len_in_snippet.append(extract_code_avg_fn_len_in_snippet(uast))                    
+
+            if "CCR" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("CCR", pa.array(ccrs))
+            if "code_snippet_len" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("code_snippet_len", pa.array(code_snippet_len))
+            if "avg_fn_len_in_snippet" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("avg_fn_len_in_snippet", pa.array(avg_fn_len_in_snippet))
+
+        self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
+        metadata = {"nfiles": 1, "nrows": len(table_with_uast)}
         # Report generation
-        if 'UAST' in new_table.schema.names and 'Concepts' in new_table.schema.names:
-            generate_report(new_table,self.metrics_list)
+        if 'UAST' in table_with_uast.schema.names and 'Concepts' in table_with_uast.schema.names:
+            generate_report(table_with_uast,self.metrics_list)
 
         # Add some sample metadata.
-        self.logger.debug(f"Transformed one table with {len(table)} rows")
-        stats["nrows"] =  len(table)
-
-        try:
-            # Use an OS command to remove the folder and its contents
-            subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
-            print(f"Successfully deleted: {self.bindings_dir}")
-        except subprocess.CalledProcessError as e:
-            print(f"Error deleting {self.bindings_dir}: {e}")
-
-        return [table], stats
+        self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
+                # report statistics
+        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        return [table_with_uast], stats
 
 class CodeProfilerTransformConfiguration(TransformConfiguration):
     def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfilerTransform):
@@ -279,15 +314,15 @@ def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfiler
             )
     def add_input_params(self, parser: ArgumentParser) -> None:
         parser.add_argument(
-            f"--{language_cli_param}",
+            f"--{language}",
             type=str,
-            default="Language",
+            default="language",
             help="Column name that denotes the programming language",
         )
         parser.add_argument(
-            f"--{contents_cli_param}",
+            f"--{contents}",
             type=str,
-            default="Contents",
+            default="contents",
             help="Column name that contains code snippets",
         )
Original file line number	Diff line number	Diff line change
Expand Up		@@ -61,3 +61,5 @@ The high-level system design is as follows:
		For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.

		In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.