Skip to content

Commit

Permalink
Semantic profiler and report generation module integration
Browse files Browse the repository at this point in the history
Added the modules for generating the report based on the syntactic and semantic features present in the code

Signed-off-by: Pankaj Thorat <thorat.pankaj9@gmail.com>
  • Loading branch information
pankajskku committed Nov 27, 2024
1 parent 995bfc6 commit 6d58ce1
Show file tree
Hide file tree
Showing 48 changed files with 70,087 additions and 2,753 deletions.
2 changes: 2 additions & 0 deletions transforms/code/code_profiler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,5 @@ The high-level system design is as follows:
For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.

In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.


5 changes: 3 additions & 2 deletions transforms/code/code_profiler/input/data_profiler_params.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"input": "multi-package.parquet",
"contents": "Contents",
"language": "Language"
"dynamic_schema_mapping": "True",
"contents": "contents",
"language": "language"
}
Binary file modified transforms/code/code_profiler/input/multi-package.parquet
Binary file not shown.
Binary file not shown.
Binary file not shown.
1,542 changes: 555 additions & 987 deletions transforms/code/code_profiler/notebook_example/code-profiler.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions transforms/code/code_profiler/python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ setup:: .transforms.setup
set-versions:
$(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_PYTHON_VERSION) TOML_VERSION=$(CODE_PROFILER_PYTHON_VERSION) .transforms.set-versions

build-dist:: .defaults.build-dist
build-dist:: .defaults.build-dist

publish-dist:: .defaults.publish-dist

Expand All @@ -51,5 +51,5 @@ run-local-sample: .transforms.run-local-sample

run-local-python-sample:
$(MAKE) RUN_FILE=code_profiler_local_python.py \
RUN_ARGS="--content 'Contents' --language 'Language'" \
RUN_ARGS="--content 'contents' --language 'language'" \
.transforms.run-local-python-sample
33 changes: 31 additions & 2 deletions transforms/code/code_profiler/python/src/UAST_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,9 @@ def _add_user_defined(self, node):
return

# Traversing through the AST to create nodes recursively.
def _dfs(self, AST_node, parent) :
if (AST_node.type in self.rules) :
def _dfs(self, AST_node, parent):

if (AST_node.type in self.rules):
ast_snippet = AST_node.text.decode("utf8")
node_type = self.rules[AST_node.type]["uast_node_type"]
exec_string = self.rules[AST_node.type]["extractor"]
Expand Down Expand Up @@ -269,3 +270,31 @@ def _extract(self, ast_snippet, node_type, exec_string):
return self.grammar[node_type]["keyword"] + " " + self.extracted
except Exception as e:
print(e)

def uast_read(jsonstring):
    """
    Builds a UAST object from its JSON string representation.

    Returns the populated UAST, or None when the input is None or the
    literal string 'null'.
    """
    uast = UAST()
    # Nothing to load for a missing value or a serialized JSON null.
    if jsonstring is None or jsonstring == 'null':
        return None
    uast.load_from_json_string(jsonstring)
    return uast

def extract_ccr(uast):
    """
    Calculates the code-to-comment ratio of a UAST object.

    The ratio is the "loc_snippet" value stored in the metadata of the
    'uast_root' node divided by the sum of "loc_original_code" over all
    'uast_comment' nodes.

    Returns the ratio as a number, or None when:
      * uast is None,
      * no comment lines were counted (the ratio would divide by zero), or
      * the UAST contains no 'uast_root' node (snippet length unknown).
    """
    if uast is None:
        return None

    total_comment_loc = 0
    # Stays None when no root node is seen; previously the name was left
    # unbound in that case and the division below raised UnboundLocalError.
    loc_snippet = None
    for node_idx in uast.nodes:
        node = uast.get_node(node_idx)
        if node.node_type == 'uast_comment':
            total_comment_loc += node.metadata.get("loc_original_code", 0)
        elif node.node_type == 'uast_root':
            loc_snippet = node.metadata.get("loc_snippet", 0)

    if loc_snippet is None or total_comment_loc <= 0:
        return None
    return loc_snippet / total_comment_loc
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
"contents": "Contents",
"language": "Language"
"contents": "contents",
"language": "language"
}
params = {
# Data access. Only required parameters are specified
Expand Down
155 changes: 95 additions & 60 deletions transforms/code/code_profiler/python/src/code_profiler_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,8 @@

short_name = "CodeProfiler"
cli_prefix = f"{short_name}_"
language_key = "language"
contents_key = "contents"
language_cli_param = f"{cli_prefix}{language_key}"
contents_cli_param = f"{cli_prefix}{contents_key}"
language = "language"
contents = "contents"

class CodeProfilerTransform(AbstractTableTransform):
"""
Expand All @@ -57,8 +55,11 @@ def __init__(self, config: dict[str, Any]):

super().__init__(config)

self.contents = self.config.get("contents")
self.language = self.config.get("language")
self.contents = self.config.get("contents", "contents")
self.language = self.config.get("language", "language")

if not isinstance(self.contents, str):
raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")

def ensure_tree_sitter_bindings():
# Get the directory where the script is located
Expand Down Expand Up @@ -148,23 +149,46 @@ def ensure_tree_sitter_bindings():
self.ikb_file = config.get("ikb_file", "semantic-ruleset/ikb_model.csv")
self.null_libs_file = config.get("null_libs_file", "semantic-ruleset/null_libs.csv")

src_file_dir = os.path.abspath(os.path.dirname(__file__))
# Check if the file exists; if not, update the default path
if not os.path.exists(self.ikb_file):
print(f"File not found at {self.ikb_file}. Updating to '../semantic-ruleset/ikb_model.csv'")
self.ikb_file = os.path.join(src_file_dir, "semantic-ruleset/ikb_model.csv")
# Raise an error if the file still doesn't exist
if not os.path.exists(self.ikb_file):
raise FileNotFoundError(f"File not found: {self.ikb_file}")

# Check if the file exists; if not, update the default path
if not os.path.exists(self.null_libs_file):
print(f"File not found at {self.null_libs_file}. Updating to '../semantic-ruleset/null_libs.csv'")
self.null_libs_file = os.path.join(src_file_dir, "semantic-ruleset/null_libs.csv")
# Raise an error if the file still doesn't exist
if not os.path.exists(self.null_libs_file):
raise FileNotFoundError(f"File not found: {self.null_libs_file}")

# Higher order semantic features
self.metrics_list = config.get("metrics_list", ["CCR"])
self.metrics_list = config.get("metrics_list", ["CCR", "code_snippet_len", "avg_fn_len_in_snippet"])

def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
"""
Extracts the syntactic constructs
"""
print("tranforming the the input dataframe")
print("Transforming the the input dataframe")

ts_parser = TSParser()
uast_parser = UASTParser()

def get_uast_json(code, lang):
if lang in self.language_map:
ts_parser.set_language(self.language_map[lang])
uast_parser.set_language(self.uast_language_map[lang])
ast = ts_parser.parse(bytes(code, encoding= "utf8"))
# Create case-insensitive mappings
language_map_lower = {key.lower(): value for key, value in self.language_map.items()}
uast_language_map_lower = {key.lower(): value for key, value in self.uast_language_map.items()}

# Check for the lowercase version of `lang`
lang_lower = lang.lower()
if lang_lower in language_map_lower:
ts_parser.set_language(language_map_lower[lang_lower])
uast_parser.set_language(uast_language_map_lower[lang_lower])
ast = ts_parser.parse(bytes(code, encoding="utf8"))
uast = uast_parser.parse(ast, code)
return uast.get_json()
return None
Expand All @@ -175,8 +199,12 @@ def extract_packages_from_uast(uast_json):

try:
uast_data = json.loads(uast_json)
nodes = uast_data.get("nodes", {})

if uast_data is not None:
nodes = uast_data.get("nodes", {})
else:
nodes = {}
print("Warning: uast_data is None. Check the data source or initialization process.")
return
# Iterate through nodes to find nodes with type 'uast_package'
for node_id, node_data in nodes.items():
if node_data.get("node_type") == "uast_package":
Expand All @@ -189,13 +217,14 @@ def extract_packages_from_uast(uast_json):

return ",".join(package_list) # Return as a comma-separated string

def get_uast_parquet():
def get_uast_parquet(tmp_table):
# df = pd.read_parquet(f'{db_path}/{filename}', 'pyarrow')
# df = df.reindex(columns=all_columns)

# Extract language and content arrays from the table using PyArrow
lang_array = table.column(self.language)
content_array = table.column(self.contents)
print(self.language)
lang_array = tmp_table.column(self.language)
content_array = tmp_table.column(self.contents)
# Ensure both arrays have the same length
assert len(lang_array) == len(content_array)

Expand All @@ -208,68 +237,74 @@ def get_uast_parquet():
uast_column = pa.array(uasts)
package_list_column = pa.array(package_lists)

table_with_uast = table.append_column('UAST', uast_column)
tmp_table_with_uast = tmp_table.append_column('UAST', uast_column)
# Add the uast_package column
table_with_package_list = table_with_uast.append_column('UAST_Package_List', package_list_column)
table_with_package_list = tmp_table_with_uast.append_column('UAST_Package_List', package_list_column)
return table_with_package_list

# Custom cleanup function
def safe_rmtree(path):
if os.path.exists(path):
shutil.rmtree(path)

table_with_uast = get_uast_parquet()
# report statistics
stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
table_with_uast = get_uast_parquet(table)

try:
# Use an OS command to remove the folder and its contents
subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
print(f"Successfully deleted: {self.bindings_dir}")
except subprocess.CalledProcessError as e:
print(f"Error deleting {self.bindings_dir}: {e}")

## Semantic profiling
table = table_with_uast
self.logger.debug(f"Semantic profiling of one table with {len(table)} rows")
self.logger.debug(f"Semantic profiling of one table with {len(table_with_uast)} rows")

# Load Knowledge Base
print(self.ikb_file)
print(self.null_libs_file)
ikb = knowledge_base(self.ikb_file, self.null_libs_file)
ikb.load_ikb_trie()

# Extract concept from IKB
libraries = table.column('UAST_Package_List').to_pylist()
language = table.column('Language').to_pylist()
libraries = table_with_uast.column('UAST_Package_List').to_pylist()
language = table_with_uast.column('language').to_pylist()
concepts = [concept_extractor(lib, lang, ikb) for lib, lang in zip(libraries, language)]

# Append concepts column to table and record unknown libraries
new_col = pa.array(concepts)
table = table.append_column('Concepts', new_col)
table_with_uast = table_with_uast.append_column('Concepts', new_col)
ikb.write_null_files()

# Higher order syntactic profiler
self.logger.debug(f"Transforming one table with {len(table)} rows")
self.logger.debug(f"Transforming one table with {len(table_with_uast)} rows")

if self.metrics_list is not None:
for metric in self.metrics_list:
if metric == "CCR":
self.logger.info(f"Generating {metric} values")
uasts = [uast_read(uast_json) for uast_json in table['UAST'].to_pylist()]
ccrs = [extract_ccr(uast) for uast in uasts]
new_table = table.append_column(metric, pa.array(ccrs))

self.logger.debug(f"Transformed one table with {len(new_table)} rows")
metadata = {"nfiles": 1, "nrows": len(new_table)}

uasts = [uast_read(uast_json) for uast_json in table_with_uast['UAST'].to_pylist()]
ccrs = []
code_snippet_len = []
avg_fn_len_in_snippet = []

for uast in uasts:
if "CCR" in self.metrics_list:
ccrs.append(extract_ccr(uast))
if "code_snippet_len" in self.metrics_list:
code_snippet_len.append(extract_code_snippet_length(uast))
if "avg_fn_len_in_snippet" in self.metrics_list:
avg_fn_len_in_snippet.append(extract_code_avg_fn_len_in_snippet(uast))

if "CCR" in self.metrics_list:
table_with_uast = table_with_uast.append_column("CCR", pa.array(ccrs))
if "code_snippet_len" in self.metrics_list:
table_with_uast = table_with_uast.append_column("code_snippet_len", pa.array(code_snippet_len))
if "avg_fn_len_in_snippet" in self.metrics_list:
table_with_uast = table_with_uast.append_column("avg_fn_len_in_snippet", pa.array(avg_fn_len_in_snippet))

self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
metadata = {"nfiles": 1, "nrows": len(table_with_uast)}
# Report generation
if 'UAST' in new_table.schema.names and 'Concepts' in new_table.schema.names:
generate_report(new_table,self.metrics_list)
if 'UAST' in table_with_uast.schema.names and 'Concepts' in table_with_uast.schema.names:
generate_report(table_with_uast,self.metrics_list)

# Add some sample metadata.
self.logger.debug(f"Transformed one table with {len(table)} rows")
stats["nrows"] = len(table)

try:
# Use an OS command to remove the folder and its contents
subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
print(f"Successfully deleted: {self.bindings_dir}")
except subprocess.CalledProcessError as e:
print(f"Error deleting {self.bindings_dir}: {e}")

return [table], stats
self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
# report statistics
stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
return [table_with_uast], stats

class CodeProfilerTransformConfiguration(TransformConfiguration):
def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfilerTransform):
Expand All @@ -279,15 +314,15 @@ def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfiler
)
def add_input_params(self, parser: ArgumentParser) -> None:
parser.add_argument(
f"--{language_cli_param}",
f"--{language}",
type=str,
default="Language",
default="language",
help="Column name that denotes the programming language",
)
parser.add_argument(
f"--{contents_cli_param}",
f"--{contents}",
type=str,
default="Contents",
default="contents",
help="Column name that contains code snippets",
)

Expand Down
51 changes: 47 additions & 4 deletions transforms/code/code_profiler/python/src/higher_order_concepts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from UAST import *

def extract_ccr(uast):
Expand All @@ -12,8 +24,39 @@ def extract_ccr(uast):
total_comment_loc += node.metadata.get("loc_original_code", 0)
elif node.node_type == 'uast_root':
loc_snippet = node.metadata.get("loc_snippet", 0)
if total_comment_loc > 0:
return loc_snippet / total_comment_loc
if loc_snippet > 0:
if total_comment_loc > 0:
return str(round(float(loc_snippet) / float(total_comment_loc), 1))
else:
return str(0)
else:
return None
return None
return str(-1)
return str(0)

def extract_code_snippet_length(uast):
    """
    Reports the snippet length recorded on the 'uast_root' node of a UAST.

    The value is read from the root node's "loc_snippet" metadata entry and
    returned as a string. "0" is returned when the recorded length is not
    positive or when the UAST has no 'uast_root' node. None is returned when
    uast itself is None.
    """
    if uast is None:
        return None

    # Default to 0 so a UAST without a root node yields "0"; previously the
    # name was left unbound and the comparison below raised UnboundLocalError.
    loc_snippet = 0
    for node_idx in uast.nodes:
        node = uast.get_node(node_idx)
        if node.node_type == 'uast_root':
            loc_snippet = node.metadata.get("loc_snippet", 0)

    if loc_snippet > 0:
        return str(loc_snippet)
    return str(0)


def extract_code_avg_fn_len_in_snippet(uast):
    """
    Computes the mean length of the 'uast_function' nodes in a UAST.

    Each function node contributes its "loc_original_code" metadata value
    (0 when the key is absent). The mean is rounded to one decimal place and
    returned as a string; "0" is returned when there are no function nodes.
    None is returned when uast is None.
    """
    if uast is None:
        return None

    # Gather the recorded length of every function node, in node order.
    fn_lengths = [
        fn_node.metadata.get("loc_original_code", 0)
        for fn_node in map(uast.get_node, uast.nodes)
        if fn_node.node_type == 'uast_function'
    ]
    if not fn_lengths:
        return str(0)
    return str(round(float(sum(fn_lengths)) / float(len(fn_lengths)), 1))

Loading

0 comments on commit 6d58ce1

Please sign in to comment.