diff --git a/HISTORY.md b/HISTORY.md index 60214c5..b84a5c6 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -70,10 +70,18 @@ Release data: Dec 12, 2022 Version 0.0.6 ============= -Release data: Jan 9, 2022 +Release data: Jan 9, 2023 * Add tree sitter utils (in codetext.parser) * Replace all `match_from_span` to `get_node_text` * Replace all `traverse_type` to `get_node_by_kind` * Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` * Update return metadata from all parser + +Version 0.0.7 +============= +Release data: Jul 5, 2023 + +* Update all class extractor format (using dict instead of list) +* Fix missing identifier, parameter in C, C#, Java parser +* Implement CLI diff --git a/README.md b/README.md index a35f98d..edfd1cf 100755 --- a/README.md +++ b/README.md @@ -1,79 +1,152 @@

- logo + logo

- -**CodeText-parser** ______________________________________________________________________ -| Branch | Build | Unittest | Linting | Release | License | -|-------- |------- |---------- |--------- |--------- |--------- | -| main | | [![Unittest](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml/badge.svg)](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml) | | [![release](https://img.shields.io/pypi/v/codetext)](https://pypi.org/project/codetext/) [![pyversion](https://img.shields.io/pypi/pyversions/codetext)](https://pypi.org/project/codetext/)| [![license](https://img.shields.io/github/license/AI4Code-Research/CodeText-parser)](https://github.com/AI4Code-Research/CodeText-parser/blob/main/LICENSES.txt) | +| Branch | Build | Unittest | Release | License | +|-------- |------- |---------- |--------- |--------- | +| main | | [![Unittest](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml/badge.svg)](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml) | [![release](https://img.shields.io/pypi/v/codetext)](https://pypi.org/project/codetext/) [![pyversion](https://img.shields.io/pypi/pyversions/codetext)](https://pypi.org/project/codetext/)| [![license](https://img.shields.io/github/license/AI4Code-Research/CodeText-parser)](https://github.com/AI4Code-Research/CodeText-parser/blob/main/LICENSES.txt) |
______________________________________________________________________ -**Code-Text data toolkit** contains multilingual programming language parsers for the extract from raw source code into multiple levels of pair data (code-text) (e.g., function-level, class-level, inline-level). +**Code-Text parser** is a custom [tree-sitter](https://github.com/tree-sitter) grammar parser that extracts classes and functions from raw source code. We support 10 common programming languages: +- Python +- Java +- JavaScript +- PHP +- Ruby +- Rust +- C +- C++ +- C# +- Go # Installation -Setup environment and install dependencies and setup by using `install_env.sh` -```bash -bash -i ./install_env.sh -``` -then activate conda environment named "code-text-env" +The **codetext** package requires Python 3.7 or above and tree-sitter. Set up the environment and install the dependencies manually from source: ```bash -conda activate code-text-env +git clone https://github.com/FSoft-AI4Code/CodeText-parser.git; cd CodeText-parser +pip install -r requirements.txt +pip install -e . ``` -*Setup for using parser* +Or install the package from PyPI: ```bash pip install codetext ``` # Getting started -## Build your language -Auto build tree-sitter into `.so` located in `/tree-sitter/` +## `codetext` CLI Usage +```bash +codetext [options] [PATH or FILE] ... +``` + +For example, to analyze the Python files in the `src/` folder: +```bash +codetext src/ --language Python +``` + +If you want to store the extracted classes and functions, use the `--json` flag and give a path to the destination file: +```bash +codetext src/ --language Python --output_file ./python_report.json --json +``` + +**Options** + +```bash +positional arguments: + paths list of the filename/paths. + +optional arguments: + -h, --help show this help message and exit + --version show program's version number and exit + -l LANGUAGE, --language LANGUAGE + Target the programming languages you want to analyze. 
+ -o OUTPUT_FILE, --output_file OUTPUT_FILE + Output file (e.g report.json). + --json Generate json output as a transform of the default + output + --verbose Print progress bar + +``` + +**Example** +``` +File circle_linkedlist.py analyzed: +================================================== +Number of class : 1 +Number of function : 2 +-------------------------------------------------- + +Class summary: ++-----+---------+-------------+ +| # | Class | Arguments | ++=====+=========+=============+ +| 0 | Node | | ++-----+---------+-------------+ + +Class analyse: Node ++-----+---------------+-------------+--------+---------------+ +| # | Method name | Paramters | Type | Return type | ++=====+===============+=============+========+===============+ +| 0 | __init__ | self | | | +| | | data | | | ++-----+---------------+-------------+--------+---------------+ + +Function analyse: ++-----+-----------------+-------------+--------+---------------+ +| # | Function name | Paramters | Type | Return type | ++=====+=================+=============+========+===============+ +| 0 | push | head_ref | | Node | +| | | data | Any | Node | +| 1 | countNodes | head | Node | | ++-----+-----------------+-------------+--------+---------------+ +``` + +## Using `codetext` as Python module +### Build your language +`codetext` needs a tree-sitter language file (i.e. a `.so` file) to work properly. You can compile the language manually ([see more](https://github.com/tree-sitter/py-tree-sitter#usage)) or build it automatically with our pre-defined function (the `.so` file will be saved in a folder named `/tree-sitter/`): ```python from codetext.utils import build_language language = 'rust' build_language(language) - # INFO:utils:Not found tree-sitter-rust, attempt clone from github # Cloning into 'tree-sitter-rust'... # remote: Enumerating objects: 2835, done. ... 
# INFO:utils:Attempt to build Tree-sitter Language for rust and store in .../tree-sitter/rust.so ``` -## Language Parser -We supported 10 programming languages, namely `Python`, `Java`, `JavaScript`, `Golang`, `Ruby`, `PHP`, `C#`, `C++`, `C` and `Rust`. +### Using Language Parser +Each supported programming language corresponds to a custom `language_parser` (e.g. Python uses [`PythonParser()`](src/codetext/parser/python_parser.py#L11)). A `language_parser` takes raw source code as input and uses breadth-first search to traverse all syntax nodes. The classes, methods and stand-alone functions are then collected: -Setup ```python from codetext.utils import parse_code raw_code = """ -/** -* Sum of 2 number -* @param a int number -* @param b int number -*/ -double sum2num(int a, int b) { - return a + b; -} + /** + * Sum of 2 number + * @param a int number + * @param b int number + */ + double sum2num(int a, int b) { + return a + b; + } """ +# Auto parse code into tree-sitter.Tree root = parse_code(raw_code, 'cpp') root_node = root.root_node ``` -Get all function nodes inside a specific node, use: +Get all function nodes inside a specific node: ```python from codetext.utils.parser import CppParser @@ -105,3 +178,9 @@ class_list = CppParser.get_class_list(root_node) # and metadata = CppParser.get_metadata_list(root_node) ``` + +# Limitations +`codetext` heavily depends on tree-sitter syntax: +- Since we use the tree-sitter grammar to extract the desired nodes (functions, classes, function names (identifiers), class argument lists, etc.), `codetext` is vulnerable to tree-sitter updates or future syntax changes. + +- While we try our best to capture all possibilities, there are still plenty of cases we do not cover. We welcome community contributions to this project. 
\ No newline at end of file diff --git a/asset/img/codetext_logo.png b/asset/img/codetext_logo.png new file mode 100644 index 0000000..d38fb00 Binary files /dev/null and b/asset/img/codetext_logo.png differ diff --git a/asset/img/codetext_logo_line.png b/asset/img/codetext_logo_line.png new file mode 100644 index 0000000..0a3358c Binary files /dev/null and b/asset/img/codetext_logo_line.png differ diff --git a/pyproject.toml b/pyproject.toml index 60a54b8..8f39ea2 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codetext" -version = "0.0.5" +version = "0.0.7" authors = [ { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, ] @@ -21,8 +21,12 @@ dependencies = [ "Levenshtein>=0.20", "langdetect>=1.0.0", "bs4>=0.0.1", + "tabulate>=0.9.0" ] [project.urls] "Homepage" = "https://github.com/AI4Code-Research/CodeText-data" "Bug Tracker" = "https://github.com/AI4Code-Research/CodeText-data/issues" + +[project.scripts] +codetext = "codetext.__main__:main" diff --git a/requirements.txt b/requirements.txt index cc35b84..d438040 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ # for preprocessing tree-sitter -# docstring-parser +tabulate Levenshtein langdetect bs4 diff --git a/src/codetext/__init__.py b/src/codetext/__init__.py old mode 100755 new mode 100644 diff --git a/src/codetext/__main__.py b/src/codetext/__main__.py new file mode 100644 index 0000000..d4d0a54 --- /dev/null +++ b/src/codetext/__main__.py @@ -0,0 +1,93 @@ +import os +import sys +import argparse +import pkg_resources + +import json +from .codetext_cli import parse_file, print_result, PL_MATCHING + + +def get_args(): + parser = argparse.ArgumentParser(description=f"codetext parser {20*'='}") + + parser.add_argument('paths', nargs='*', default=['.'], + help='list of the filename/paths.') + parser.add_argument("--version", action="version", + version=pkg_resources.get_distribution("codetext").version) + 
parser.add_argument("-l", "--language", + help='''Target the programming languages you want to + analyze.''') + parser.add_argument("-o", "--output_file", + help='''Output file (e.g report.json). + ''', + type=str) + parser.add_argument("--json", + help='''Generate json output as a transform of the + default output''', + action="store_true") + parser.add_argument("--verbose", + help='''Print progress bar''', + action="store_true") + + return parser.parse_args() + + +def main(): + opt = get_args() + + # check args + if opt.json: + if not opt.output_file: + raise ValueError("Missing --output_file") + if opt.language: + if opt.language not in PL_MATCHING.keys(): + raise ValueError( + "{language} not supported. Currently support {sp_language}" + .format(language=opt.language, + sp_language=list(PL_MATCHING.keys()))) + + # check path + for path in opt.paths: + assert os.path.exists(path) == True, "paths is not valid" + + if os.path.isdir(path): + files = [os.path.join(path, f) for f in os.listdir(path) \ + if os.path.isfile(os.path.join(path, f))] + elif os.path.isfile(path): + files = [path] + + if opt.language: + for file in files[:]: + filename, file_extension = os.path.splitext(file) + if file_extension not in PL_MATCHING[opt.language]: + files.remove(file) + + output_metadata = {} + for file in files: + filename, file_extension = os.path.splitext(file) + + if opt.language == None: + for lang, ext_list in PL_MATCHING.items(): + if file_extension in ext_list: + language = lang + break + else: + language = opt.language + + output = parse_file(file, language=language) + print_result( + output, + file_name=str(filename).split(os.sep)[-1]+file_extension + ) + output_metadata[file] = output + + if opt.json: + save_path = opt.output_file + with open(save_path, 'w') as output_file: + json.dump(output_metadata, output_file, sort_keys=True, indent=4) + print(50*'=') + print("Save report to {path}".format(path=save_path)) + + +if __name__ == '__main__': + main() diff --git 
a/src/codetext/clean/__init__.py b/src/codetext/clean/__init__.py old mode 100755 new mode 100644 diff --git a/src/codetext/clean/noise_removal.py b/src/codetext/clean/noise_removal.py old mode 100755 new mode 100644 diff --git a/src/codetext/codetext_cli.py b/src/codetext/codetext_cli.py new file mode 100644 index 0000000..91f7fd9 --- /dev/null +++ b/src/codetext/codetext_cli.py @@ -0,0 +1,238 @@ +import os +from typing import List, Dict + +from tabulate import tabulate + +from .parser import * +from .utils import parse_code + + +def parse_file(file_path: str, language: str = None, verbose: bool = False) -> List: + assert language != None, "Auto detect is not implemented, please specify language" + language = str(language).lower() + # assert (language in SUPPORT_LANGUAGE) == True, f"{language} is not supported" + assert os.path.isfile(file_path) == True, "File not found" + + if verbose: + print(50 * "=") + print("Parse code into tree-sitter node") + + content: str = open(file_path, "r").read() + root_node = parse_code(raw_code=content, language=language).root_node + + if language == "python": + parser: LanguageParser = PythonParser + elif language == "java": + parser: LanguageParser = JavaParser + elif language == "javascript": + parser: LanguageParser = JavascriptParser + elif language == "go": + parser: LanguageParser = GoParser + elif language in ["c", "c++"]: + parser: LanguageParser = CppParser + elif language == "c#": + parser: LanguageParser = CsharpParser + elif language == "rust": + parser: LanguageParser = RustParser + elif language == "ruby": + parser: LanguageParser = RubyParser + elif language == "php": + parser: LanguageParser = PhpParser + else: + raise KeyError(f"{language} is not supported") + + if verbose: + print(50 * "=") + print("Get node detail") + + cls_list = parser.get_class_list(root_node) + method_list = [] + cls_metadata = [] + for _cls in cls_list: + cls_info = parser.get_class_metadata(_cls) + cls_info["code"] = get_node_text(_cls) + 
+ cls_method = [] + method_list = parser.get_function_list(_cls) + for method in method_list: + method_info = parser.get_function_metadata(method) + cls_method.append(method_info) + + cls_info["method"] = cls_method + cls_metadata.append(cls_info) + method_list.extend(method_list) + + fn_list: List = parser.get_function_list(root_node) + for node in fn_list[:]: + if node in method_list: + fn_list.remove(node) + + fn_metadata = [] + for fn in fn_list: + fn_metadata.append(parser.get_function_metadata(fn)) + + output_metadata = {"class": cls_metadata, "function": fn_metadata} + + return output_metadata + + +def print_result(res: Dict, file_name: str = "no_name_file"): + # ======== Print file name ======== + print("File {name} analyzed:".format(name=file_name)) + print(50 * "=") + + # ========= Summary ========= + print("Number of class : {length}".format(length=len(res["class"]))) + print("Number of function : {length}".format(length=len(res["function"]))) + print(50 * "-" + "\n") + + # ========= Print class & method ========= + cls_headers = ["#", "Class", "Arguments"] + cls_method_headers = ["#", "Method name", "Paramters", "Type", "Return type"] + cls_info = [] + method_info = {} + for cls_idx, _cls in enumerate(res["class"]): + cls_max_length = max(1, len(_cls["parameters"].keys())) + for i in range(cls_max_length): + clslist = [""] * len(cls_headers) + clslist[0] = cls_idx if i < 1 else "" + clslist[1] = _cls["identifier"] if i < 1 else "" + if _cls["parameters"].keys(): + clslist[2] = list(_cls["parameters"].keys())[i] + cls_info.append(clslist) + + _method_info = [] + for idx, method in enumerate(_cls["method"]): + max_length = max(1, len(method["parameters"].keys())) + for i in range(max_length): + sublist = [""] * len(cls_method_headers) + sublist[0] = idx if i < 1 else "" + sublist[1] = method["identifier"] if i < 1 else "" + if method["parameters"].keys(): + sublist[2] = list(method["parameters"].keys())[i] + sublist[3] = 
list(method["parameters"].values())[i] + sublist[4] = ( + method["return_type"] + if i <= 1 and method["return_type"] != "" + else "" + ) + _method_info.append(sublist) + + method_info[file_name] = [_cls["identifier"], _method_info] + + if cls_info: + print("Class summary:") + print(tabulate(cls_info, headers=cls_headers, tablefmt="outline")) + print("\n") + + for _, info in method_info.items(): + name, info = info + print("Class analyse: {name}".format(name=name)) + print(tabulate(info, headers=cls_method_headers, tablefmt="outline")) + print("\n") + + # ========= Print stand alone function ========= + fn_headers = ["#", "Function name", "Paramters", "Type", "Return type"] + function_info = [] + + for idx, fn in enumerate(res["function"]): + max_length = max(1, len(fn["parameters"].keys())) + for i in range(max_length): + sublist = [""] * len(fn_headers) + sublist[0] = idx if i < 1 else "" + sublist[1] = fn["identifier"] if i < 1 else "" + if fn["parameters"].keys(): + sublist[2] = list(fn["parameters"].keys())[i] + sublist[3] = list(fn["parameters"].values())[i] + sublist[4] = ( + fn["return_type"] + if i <= 1 and fn["return_type"] != "" + else "" + ) + function_info.append(sublist) + + if function_info: + print("Function analyse:") + print(tabulate(function_info, headers=fn_headers, tablefmt="outline")) + print("\n") + + elif not method_info: + print("File empty") + print("\n") + + +PL_MATCHING = { + "Java": [".java"], + "JavaScript": [ + ".js", + "._js", + ".bones", + ".es6", + ".jake", + ".jsb", + ".jscad", + ".jsfl", + ".jsm", + ".jss", + ".njs", + ".pac", + ".sjs", + ".ssjs", + ".xsjs", + ".xsjslib", + ], + "Python": [ + ".py", + ".bzl", + ".gyp", + ".lmi", + ".pyde", + ".pyp", + ".pyt", + ".pyw", + ".tac", + ".wsgi", + ".xpy", + ], + "PHP": [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt"], + "Go": [".go"], + "Rust": [".rs", ".rs.in"], + "Ruby": [ + ".rb", + ".builder", + ".gemspec", + ".god", + ".irbrc", + ".jbuilder", + ".mspec", + 
".podspec", + ".rabl", + ".rake", + ".rbuild", + ".rbw", + ".rbx", + ".ru", + ".ruby", + ".thor", + ".watchr", + ], + "C": [".c", ".cats", ".h", ".idc", ".w"], + "C#": [".cs", ".cake", ".cshtml", ".csx"], + "C++": [ + ".cpp", + ".c++", + ".cc", + ".cp", + ".cxx", + ".h++", + ".hh", + ".hpp", + ".hxx", + ".inl", + ".ipp", + ".tcc", + ".tpp", + ".C", + ".H", + ], +} diff --git a/src/codetext/parser/README.md b/src/codetext/parser/README.md new file mode 100644 index 0000000..d7fcb0a --- /dev/null +++ b/src/codetext/parser/README.md @@ -0,0 +1,6 @@ +# Parser Appendix + +With `codetext` parser, we support to extract serveral function type, however, by using `tree-sitter` grammarly, some function or some language might be not fully supported. + +This is the list of current supported function: + diff --git a/src/codetext/parser/__init__.py b/src/codetext/parser/__init__.py old mode 100755 new mode 100644 index a1b7ad0..de5a6d0 --- a/src/codetext/parser/__init__.py +++ b/src/codetext/parser/__init__.py @@ -10,7 +10,13 @@ from .cpp_parser import CppParser from .c_sharp_parser import CsharpParser from .rust_parser import RustParser -from .language_parser import LanguageParser +from .language_parser import LanguageParser, get_node_by_kind, get_node_text, \ + tokenize_code, tokenize_docstring, nodes_are_equal + +SUPPORT_LANGUAGE = [ + "go", "php", "ruby", "java", "javascript", + "python", "cpp", "c", "c_sharp", "rust" +] __all__ = [ 'GoParser', 'PhpParser', 'RubyParser', 'JavaParser', 'JavascriptParser', diff --git a/src/codetext/parser/c_sharp_parser.py b/src/codetext/parser/c_sharp_parser.py old mode 100755 new mode 100644 index 2491815..bda10c3 --- a/src/codetext/parser/c_sharp_parser.py +++ b/src/codetext/parser/c_sharp_parser.py @@ -131,10 +131,24 @@ def get_function_metadata(function_node, blob: str = None) -> Dict[str, Any]: for param_node in child.children: param_nodes = get_node_by_kind(param_node, ['parameter']) for param in param_nodes: - param_type = 
get_node_text(param.children[0]) - param_identifier = get_node_text(param.children[1]) - - metadata['parameters'][param_identifier] = param_type + if len(param.children) > 1: + param_type = get_node_text(param.children[0]) + param_name = get_node_text(param.children[1]) + metadata['parameters'][param_name] = param_type + + else: + param_name = get_node_text(param.children[0]) + metadata['parameters'][param_name] = None + # for node in param.children: + # if node.type in ['array_type', 'implicit_type', \ + # 'nullable_type', 'pointer_type', 'function_pointer_type', \ + # 'predefined_type', 'tuple_type']: + # param_type = get_node_text(node) + # elif node.type == 'identifier': + # param_identifier = get_node_text(node) + + # param_type = get_node_text(param.child_by_field_name('type')) + # param_identifier = get_node_text(param.child_by_field_name('name')) return metadata @staticmethod @@ -148,7 +162,7 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': '', + 'parameters': {}, } assert type(class_node) == tree_sitter.Node @@ -156,11 +170,11 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: if child.type == 'identifier': metadata['identifier'] = get_node_text(child) elif child.type == 'base_list': - argument_list = [] for arg in child.children: if arg.type == 'identifier': - argument_list.append(get_node_text(arg)) - metadata['parameters'] = argument_list + metadata['parameters'][get_node_text(arg)] = None + # argument_list.append(get_node_text(arg)) + # metadata['parameters'] = argument_list return metadata diff --git a/src/codetext/parser/cpp_parser.py b/src/codetext/parser/cpp_parser.py old mode 100755 new mode 100644 index 517a148..6674b65 --- a/src/codetext/parser/cpp_parser.py +++ b/src/codetext/parser/cpp_parser.py @@ -126,7 +126,7 @@ def get_function_metadata(function_node, blob: str=None) 
-> Dict[str, Any]: child = subchild if child.type == 'function_declarator': for subchild in child.children: - if subchild.type in ['qualified_identifier', 'identifier']: + if subchild.type in ['qualified_identifier', 'identifier', 'field_identifier']: metadata['identifier'] = get_node_text(subchild) elif subchild.type == 'parameter_list': param_nodes = get_node_by_kind(subchild, ['parameter_declaration']) @@ -134,7 +134,8 @@ def get_function_metadata(function_node, blob: str=None) -> Dict[str, Any]: param_type = param.child_by_field_name('type') param_type = get_node_text(param_type) list_name = get_node_by_kind(param, ['identifier']) - assert len(list_name) == 1 + if not list_name: + continue param_name = get_node_text(list_name[0]) metadata['parameters'][param_name] = param_type # for item in param.children: @@ -157,7 +158,7 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': '', + 'parameters': {}, } assert type(class_node) == tree_sitter.Node @@ -168,7 +169,8 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: argument_list = [] for param in child.children: if param.type == 'type_identifier': - argument_list.append(get_node_text(param)) - metadata['parameters'] = argument_list + metadata['parameters'][get_node_text(param)] = None + # argument_list.append(get_node_text(param)) + # metadata['parameters'] = argument_list return metadata diff --git a/src/codetext/parser/go_parser.py b/src/codetext/parser/go_parser.py old mode 100755 new mode 100644 index 5b8e567..3ebff09 --- a/src/codetext/parser/go_parser.py +++ b/src/codetext/parser/go_parser.py @@ -20,7 +20,7 @@ def get_comment_node(function_node): Return: List: list of comment nodes """ - comment_node = get_node_by_kind(function_node, comment_node, kind='comment') + comment_node = get_node_by_kind(function_node, kind='comment') return 
comment_node @staticmethod diff --git a/src/codetext/parser/java_parser.py b/src/codetext/parser/java_parser.py old mode 100755 new mode 100644 index 855f03c..2046e11 --- a/src/codetext/parser/java_parser.py +++ b/src/codetext/parser/java_parser.py @@ -88,7 +88,7 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': '', + 'parameters': {}, } argument_list = [] for child in class_node.children: @@ -97,9 +97,10 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: elif child.type == 'superclass' or child.type == 'super_interfaces': for subchild in child.children: if subchild.type == 'type_list' or subchild.type == 'type_identifier': - argument_list.append(get_node_text(subchild)) + metadata['parameters'][get_node_text(subchild)] = None + # argument_list.append(get_node_text(subchild)) - metadata['parameters'] = argument_list + # metadata['parameters'] = argument_list return metadata @staticmethod diff --git a/src/codetext/parser/javascript_parser.py b/src/codetext/parser/javascript_parser.py old mode 100755 new mode 100644 index 1392b1d..c65c7c8 --- a/src/codetext/parser/javascript_parser.py +++ b/src/codetext/parser/javascript_parser.py @@ -48,7 +48,12 @@ def get_comment_node(function_node): @staticmethod def get_function_list(node): - function_types = ['function_declaration', 'function', 'method_definition', 'generator_function_declaration'] + function_types = ['function_declaration', + 'function', + 'method_definition', + 'generator_function_declaration', + 'arrow_function', + 'generator_function'] res = get_node_by_kind(node, function_types) for node in res[:]: if not node.children: @@ -87,6 +92,16 @@ def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]: return_statement = get_node_by_kind(function_node, ['return_statement']) if len(return_statement) > 0: 
metadata['return_type'] = '' + + if function_node.type in ["function", + "arrow_function", + "generator_function"]: + # function inside object property or variable declarator + identifier = function_node.prev_named_sibling + if identifier: + if identifier.type in ["identifier"]: + metadata["identifier"] = identifier.text.decode() + return metadata @staticmethod @@ -95,7 +110,7 @@ def get_class_metadata(class_node, blob=None): logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': '', + 'parameters': {}, } param = [] for child in class_node.children: @@ -104,7 +119,8 @@ def get_class_metadata(class_node, blob=None): elif child.type == 'class_heritage': for subchild in child.children: if subchild.type == 'identifier': - param.append(get_node_text(subchild)) + metadata['parameters'][get_node_text(subchild)] = None + # param.append(get_node_text(subchild)) - metadata['parameters'] = param + # metadata['parameters'] = param return metadata diff --git a/src/codetext/parser/language_parser.py b/src/codetext/parser/language_parser.py old mode 100755 new mode 100644 index f24584e..c9cef9e --- a/src/codetext/parser/language_parser.py +++ b/src/codetext/parser/language_parser.py @@ -229,12 +229,12 @@ def get_comment_node(node) -> List[tree_sitter.Node]: @staticmethod @abstractmethod - def get_class_metadata(class_node, blob): + def get_class_metadata(class_node, blob=None): pass @staticmethod @abstractmethod - def get_function_metadata(function_node, blob) -> Dict[str, str]: + def get_function_metadata(function_node, blob=None) -> Dict[str, str]: pass diff --git a/src/codetext/parser/php_parser.py b/src/codetext/parser/php_parser.py old mode 100755 new mode 100644 index 5c96b37..b2a8ef2 --- a/src/codetext/parser/php_parser.py +++ b/src/codetext/parser/php_parser.py @@ -99,7 +99,7 @@ def get_class_metadata(class_node, blob: str=None): logger.info('From version `0.0.6` this function will update argument 
in the API') metadata = { 'identifier': '', - 'parameters': '', + 'parameters': {}, } assert type(class_node) == tree_sitter.Node @@ -110,7 +110,9 @@ def get_class_metadata(class_node, blob: str=None): argument_list = [] for param in child.children: if param.type == 'name': - argument_list.append(get_node_text(param)) - metadata['parameters'] = argument_list + name = get_node_text(param) + metadata['parameters'][name] = None + # argument_list.append(get_node_text(param)) + # metadata['parameters'] = argument_list return metadata diff --git a/src/codetext/parser/python_parser.py b/src/codetext/parser/python_parser.py old mode 100755 new mode 100644 index 6ff2bba..c54ff5c --- a/src/codetext/parser/python_parser.py +++ b/src/codetext/parser/python_parser.py @@ -105,19 +105,16 @@ def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': [], + 'parameters': {}, } for child in class_node.children: if child.type == 'identifier': metadata['identifier'] = get_node_text(child) elif child.type == 'argument_list': - args = [] argument_list = get_node_text(child).split(',') for arg in argument_list: item = re.sub(r'[^a-zA-Z0-9\_]', ' ', arg).split() - if len(item) > 0: - args.append(item[0].strip()) - metadata['parameters'] = args + metadata['parameters'][item[0].strip()] = None # get __init__ function return metadata diff --git a/src/codetext/parser/ruby_parser.py b/src/codetext/parser/ruby_parser.py old mode 100755 new mode 100644 index e7fb653..93ee79d --- a/src/codetext/parser/ruby_parser.py +++ b/src/codetext/parser/ruby_parser.py @@ -83,7 +83,7 @@ def get_function_metadata(function_node, blob=None) -> Dict[str, str]: logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': [], + 'parameters': {}, 'return_type': None, } @@ -96,7 +96,7 @@ def 
get_function_metadata(function_node, blob=None) -> Dict[str, str]: elif child.type in ['method_parameters', 'parameters', 'bare_parameters']: params = get_node_by_kind(child, ['identifier']) for item in params: - metadata['parameters'].append(get_node_text(item)) + metadata['parameters'][get_node_text(item)] = None if not metadata['return_type']: return_statement = get_node_by_kind(function_node, ['return']) @@ -113,7 +113,7 @@ def get_class_metadata(class_node, blob=None): logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': [], + 'parameters': {}, } assert type(class_node) == tree_sitter.Node @@ -124,7 +124,7 @@ def get_class_metadata(class_node, blob=None): if child.type == 'superclass': for subchild in child.children: if subchild.type == 'constant': - metadata['parameters'].append(get_node_text(subchild)) + metadata['parameters'][get_node_text(subchild)] = None return metadata diff --git a/src/codetext/parser/rust_parser.py b/src/codetext/parser/rust_parser.py old mode 100755 new mode 100644 index c22d745..e4285ef --- a/src/codetext/parser/rust_parser.py +++ b/src/codetext/parser/rust_parser.py @@ -88,9 +88,12 @@ def get_function_metadata(function_node, blob=None) -> Dict[str, str]: metadata['parameters'][get_node_text(item)] = None else: - # param_name = '' + param_name = '' for subchild in item.children: - if subchild.type == 'identifier': + if subchild.type == 'mutable_specifier': + param_name = 'self' + break + elif subchild.type == 'identifier': param_name = get_node_text(subchild) break param_type = item.child_by_field_name('type') @@ -120,7 +123,7 @@ def get_class_metadata(class_node, blob=None): logger.info('From version `0.0.6` this function will update argument in the API') metadata = { 'identifier': '', - 'parameters': [], + 'parameters': {}, } assert type(class_node) == tree_sitter.Node @@ -136,12 +139,12 @@ def get_class_metadata(class_node, blob=None): 
metadata['identifier'] = get_node_text(identifier[0]) if len(identifier) > 1: for param in identifier[1:]: - metadata['parameters'].append(get_node_text(param)) + metadata['parameters'][get_node_text(param)] = None return metadata @staticmethod def get_comment_node(function_node): - comment_node = get_node_by_kind(function_node, kind='comment') + comment_node = get_node_by_kind(function_node, kind=['comment', 'line_comment', 'block_comment']) return comment_node diff --git a/src/codetext/utils/__init__.py b/src/codetext/utils/__init__.py old mode 100755 new mode 100644 diff --git a/src/codetext/utils/imports.py b/src/codetext/utils/imports.py old mode 100755 new mode 100644 diff --git a/src/codetext/utils/utils.py b/src/codetext/utils/utils.py old mode 100755 new mode 100644 diff --git a/tests/test_parser/test_cpp.py b/tests/test_parser/test_cpp.py index e01897d..a1c5d06 100755 --- a/tests/test_parser/test_cpp.py +++ b/tests/test_parser/test_cpp.py @@ -49,7 +49,7 @@ def test_get_class_metadata(self): classes = list(CppParser.get_class_list(root))[0] metadata = CppParser.get_class_metadata(classes) - self.assertEqual(metadata['parameters'], ['Vehicle', 'B']) + self.assertEqual(metadata['parameters'], {'Vehicle': None, 'B': None}) self.assertEqual(metadata['identifier'], 'Car') def test_get_docstring(self): diff --git a/tests/test_parser/test_csharp.py b/tests/test_parser/test_csharp.py index a0c2f8d..e918768 100755 --- a/tests/test_parser/test_csharp.py +++ b/tests/test_parser/test_csharp.py @@ -87,7 +87,7 @@ def test_get_class_metadata(self): classes = list(CsharpParser.get_class_list(root))[0] metadata = CsharpParser.get_class_metadata(classes) - self.assertEqual(metadata['parameters'], ['Animal']) + self.assertEqual(metadata['parameters'], {'Animal': None}) self.assertEqual(metadata['identifier'], 'Dog') diff --git a/tests/test_parser/test_java.py b/tests/test_parser/test_java.py index 3eb5a9a..dac7950 100755 --- a/tests/test_parser/test_java.py +++ 
b/tests/test_parser/test_java.py @@ -75,7 +75,7 @@ def test_get_class_metadata(self): classes = list(JavaParser.get_class_list(root))[0] metadata = JavaParser.get_class_metadata(classes) - self.assertEqual(metadata['parameters'], ['SudoUser', 'FileController']) + self.assertEqual(metadata['parameters'], {'SudoUser': None, 'FileController': None}) self.assertEqual(metadata['identifier'], 'SaveFileController') def test_extract_docstring(self): diff --git a/tests/test_parser/test_javascript.py b/tests/test_parser/test_javascript.py index 477a5bc..aa4c1c9 100755 --- a/tests/test_parser/test_javascript.py +++ b/tests/test_parser/test_javascript.py @@ -76,8 +76,8 @@ class Car { def test_get_function_metadata(self): root = self.root_node - function = JavascriptParser.get_function_list(root)[1] - metadata = JavascriptParser.get_function_metadata(function) + _function = JavascriptParser.get_function_list(root)[1] + metadata = JavascriptParser.get_function_metadata(_function) for key in ['identifier', 'parameters', 'return_type']: self.assertTrue(key in metadata.keys()) @@ -104,11 +104,62 @@ def test_get_class_metadata(self): metadata = JavascriptParser.get_class_metadata(classes) self.assertEqual(metadata['identifier'], 'Model') - self.assertEqual(metadata['parameters'], ['Car']) + self.assertEqual(metadata['parameters'], {'Car': None}) def test_extract_docstring(self): pass + + def test_metadata_with_arrow_function(self): + code_sample = ''' + export const parseModel = async (mesh) => + new Promise((resolve) => { + exporter.parse( + mesh, + (gltf) => { + const blob = new Blob([gltf], { type: "application/octet-stream" }); + resolve(blob); + return blob; + }, + (error) => { + console.log(error); + return error; + + } + ); + }); + ''' + root = parse_code(code_sample, 'javascript').root_node + fn = JavascriptParser.get_function_list(root)[0] + metadata = JavascriptParser.get_function_metadata(fn) + + identifier = metadata['identifier'] + self.assertEqual(identifier, 
'parseModel') + + def test_metadata_with_undecleared_functions(self): + code_sample = """ + const asyncFunctionExpression = async function() { + // async function expression definition + return a + }; + + const generatorFunctionExpression = function*() { + // generator function expression definition + return b + }; + """ + root = parse_code(code_sample, 'javascript').root_node + fn1, fn2 = JavascriptParser.get_function_list(root) + + self.assertEqual(fn1.type, 'function') + self.assertEqual(fn2.type, 'generator_function') + + metadata1 = JavascriptParser.get_function_metadata(fn1) + metadata2 = JavascriptParser.get_function_metadata(fn2) + + self.assertEqual(metadata1['identifier'], 'asyncFunctionExpression') + self.assertEqual(metadata2['identifier'], 'generatorFunctionExpression') + if __name__ == '__main__': unittest.main() diff --git a/tests/test_parser/test_php.py b/tests/test_parser/test_php.py index 76cbaa4..2d2d526 100755 --- a/tests/test_parser/test_php.py +++ b/tests/test_parser/test_php.py @@ -107,7 +107,7 @@ def test_get_class_metadata(self): classes = list(PhpParser.get_class_list(root))[0] metadata = PhpParser.get_class_metadata(classes) - self.assertEqual(metadata['parameters'], ['AbstractSQLServerDriver']) + self.assertEqual(metadata['parameters'], {'AbstractSQLServerDriver': None}) self.assertEqual(metadata['identifier'], 'Driver') diff --git a/tests/test_parser/test_python.py b/tests/test_parser/test_python.py index 6eee05c..ae67239 100755 --- a/tests/test_parser/test_python.py +++ b/tests/test_parser/test_python.py @@ -71,7 +71,7 @@ def test_sample(self, arg1: str = "string", arg2 = "another_string"): classes = list(PythonParser.get_class_list(root))[0] metadata = PythonParser.get_class_metadata(classes) - self.assertEqual(metadata['parameters'], ['ABC']) + self.assertEqual(metadata['parameters'], {'ABC': None}) self.assertEqual(metadata['identifier'], 'Sample') def test_get_comment_list(self): diff --git a/tests/test_parser/test_ruby.py 
b/tests/test_parser/test_ruby.py index ca91aa5..848d4a2 100755 --- a/tests/test_parser/test_ruby.py +++ b/tests/test_parser/test_ruby.py @@ -82,7 +82,7 @@ def test_get_function_metadata(self): for key in ['identifier', 'parameters', 'return_type']: self.assertTrue(key in metadata.keys()) self.assertEqual(metadata['identifier'], 'search') - self.assertEqual(metadata['parameters'], ['query', 'options']) + self.assertEqual(metadata['parameters'], {'query': None, 'options': None}) self.assertEqual(metadata['return_type'], None) @@ -112,7 +112,7 @@ def test_get_class_metadata(self): metadata = RubyParser.get_class_metadata(classes) self.assertEqual(metadata['identifier'], 'Client') - self.assertEqual(metadata['parameters'], ['API']) + self.assertEqual(metadata['parameters'], {'API': None}) if __name__ == '__main__': diff --git a/tests/test_parser/test_rust.py b/tests/test_parser/test_rust.py index 05f7e52..52274ad 100755 --- a/tests/test_parser/test_rust.py +++ b/tests/test_parser/test_rust.py @@ -111,7 +111,7 @@ def test_get_class_metadata(self): metadata = RustParser.get_class_metadata(classes) self.assertEqual(metadata['identifier'], 'Quack') - self.assertEqual(metadata['parameters'], ['Duck']) + self.assertEqual(metadata['parameters'], {'Duck': None}) if __name__ == '__main__':