diff --git a/pyproject.toml b/pyproject.toml index 67d5414e..a3e42831 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ requires = [ "matplotlib==3.5.0", "nltk==3.6.5", "numpy==1.21.4", - "pandas==1.3.4", + "pandas>=2.0.0", "rdflib==6.0.2", "rdflib-jsonld==0.6.2", "requests>=2.22.0", @@ -20,6 +20,7 @@ requires = [ "contractions>=0.1.66", "imblearn>=0.0", "morph-kgc", + "nbformat", "bibtexparser" ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 93004dc3..ce639129 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ "matplotlib==3.5.0", "nltk==3.6.6", "numpy==1.22.0", - "pandas==1.3.4", + "pandas>=2.0.0", "rdflib>=6.0.2", "rdflib-jsonld==0.6.2", "requests>=2.22.0", @@ -27,7 +27,8 @@ "imbalanced-learn>=0.8.1", "pytest", "morph-kgc>=2.3.1", - "bibtexparser==1.4.1" + "bibtexparser==1.4.1", + "nbformat>=5.9.2" ] diff --git a/src/somef/process_files.py b/src/somef/process_files.py index b40c2f0a..525d7beb 100644 --- a/src/somef/process_files.py +++ b/src/somef/process_files.py @@ -2,6 +2,7 @@ import os import re import urllib +from urllib.parse import urlparse from .utils import constants, markdown_utils from . 
import extract_ontologies, extract_workflows from .process_results import Result @@ -95,7 +96,6 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner logging.error("README Error: error while reading file content") logging.error(f"{type(err).__name__} was raised: {err}") if "LICENCE" == filename.upper() or "LICENSE" == filename.upper() or "LICENSE.MD" == filename.upper(): - # to do (issue 530) if there are two licenses, keep the one closer to the root metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir, repo_relative_path, filename, dir_path, @@ -260,6 +260,28 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul """ url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir, repo_relative_path, filename) + # do not add result if a file under the same category exist. Only for license, citation, COC, contribution, README + replace = False + results = metadata_result.results + try: + if category in results: + # check category exists, using the file exploration technique, and retrieve source + if category in [constants.CAT_CITATION, constants.CAT_LICENSE, constants.CAT_COC, constants.CAT_README_URL, + constants.CAT_CONTRIBUTING_GUIDELINES]: + for entry in results[category]: + if (entry[constants.PROP_SOURCE] is not None and + entry[constants.PROP_TECHNIQUE] is constants.TECHNIQUE_FILE_EXPLORATION): + new_file_path = extract_directory_path(url) + existing_path = extract_directory_path(entry[constants.PROP_SOURCE]) + if new_file_path.startswith(existing_path): + # the existing file is higher, ignore this one + return metadata_result + else: + # replace result in hierarchy (below) + replace = True + break + except Exception: + logging.warning("Error when trying to determine if redundant files exist") try: with open(os.path.join(dir_path, filename), "r") as data_file: file_text = data_file.read() @@ -269,16 +291,43 @@ def 
def extract_directory_path(path):
    """
    Method to extract a directory or URL path without the file name
    Parameters
    ----------
    path: path of a file on disk, or URL pointing to a file

    Returns
    -------
    If the path exists on disk, the absolute path of its parent directory.
    Otherwise the value is parsed as a URL and the directory part of its path component is returned
    (scheme, host, query and fragment are not included).
    """
    if not os.path.exists(path):
        # Not a file on disk: parse as a URL and keep only the directory part of its path component
        url_path = urlparse(path).path
        return os.path.dirname(url_path)
    absolute_path = os.path.abspath(path)
    return os.path.dirname(absolute_path)
def edit_hierarchical_result(self, category, result, confidence, technique, source=""):
    """
    Method to replace, in place, a result that is supposed to be unique within a category.
    For example, if there are 2 licenses or citation files, only one of them is kept.
    The entries replaced are those in the given category that have a source and were obtained
    with the file exploration technique.
    Parameters
    ----------
    category: category of the result
    result: new result value
    confidence: confidence value (accepted for symmetry with add_result; not stored by this method)
    technique: in this case, file exploration (accepted for symmetry with add_result; matching is
        always done against the file exploration technique)
    source: new source link. When empty, the source of the replaced entry is removed, because it
        would otherwise keep pointing at the file whose content has just been replaced

    Returns
    -------
    N/A edits the metadata result
    """
    if category not in self.results:
        # nothing stored under this category, so there is nothing to replace
        return
    for entry in self.results[category]:
        # add_result accepts an empty source, so an entry may not carry a source at all
        # (assumption, confirm against add_result); a plain subscript would raise KeyError here
        if entry.get(constants.PROP_SOURCE) is None:
            continue
        # compare strings by value: "is" checks object identity and only works by accident
        if entry.get(constants.PROP_TECHNIQUE) != constants.TECHNIQUE_FILE_EXPLORATION:
            continue
        entry[constants.PROP_RESULT] = result
        if source != "":
            entry[constants.PROP_SOURCE] = source
        else:
            # callers that could not read the file pass only a URL result and no source;
            # the replacement must still happen, and the stale source must not survive it
            del entry[constants.PROP_SOURCE]
def test_issue_530(self):
    """
    Checks that a repository containing two license files or two citation files only reports one of each:
    the reported license source must contain "LICENSE" and the reported citation source must not
    come from the "example_onto" folder.
    The same de-duplication is meant to apply to COC and contributing guidelines.
    """
    widoco_repository = test_data_repositories + "Widoco"
    metadata = Result()
    _, metadata = process_files.process_repository_files(widoco_repository, metadata,
                                                         constants.RepositoryType.LOCAL)
    license_entries = metadata.results[constants.CAT_LICENSE]
    citation_entries = metadata.results[constants.CAT_CITATION]
    # one assertion per expectation, in the same order, so a failure points at the broken condition
    assert len(license_entries) == 1
    assert "LICENSE" in license_entries[0]["source"]
    assert len(citation_entries) == 1
    assert "example_onto" not in citation_entries[0]["source"]