KnowledgeCaptureAndDiscovery · dgarijo · Jan 4, 2024 · Jan 4, 2024 · Jan 4, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "matplotlib==3.5.0",
     "nltk==3.6.5",
     "numpy==1.21.4",
-    "pandas==1.3.4",
+    "pandas>=2.0.0",
     "rdflib==6.0.2",
     "rdflib-jsonld==0.6.2",
     "requests>=2.22.0",
@@ -20,6 +20,7 @@ requires = [
     "contractions>=0.1.66",
     "imblearn>=0.0",
     "morph-kgc",
+    "nbformat",
     "bibtexparser"
 ]
 build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
     "matplotlib==3.5.0",
     "nltk==3.6.6",
     "numpy==1.22.0",
-    "pandas==1.3.4",
+    "pandas>=2.0.0",
     "rdflib>=6.0.2",
     "rdflib-jsonld==0.6.2",
     "requests>=2.22.0",
@@ -27,7 +27,8 @@
     "imbalanced-learn>=0.8.1",
     "pytest",
     "morph-kgc>=2.3.1",
-    "bibtexparser==1.4.1"
+    "bibtexparser==1.4.1",
+    "nbformat>=5.9.2"
 ]
 
 

diff --git a/src/somef/process_files.py b/src/somef/process_files.py
@@ -2,6 +2,7 @@
 import os
 import re
 import urllib
+from urllib.parse import urlparse
 from .utils import constants, markdown_utils
 from . import extract_ontologies, extract_workflows
 from .process_results import Result
@@ -95,7 +96,6 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                             logging.error("README Error: error while reading file content")
                             logging.error(f"{type(err).__name__} was raised: {err}")
                 if "LICENCE" == filename.upper() or "LICENSE" == filename.upper() or "LICENSE.MD" == filename.upper():
-                    # to do (issue 530) if there are two licenses, keep the one closer to the root
                     metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
                                                                repo_default_branch,
                                                                repo_dir, repo_relative_path, filename, dir_path,
@@ -260,6 +260,28 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
     """
     url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir, repo_relative_path,
                         filename)
+    # do not add result if a file under the same category exists. Only for license, citation, COC, contribution, README
+    replace = False
+    results = metadata_result.results
+    try:
+        if category in results:
+            # check category exists, using the file exploration technique, and retrieve source
+            if category in [constants.CAT_CITATION, constants.CAT_LICENSE, constants.CAT_COC, constants.CAT_README_URL,
+                        constants.CAT_CONTRIBUTING_GUIDELINES]:
+                for entry in results[category]:
+                    if (entry[constants.PROP_SOURCE] is not None and
+                            entry[constants.PROP_TECHNIQUE] is constants.TECHNIQUE_FILE_EXPLORATION):
+                        new_file_path = extract_directory_path(url)
+                        existing_path = extract_directory_path(entry[constants.PROP_SOURCE])
+                        if new_file_path.startswith(existing_path):
+                            # the existing file is higher, ignore this one
+                            return metadata_result
+                        else:
+                            # replace result in hierarchy (below)
+                            replace = True
+                        break
+    except Exception:
+        logging.warning("Error when trying to determine if redundant files exist")
     try:
         with open(os.path.join(dir_path, filename), "r") as data_file:
             file_text = data_file.read()
@@ -269,16 +291,43 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
             }
             if format_result != "":
                 result[constants.PROP_FORMAT] = format_result
-            metadata_result.add_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url)
+            if replace:
+                metadata_result.edit_hierarchical_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url)
+            else:
+                metadata_result.add_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url)
     except:
-        metadata_result.add_result(category,
-                                   {
-                                       constants.PROP_VALUE: url,
-                                       constants.PROP_TYPE: constants.URL
-                                   }, 1, constants.TECHNIQUE_FILE_EXPLORATION)
+        if replace:
+            metadata_result.edit_hierarchical_result(category,
+                                                     {
+                                           constants.PROP_VALUE: url,
+                                           constants.PROP_TYPE: constants.URL
+                                       }, 1, constants.TECHNIQUE_FILE_EXPLORATION)
+        else:
+            metadata_result.add_result(category,
+                                       {
+                                           constants.PROP_VALUE: url,
+                                           constants.PROP_TYPE: constants.URL
+                                       }, 1, constants.TECHNIQUE_FILE_EXPLORATION)
     return metadata_result
 
 
+def extract_directory_path(path):
+    """
+    Method to extract a directory or URL path without the file name
+    Parameters
+    ----------
+    path: file path (checked first on the local file system) or URL of a file
+
+    Returns
+    -------
+    the absolute parent directory if path exists locally; otherwise the directory part of the URL path
+    """
+    if os.path.exists(path):
+        return os.path.dirname(os.path.abspath(path))
+    else:
+        return os.path.dirname(urlparse(path).path)
+
+
 def convert_to_raw_user_content_github(partial, owner, repo_name, repo_ref):
     """Converts GitHub paths into raw.githubuser content URLs, accessible by users"""
     if partial.startswith("./"):

diff --git a/src/somef/process_results.py b/src/somef/process_results.py
@@ -54,6 +54,29 @@ def add_result(self, category, result, confidence, technique, source=""):
         else:
             logging.error("Tried to add a result without value or type. Discarding it ...")
 
+    def edit_hierarchical_result(self, category, result, confidence, technique, source=""):
+        """
+        Method to replace a result that is supposed to be unique with one found higher up in the hierarchy.
+        For example, if there are 2 licenses or citation files, we only take the upper level one.
+        Only entries with a source and the file exploration technique (compared by value) are edited.
+        Parameters
+        ----------
+        category: category of the result; it is looked up directly in self.results
+        result: new result value
+        confidence: confidence value (currently unused; the stored entry keeps its own)
+        technique: in this case, file exploration (currently unused; the constant is matched instead)
+        source: new source link; when left empty no entry is modified
+
+        Returns
+        -------
+        None. The matching entries of self.results[category] are edited in place.
+        """
+        for entry in self.results[category]:
+            if (source != "" and entry.get(constants.PROP_SOURCE) is not None
+                    and entry.get(constants.PROP_TECHNIQUE) == constants.TECHNIQUE_FILE_EXPLORATION):
+                entry[constants.PROP_RESULT] = result
+                entry[constants.PROP_SOURCE] = source
+
     # def consolidate_results(self):
     #   # TO DO: for each category where we may reduce/ improve the results, do so.
     #   # For example, here is where we would detect if there are redundant citation files, and we would create a single

diff --git a/src/somef/test/test_data/repositories/Widoco/license.md b/src/somef/test/test_data/repositories/Widoco/license.md
diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py
@@ -182,4 +182,17 @@ def test_issue_526(self):
         github_data = Result()
         text, github_data = process_files.process_repository_files(test_data_repositories + "Widoco", github_data,
                                                                    constants.RepositoryType.LOCAL)
-        assert len(github_data.results[constants.CAT_CITATION]) == 1
+        assert len(github_data.results[constants.CAT_CITATION]) == 1
+
+    def test_issue_530(self):
+        """
+        Test designed to see if repositories with two licenses or citation files get only the outer license or cff.
+        The same logic is meant to cover COC and contributing guidelines, but only license and citation are asserted.
+        """
+        github_data = Result()
+        text, github_data = process_files.process_repository_files(test_data_repositories + "Widoco", github_data,
+                                                                   constants.RepositoryType.LOCAL)
+        licenses = github_data.results[constants.CAT_LICENSE]
+        citation = github_data.results[constants.CAT_CITATION]
+        assert len(licenses) == 1 and "LICENSE" in licenses[0]["source"] and \
+            len(citation) == 1 and "example_onto" not in citation[0]["source"]