Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Towards next version #609

Merged
merged 2 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ requires = [
"matplotlib==3.5.0",
"nltk==3.6.5",
"numpy==1.21.4",
"pandas==1.3.4",
"pandas>=2.0.0",
"rdflib==6.0.2",
"rdflib-jsonld==0.6.2",
"requests>=2.22.0",
Expand All @@ -20,6 +20,7 @@ requires = [
"contractions>=0.1.66",
"imblearn>=0.0",
"morph-kgc",
"nbformat",
"bibtexparser"
]
build-backend = "setuptools.build_meta"
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"matplotlib==3.5.0",
"nltk==3.6.6",
"numpy==1.22.0",
"pandas==1.3.4",
"pandas>=2.0.0",
"rdflib>=6.0.2",
"rdflib-jsonld==0.6.2",
"requests>=2.22.0",
Expand All @@ -27,7 +27,8 @@
"imbalanced-learn>=0.8.1",
"pytest",
"morph-kgc>=2.3.1",
"bibtexparser==1.4.1"
"bibtexparser==1.4.1",
"nbformat>=5.9.2"
]


Expand Down
63 changes: 56 additions & 7 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
import urllib
from urllib.parse import urlparse
from .utils import constants, markdown_utils
from . import extract_ontologies, extract_workflows
from .process_results import Result
Expand Down Expand Up @@ -95,7 +96,6 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
logging.error("README Error: error while reading file content")
logging.error(f"{type(err).__name__} was raised: {err}")
if "LICENCE" == filename.upper() or "LICENSE" == filename.upper() or "LICENSE.MD" == filename.upper():
# to do (issue 530) if there are two licenses, keep the one closer to the root
metadata_result = get_file_content_or_link(repo_type, file_path, owner, repo_name,
repo_default_branch,
repo_dir, repo_relative_path, filename, dir_path,
Expand Down Expand Up @@ -260,6 +260,28 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
"""
url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch, repo_dir, repo_relative_path,
filename)
# do not add result if a file under the same category exist. Only for license, citation, COC, contribution, README
replace = False
results = metadata_result.results
try:
if category in results:
# check category exists, using the file exploration technique, and retrieve source
if category in [constants.CAT_CITATION, constants.CAT_LICENSE, constants.CAT_COC, constants.CAT_README_URL,
constants.CAT_CONTRIBUTING_GUIDELINES]:
for entry in results[category]:
if (entry[constants.PROP_SOURCE] is not None and
entry[constants.PROP_TECHNIQUE] is constants.TECHNIQUE_FILE_EXPLORATION):
new_file_path = extract_directory_path(url)
existing_path = extract_directory_path(entry[constants.PROP_SOURCE])
if new_file_path.startswith(existing_path):
# the existing file is higher, ignore this one
return metadata_result
else:
# replace result in hierarchy (below)
replace = True
break
except Exception:
logging.warning("Error when trying to determine if redundant files exist")
try:
with open(os.path.join(dir_path, filename), "r") as data_file:
file_text = data_file.read()
Expand All @@ -269,16 +291,43 @@ def get_file_content_or_link(repo_type, file_path, owner, repo_name, repo_defaul
}
if format_result != "":
result[constants.PROP_FORMAT] = format_result
metadata_result.add_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url)
if replace:
metadata_result.edit_hierarchical_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url)
else:
metadata_result.add_result(category, result, 1, constants.TECHNIQUE_FILE_EXPLORATION, url)
except:
metadata_result.add_result(category,
{
constants.PROP_VALUE: url,
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION)
if replace:
metadata_result.edit_hierarchical_result(category,
{
constants.PROP_VALUE: url,
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION)
else:
metadata_result.add_result(category,
{
constants.PROP_VALUE: url,
constants.PROP_TYPE: constants.URL
}, 1, constants.TECHNIQUE_FILE_EXPLORATION)
return metadata_result


def extract_directory_path(path):
    """
    Method to extract a directory or URL path without the file name
    Parameters
    ----------
    path: file path or URL (a string or an os.PathLike object)

    Returns
    -------
    If path exists on the local file system: the absolute path of its parent directory.
    Otherwise path is parsed as a URL and the directory part of its path component is
    returned (scheme, host, query and fragment are dropped).
    """
    # Normalise os.PathLike objects (e.g. pathlib.Path) to plain str/bytes so that
    # urlparse, which only understands str/bytes, can also handle them.
    path = os.fspath(path)
    if os.path.exists(path):
        # Local file: resolve against the current working directory first so that
        # relative and absolute spellings of the same location give the same result.
        return os.path.dirname(os.path.abspath(path))
    # Not on disk: treat it as a URL and keep only the directory of its path component.
    return os.path.dirname(urlparse(path).path)


def convert_to_raw_user_content_github(partial, owner, repo_name, repo_ref):
"""Converts GitHub paths into raw.githubuser content URLs, accessible by users"""
if partial.startswith("./"):
Expand Down
23 changes: 23 additions & 0 deletions src/somef/process_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,29 @@ def add_result(self, category, result, confidence, technique, source=""):
else:
logging.error("Tried to add a result without value or type. Discarding it ...")

def edit_hierarchical_result(self, category, result, confidence, technique, source=""):
    """
    Method to edit a result that is supposed to be unique, replacing it with one found
    higher up in the hierarchy.
    For example, if there are 2 licenses or citation files, we only take the upper level one.
    The entries replaced are those in the given category that have a non-null source and
    were obtained with the file exploration technique.
    Parameters
    ----------
    category: category of the result
    result: new result value
    confidence: confidence value (currently not used by this method)
    technique: in this case, file exploration (currently not used by this method; the
        comparison is always made against constants.TECHNIQUE_FILE_EXPLORATION)
    source: new source link. When left empty, no entry is modified.

    Returns
    -------
    N/A edits the metadata result in place
    """
    for entry in self.results[category]:
        # Compare the technique by value: `is` tests object identity, which is only true
        # when both sides happen to be the very same object, not merely equal values.
        if (entry[constants.PROP_SOURCE] is not None
                and entry[constants.PROP_TECHNIQUE] == constants.TECHNIQUE_FILE_EXPLORATION):
            if source != "":
                # Every matching entry is overwritten (there is no early exit).
                entry[constants.PROP_RESULT] = result
                entry[constants.PROP_SOURCE] = source

# def consolidate_results(self):
# # TO DO: for each category where we may reduce/ improve the results, do so.
# # For example, here is where we would detect if there are redundant citation files, and we would create a single
Expand Down
13 changes: 0 additions & 13 deletions src/somef/test/test_data/repositories/Widoco/license.md

This file was deleted.

15 changes: 14 additions & 1 deletion src/somef/test/test_process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,17 @@ def test_issue_526(self):
github_data = Result()
text, github_data = process_files.process_repository_files(test_data_repositories + "Widoco", github_data,
constants.RepositoryType.LOCAL)
assert len(github_data.results[constants.CAT_CITATION]) == 1
assert len(github_data.results[constants.CAT_CITATION]) == 1

def test_issue_530(self):
    """
    Regression test for issue 530: when a repository contains two license or citation
    files, only the outer one should be kept.
    The same rule is meant to cover COC and contributing guidelines, although only
    license and citation are asserted here.
    """
    widoco_dir = test_data_repositories + "Widoco"
    _, metadata = process_files.process_repository_files(widoco_dir, Result(),
                                                         constants.RepositoryType.LOCAL)
    license_entries = metadata.results[constants.CAT_LICENSE]
    citation_entries = metadata.results[constants.CAT_CITATION]
    # Exactly one license, and it must come from a file named LICENSE
    assert len(license_entries) == 1
    assert "LICENSE" in license_entries[0]["source"]
    # Exactly one citation, and it must not be the one under example_onto
    assert len(citation_entries) == 1
    assert "example_onto" not in citation_entries[0]["source"]