Merge pull request #574 from KnowledgeCaptureAndDiscovery/dev
Dev
dgarijo authored Jun 26, 2023
2 parents ba032c1 + a59183c commit ddb0479
Showing 7 changed files with 298 additions and 11 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -53,6 +53,7 @@ Given a readme file (or a GitHub/Gitlab repository) SOMEF will extract the follo
 - **Ontologies**: URL and path to the ontology files present in the repository.
 - **Application domain**: The application domain of the repository. Currently supported domains include: Astrophysics, Audio, Computer vision, Graphs, Natural language processing, Reinforcement learning, Semantic web, Sequential. Domains are not mutually exclusive. These domains have been extracted from [awesome lists](https://github.com/topics/awesome-list) and [Papers with code](https://paperswithcode.com/). Find more information in our [documentation](https://somef.readthedocs.io/en/latest/).
 - **Workflows**: URL and path to the workflow files present in the repository.
+- **Related papers**: URLs of possible related papers stated within the readme file.
 
 
 We use different supervised classifiers, header analysis, regular expressions and the GitHub/Gitlab API to retrieve all these fields (more than one technique may be used for each field). Each extraction records its provenance, along with the confidence and technique used at each step. For more information, check the [output format description](https://somef.readthedocs.io/en/latest/output/).
1 change: 1 addition & 0 deletions docs/output.md
@@ -103,6 +103,7 @@ SOMEF aims to recognize the following categories (in alphabetical order):
 - `programming_languages`: Languages used in the repository.
 - `readme_url`: URL to the main README file in the repository.
 - `related_documentation`: Pointers to documentation of related projects which may be needed when using the target repository.
+- `related_papers`: URLs of possible related papers stated within the readme file.
 - `releases`: Pointer to the available versions of a software component.
 - `repository_status`: Repository status as it is described in [repostatus.org](https://www.repostatus.org/).
 - `requirements`: Pre-requisites and dependencies needed to execute a software component.
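
Based on the `add_result` call introduced in this commit and the access pattern used in the new tests (`result.results[constants.CAT_RELATED_PAPERS][0]['result']['value']`), a `related_papers` entry should look roughly like the sketch below; the exact string renderings of the type and technique constants are assumptions, and the source URL is hypothetical:

```python
# Approximate shape of one related_papers entry (illustrative, not verbatim SOMEF output)
related_papers_entry = {
    "result": {
        "type": "Url",                                # constants.URL (assumed rendering)
        "value": "https://arxiv.org/abs/2203.01044",  # the extracted arXiv link
    },
    "confidence": 1,                                  # fixed at 1 for this regex technique
    "technique": "regular_expression",                # constants.TECHNIQUE_REGULAR_EXPRESSION (assumed rendering)
    "source": "https://github.com/owner/repo/blob/main/README.md",  # readme_source (hypothetical)
}
```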
29 changes: 25 additions & 4 deletions src/somef/regular_expressions.py
@@ -187,21 +187,42 @@ def extract_repo_status(unfiltered_text, repository_metadata: Result, readme_sou
     return repository_metadata
 
 
-def extract_arxiv_links(unfiltered_text):
-    """Extracts arxiv links from a given text"""
+def extract_arxiv_links(unfiltered_text, repository_metadata: Result, readme_source) -> Result:
+    """
+    Regexp to extract arXiv URLs from a repository
+    Parameters
+    ----------
+    @param unfiltered_text: repo text
+    @param repository_metadata: Result with the extractions so far
+    @param readme_source: URL of the file used (for provenance)
+    Returns
+    -------
+    @returns a Result including the arXiv URLs
+    """
     result_links = [m.start() for m in re.finditer('https://arxiv.org/', unfiltered_text)]
     result_refs = [m.start() for m in re.finditer('arXiv:', unfiltered_text)]
     results = []
     for position in result_links:
         end = unfiltered_text.find(')', position)
+        if end < 0:
+            end = unfiltered_text.find('}', position)
         link = unfiltered_text[position:end]
         results.append(link)
     for position in result_refs:
         end = unfiltered_text.find('}', position)
         link = unfiltered_text[position:end]
         results.append(link.replace('arXiv:', 'https://arxiv.org/abs/'))
-    return results
+    for link in set(results):
+        repository_metadata.add_result(constants.CAT_RELATED_PAPERS,
+                                       {
+                                           constants.PROP_TYPE: constants.URL,
+                                           constants.PROP_VALUE: link
+                                       },
+                                       1, constants.TECHNIQUE_REGULAR_EXPRESSION, readme_source)
+    return repository_metadata
 
 
 def extract_wiki_links(unfiltered_text, repo_url, repository_metadata: Result, readme_source) -> Result:
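
For illustration, here is a minimal usage sketch of the new signature, mirroring the tests added later in this commit. The `Result` import path is an assumption (the test imports are not shown in this diff), and the sample text is invented:

```python
# Minimal sketch of calling the new extract_arxiv_links (assumptions noted above)
from somef import regular_expressions
from somef.process_results import Result  # assumed location of the Result class
from somef.utils import constants

sample_text = "See our paper (https://arxiv.org/abs/2203.01044) or cite {arXiv:1907.11111}"
metadata = regular_expressions.extract_arxiv_links(sample_text, Result(), "README.md")

# Each unique link is stored under related_papers with confidence 1 and its provenance
for entry in metadata.results[constants.CAT_RELATED_PAPERS]:
    print(entry['result']['value'])
# Expected output (order may vary, since links pass through a set):
#   https://arxiv.org/abs/2203.01044
#   https://arxiv.org/abs/1907.11111
```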
135 changes: 135 additions & 0 deletions src/somef/test/test_data/test_issue_181_2.txt
@@ -0,0 +1,135 @@
<img src="img/combine.png" style="zoom:100%;" />

<p align="center"><a href="https://github.com/THUDM/SelfKG/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/THUDM/SelfKG" /></a>

# SelfKG: Self-Supervised Entity Alignment in Knowledge Graphs

Original implementation for the paper *SelfKG: Self-Supervised Entity Alignment in Knowledge Graphs*.

This paper has been accepted and **nominated as a best paper** by [The Web Conference 2022](https://www2022.thewebconf.org/)! :satisfied:

SelfKG is the **first** **self-supervised** entity alignment method **without label supervision**, which can **match or achieve comparable results with state-of-the-art supervised baselines**. The performance of SelfKG suggests self-supervised learning offers great potential for entity alignment in Knowledge Graphs.

[SelfKG: Self-Supervised Entity Alignment in Knowledge Graphs](https://arxiv.org/abs/2203.01044)

https://doi.org/10.1145/3485447.3511945

- [Installation](#installation)
- [Requirements](#requirements)
- [Quick Start](#quick-start)
- [Data Preparation](#data-preparation)
- :star:[Run Experiments](#run-experiments)
- [❗ Common Issues](#-common-issues)
- [Citing SelfKG](#citing-selfkg)

## Installation

### Requirements

```txt
torch==1.9.0
faiss-cpu==1.7.1
numpy==1.19.2
pandas==1.0.5
tqdm==4.61.1
transformers==4.8.2
torchtext==0.10.0
```

You can use [`setup.sh`](https://github.com/THUDM/SelfKG/blob/main/setup.sh) to set up your Anaconda environment by

```bash
bash setup.sh
```



## Quick Start

### Data Preparation

You can download our data from [here](https://zenodo.org/record/6326870#.YiI2K6tBxPY), and the final structure of our project should be:

```bash
├── data
│   ├── DBP15K
│   │   ├── fr_en
│   │   ├── ja_en
│   │   └── zh_en
│   ├── DWY100K
│   │   ├── dbp_wd
│   │   └── dbp_yg
│   └── LaBSE
│       ├── bert_config.json
│       ├── bert_model.ckpt.index
│       ├── checkpoint
│       ├── config.json
│       ├── pytorch_model.bin
│       └── vocab.txt
│   └── getdata.sh
├── loader
├── model
├── run.sh # Please use this bash to run the experiments!
├── run_DWY_LaBSE_neighbor.py # SelfKG on DWY100k
├── run_LaBSE_neighbor.py # SelfKG on DBP15k
... # run_LaBSE_*.py # Ablation code will be available soon
├── script
│   └── preprocess
├── settings.py
└── setup.sh # Can be used to set up your Anaconda environment
```

You can also use the following scripts to download the datasets directly:

```bash
cd data
bash getdata.sh # The download speed depends on your network connection. If it's pretty slow, please directly download the datasets from the website as mentioned before.
```

### :star:Run Experiments

**Please use**

**`bash run.sh`**

to reproduce our experiment results. For more details, please refer to [`run.sh`](https://github.com/THUDM/SelfKG/blob/main/run.sh) and our code.

## ❗ Common Issues

<details>
<summary>
"XXX file not found"
</summary>
<br/>
Please make sure you've downloaded all the datasets according to the README.
</details>


to be continued ...


## Citing SelfKG

If you use SelfKG in your research or wish to refer to the baseline results, please use the following BibTeX.

```
@article{DBLP:journals/corr/abs-2203-01044,
  author     = {Xiao Liu and
                Haoyun Hong and
                Xinghao Wang and
                Zeyi Chen and
                Evgeny Kharlamov and
                Yuxiao Dong and
                Jie Tang},
  title      = {SelfKG: Self-Supervised Entity Alignment in Knowledge Graphs},
  journal    = {CoRR},
  volume     = {abs/2203.01044},
  year       = {2022},
  url        = {https://arxiv.org/abs/2203.01044},
  eprinttype = {arXiv},
  eprint     = {2203.01044},
  timestamp  = {Mon, 07 Mar 2022 16:29:57 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2203-01044.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}
```
110 changes: 110 additions & 0 deletions src/somef/test/test_data/test_issue_181_3.txt
@@ -0,0 +1,110 @@
# MultiDepth

Source code for MultiDepth, our single-image depth estimation method based on joint regression and classification in a multi-task setup.
This work was presented at the IEEE Intelligent Transportation Systems Conference (ITSC) 2019.

If you make use of our code or approach, please consider citing [our paper](https://arxiv.org/abs/1907.11111) as:

    @InProceedings{,
      author = {Lukas Liebel and Marco K\"orner},
      title = {{MultiDepth}: Single-Image Depth Estimation via Multi-Task Regression and Classification},
      booktitle = {IEEE Intelligent Transportation Systems Conference (ITSC)},
      year = {2019}
    }

Check out the [KITTI leaderboard](http://www.cvlibs.net/datasets/kitti/eval_depth.php?benchmark=depth_prediction) for exemplary results I got using this concept.

> I'm confident that you should be able to achieve better results with more training and some minor tweaks.

This implementation is heavily based on [pytorch-semseg](https://github.com/meetshah1995/pytorch-semseg), a brilliant project maintained by Meet Pragnesh Shah (released under [MIT license](https://github.com/meetshah1995/pytorch-semseg/blob/master/LICENSE)).
Please check out and contribute to their project and feel free to ignore certain parts in my code that are just unused parts of the *ptsemseg* codebase.


## Step-by-step Instructions

### A Word of Warning

I originally wrote this code for a different project.
While most of the unnecessary (and some of the confusing) pieces have already been removed, it might still contain some cryptic lines.
Just ignore them and you should be fine ;)

> *Sorry for the mess :D If you are/were/know a PhD student you know the drill...*

### Docker Container

This repository comes with a Dockerfile allowing you to build an image that can be used to optionally run training inside a container.
Just skip the respective steps in the following instructions if you do not wish to use docker.

> Please note that I highly recommend using docker and never tried to run the provided code outside of a container.

0. *(optional)* Adjust the [Dockerfile](docker/Dockerfile) if needed (e.g., add helpful utils, such as tmux, htop, etc.).
> To change this later just stop running containers, re-build the image and restart the container.

1. Go to the [docker dir](docker).
Build the MultiDepth docker image by running the respective [script](docker/build_image.sh): `./build_image.sh`

2. Adjust the mount parameters of your container in the provided [script](docker/start_container.sh), such that the directories containing your training data are mounted to `/root/data/kitti/rgb` and `/root/data/kitti/depth`.
> Feel free to change this if you want to use a different dir tree.
Keep in mind that it will be necessary to adjust the paths in other places accordingly.

You can also mount an external directory to `/root/logs` in order to save tensorboard logs and checkpoints outside of the container.

3. Start your container by running the [script](docker/start_container.sh): `./start_container.sh`

4. Connect to the running container, e.g., by running `docker exec -it multidepth bash` or by simply calling the provided minimal [script](docker/connect_to_container.sh): `./connect_to_container.sh`

5. To stop the container simply disconnect from the container (e.g., by pressing [Ctrl] + [D]) and kill it: `docker kill multidepth`.

> If you are familiar with docker, you probably know better ways of starting and stopping containers as well as running scripts within them :)


### Set Training Parameters

You can adjust training behavior and numerous other options using a [YAML configuration file](configs/example_config.yml).
Most of the parameters in the example config should be self-explanatory, and they are already set to useful values.

> I might add a more detailed explanation in the future.
Until then, feel free to message me if you have trouble with understanding their effect and I will update this section accordingly.


### Run Training

Run the [main training script](train.py) which expects a single parameter `--config` specifying the path to a configuration file, e.g.: `python train.py --config configs/example_config`.

> Note that it might take a while for the actual training process to start depending on the size of your dataset.


### Visualize Training Progress

The training script will write Tensorboard logs to the directory specified in the [config file](configs/example_config.yml).
Display the results by starting Tensorboard and directing it to the respective log dir.

You could do this by starting another docker container with TensorFlow: `docker run --rm -it -p 6006:6006 -v ~/path/to/my/logs:/root/logs tensorflow/tensorflow`

Make sure to mount the correct data dir and map a different port if necessary (6006 is Tensorboard's standard port).
This will allow you to access the web interface of Tensorboard running on a server from your local machine.

> This works for me in certain settings but your mileage will vary depending on your network configuration!

Start tensorboard: `tensorboard --logdir /root/logs` and navigate to [your server's ip/localhost]:6006 to access the web interface in your favorite web browser.


### Evaluate Results

Mid-training validation will be carried out from time to time according to your [config file](configs/example_config.yml).


## Hardware Requirements

Even though a CUDA-capable GPU is not strictly required to run this training script, it is highly recommended for obvious reasons.
Adjust the batch size if you run out of memory.
Successfully tested on 1080 and 1080Ti GPUs.
**Multi-GPU training with batch-splitting will be used if you provide multiple GPUs!**


## Contribute

If you encounter any errors or unexpected behavior feel free to message me.
You are also welcome to file pull requests if you want to help improve or fix any part of this.

**Thank you!**
32 changes: 25 additions & 7 deletions src/somef/test/test_regular_expressions.py
@@ -175,13 +175,31 @@ def test_issue_images(self):
             print(img)
         assert len(img) == 2
 
-    # Test commented out because arxiv links with no context has demonstrated not to be useful
-    # def test_issue_181(self):
-    #     """Test designed to check if arxiv papers are detected"""
-    #     with open(test_data_path + "test_issue_181.txt", "r") as data_file:
-    #         test_text = data_file.read()
-    #         arxiv_links = regular_expressions.extract_arxiv_links(test_text)
-    #         assert len(arxiv_links) > 0
+    # Tests for issue 181 re-enabled: arxiv links are now recorded as related papers with provenance
+    def test_issue_181(self):
+        """Test designed to check if arxiv papers are detected"""
+        with open(test_data_path + "test_issue_181.txt", "r") as data_file:
+            test_text = data_file.read()
+            result = regular_expressions.extract_arxiv_links(test_text, Result(), test_data_path + "test_issue_181.txt")
+            arxiv_urls = result.results[constants.CAT_RELATED_PAPERS]
+            assert len(arxiv_urls) > 0
+
+    def test_issue_181_2(self):
+        """Test designed to check if arxiv papers are detected"""
+        with open(test_data_path + "test_issue_181_2.txt", "r") as data_file:
+            test_text = data_file.read()
+            result = regular_expressions.extract_arxiv_links(test_text, Result(), test_data_path + "test_issue_181_2.txt")
+            arxiv_url = result.results[constants.CAT_RELATED_PAPERS][0]['result']['value']
+            expected_result = "https://arxiv.org/abs/2203.01044"
+            self.assertEqual(expected_result, arxiv_url)
+
+    def test_issue_181_3(self):
+        """Test to check an arxiv link embedded as a URL, including the same link in bibtex"""
+        with open(test_data_path + "test_issue_181_3.txt", "r") as data_file:
+            test_text = data_file.read()
+            result = regular_expressions.extract_arxiv_links(test_text, Result(),
+                                                             test_data_path + "test_issue_181_3.txt")
+            arxiv_url = result.results[constants.CAT_RELATED_PAPERS][0]['result']['value']
+            expected_result = "https://arxiv.org/abs/1907.11111"
+            self.assertEqual(expected_result, arxiv_url)
 
     def test_issue_270(self):
         """Test designed to check if support channels are detected"""
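
To run just this module after applying the commit, a sketch along these lines should work from the repository root (assuming the standard unittest layout shown in this diff; adjust the path if your checkout differs):

```python
# Sketch: discover and run the regular-expression tests (layout assumed from this diff)
import unittest

suite = unittest.defaultTestLoader.discover(
    "src/somef/test", pattern="test_regular_expressions.py"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```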
1 change: 1 addition & 0 deletions src/somef/utils/constants.py
@@ -69,6 +69,7 @@
 CAT_PROGRAMMING_LANGUAGES = "programming_languages"
 CAT_README_URL = "readme_url"
 CAT_RELATED_DOCUMENTATION = "related_documentation"
+CAT_RELATED_PAPERS = "related_papers"
 CAT_RELEASES = "releases"
 CAT_RUN = "run"
 CAT_STATUS = "repository_status"
