Squashed 'pages/libs/mpxsonar/' changes from 6c083cd..91422cf

91422cf fix merge 673fcc9 change == to is 59f49b0 Patch mpoxsonar (#2) b403feb minor change in dbm 8892a93 Update MpoxSonar 0.5.3 Merge commit 'd20960430509ef4fc45455fdb206e7ef5c8030f5' into note/dev 58c001e fix matching function 668a8a9 qMerge commit '55314730f7bcd1e927c95bb9befc780fdb8ee3c6' into note/dev ef06eca update NCBI download 09b494e Merge commit '70b50bcac89349125d5f077daa004f97079ac953' into note/dev c5854d7 fix count issue bf0fd53 Patch V.0.3.5 (#7) a0572a0 fix delete function 7618df4 Update Dev branch (#5) 63ee752 Update patch 0.5.1 (#1) 541d76a update 0.5.1, fix bugs 4d9fd3b update 0.5.1 9f71aaf Merge branch 'note/dev' of https://github.com/rki-mf1/MpoxRadar into note/dev 7e9afad Merge commit 'f4e9902d0b5fb8806264882d42b61db87589d3de' as 'pages/libs/mpxsonar' 5568533 remove mpxsonar 245d27d update the site dbdb006 update mpoxsonar repo. 9c147cb update NT display 570b4ff fix bug 55cc0b4 Update README.md 72e976f Merge pull request #12 from ferbsx/add-license-1 8b5a311 Create LICENSE.md 6c98df9 The first version to the mpoxradar.net (#11) acb06dd The first version to the mpoxradar.net (#11) b99fb44 terminology correction 9f4c00e replaced file for the correct name f427576 correcting name c14d9aa Update about.py 18333e7 FAQ edit according to alice comment 751c5f7 edit terminology: mpox, MPoxRadar d015854 edit wrong links with anchor tag bc7a09c Merge pull request #10 from ferbsx/injun/dev 268e798 Bugremovung adbb052 Merge pull request #9 from ferbsx/injun/dev e1cab9c Merge remote-tracking branch 'origin/dev' into injun/dev 5c989dc Merge remote-tracking branch 'origin/dev' into injun/dev 3cbcf5b Merge pull request #8 from ferbsx/ivan/dev 473f845 Merge branch 'injun/dev' into ivan/dev a8b966f Merge branch 'injun/dev' into ivan/dev 3b32162 garbage cleaning 4cd7dd5 Update help.py 40d86d8 Update about.py abec293 MPXdatabase dump added 892b0d2 Update util_help_tables.py 2e2d17d Update util_tool_checklists.py f9159ca Update util_footer_table.py 08cbcd2 Update 
contact.py 518d27c 1. changed structure 2. !!!callbacks doent trigger ee77cd4 csv-file in case of no working database - for developemnt only 3594609 data.py connects to a mpox-database and gets a dictionary of dataframes 94225ad stylized names, texts and titles 076fc28 this is Ivan's tool, which is ahead Injun's (--> trying to make one branck/fork from these two, to then put in dev) 7a177bd merge change 829b200 merge change 80833ec map.py is now unncessary 7efebb8 jorge_tool.py is now unnecessary 59e213c pointing at where to replace the map-figures ca7d045 adding an example to run app.py with different env-paramenters - usefull when developing with accesss to local databases c43399a fixed zoom in non-jorge's map 5b9ab7b added jorge's code 36d5ff6 updated personal notesvim 599909c updated personal notes 3de147b commented out the whole file to not interfere with pages/tool.py and avoid Werkzeug-Exception 98d2171 location_coordinates.csv file from Ivan 99980ab Data.csv file from Ivan b36051c added import os in order to make pages/*.csv relative paths, note absolute paths 8fb0984 jorge's personal notes b9e4854 copy of jorge's tool.py to be merged with dev's tool.py 024927a copy of jorge's data.py for dev-branch e4bd4a0 rephrasing text. 
from jorge's branch to dev 364ecb3 general .env.template from ivan's to injun's 47d9421 added one of the two data-files from Ivan 80dd223 resize logo e6e9b02 Note/dev (#6) 9e2ec74 Note/dev (#6) 4757c44 new design (from Injun) 12c8197 Merge branch 'dev' of https://github.com/ferbsx/MPXRadar-frontend into dev aac9abb help page added 694caa8 edit Datasource Link cb54534 add page for table, edit for callback 1f79d23 data for geo 71e1e94 make footer&tool.checkout, edit some pages ee6e064 callbacks to the separate file dfed955 added map animation 9026b9f data f0a8727 Q&A on help.py, add photo, edit footer 9c4c7fa web page ceate and edit 0f50ada web page ceate and edit dd44c8b edit from evan/dev dbe41b2 edit from evan/dev 82a9053 map working instruments 96345f4 add style and map.py file 108070d playing with map 02da350 adding dbc components 3f5ef9d update mpxsonar 0.4.3, merge ec3196b update mpxsonar 0.4.3, merge 5de8641 update mpxsonar 0.4.3, merge 5bace10 update mpxsonar 0.4.3, merge 2744ff3 Squashed 'pages/libs/mpxsonar/' changes from 36581b5..0ffa7eb 8d95d44 note/dev (#5) 159896e Note/dev -Update MPXsonar 0.4.2 (#4) fd76d1d Note/dev -Update MPXsonar 0.4.2 (#4) a9c407b map manipulatios 85542e7 pages/home.py 6bea1dd cosmeic changes 3871a65 cosmetic changes 2c3c307 fixed internal address problems 41e3bd3 added an example map 922bcf6 some details fd9050e some details e210173 some details 08761c1 new design details 3d00552 Delete .gitkeep 95ba64a Add files via upload 9c303b6 Create .gitkeep 13801c7 Add files via upload b6fcb62 Merge commit 'f4be82d7efccd85009e4f538b5d52978ba74d4d2' as 'pages/libs/mpxsonar' f4be82d Squashed 'pages/libs/mpxsonar/' content from commit 36581b5 4ef7779 Merge commit 'f4be82d7efccd85009e4f538b5d52978ba74d4d2' as 'pages/libs/mpxsonar' 3943203 remove subtree bugs 5623f35 Update to latest Note/dev (#3) b5cca15 Update UI (#2) be05693 add pages contact, edit app&about pages, add assets folder for imgae files c82a67e making new repo for mpx ABOUT page 
f56f892 making new repo for mpx ABOUT page 7dfdd5f update MPXSonar 0.3.7 (#1) 9112085 update MPXSonar 0.3.7 (#1) 40a838f update config.yml and *.md db4ba15 setup *.md docs 6de6fe5 setup *.md docs 5a39726 Merge commit '555d93ff42af3d2468eb950ff0d2c8548b8b179e' 0.3.1 into note/dev 555d93f Squashed 'libs/mpxsonar/' changes from 50172ad..ae11fb4 7fc1420 add mpxsonar subtree b0eddcc Squashed 'libs/mpxsonar/' content from commit 50172ad 511b011 Merge commit 'b0eddcc84014b880deddc0839a64bc4435b8237e' as 'libs/mpxsonar' 3c15519 Initial commit git-subtree-dir: pages/libs/mpxsonar git-subtree-split: 91422cfde7b7bdddc4465ec50e52fddd46793a7c
rki-mf1 · Sep 27, 2023 · 89dc3b3 · 89dc3b3
1 parent f4e9902
commit 89dc3b3
Show file tree

Hide file tree

Showing 16 changed files with 16,854 additions and 766 deletions.
diff --git a/.env.template b/.env.template
@@ -1,9 +1,3 @@
-# DASH APP
-SERVER="0.0.0.0"
-PORT="80"
-SECRET_KEY="XX58602833XX"
-DEBUG=True
-
 # Database URL with generic format of connection.
 # https://USERNAME:PASSWORD@IP:PORT/DBNAME
 DB_URL="https://super_user:123456@localhost:3306/mpx"
@@ -12,11 +6,12 @@ DB_URL="https://super_user:123456@localhost:3306/mpx"
 # DEBUG, INFO, WARNING, ERROR, CRITICAL
 LOG_LEVEL=DEBUG
 
+
+# ------- For NCBI downloader -------
 # NCBI API-KEY
 # To get API key https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
 NCBI_API_KEY=""
 NCBI_TOOL="MPXSonar"
 NCBI_EMAIL=""
-
-# For NCBI downloader
-SAVE_PATH = ""
+# Directory where the downloader writes its output files.
+SAVE_PATH = "/data/prod/download"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,20 +1,20 @@
-# Welcome to the covsonar contributing guide <!-- omit in toc -->
+# Welcome to the mpoxsonar contributing guide <!-- omit in toc -->
 
 In this guide you will get an overview of the contribution workflow from setting up a development environment, testing your changes, submitting a pull request and performing a release.
 
 Use the table of contents icon on the top left corner of this document to get to a specific section of this guide quickly.
 
 ## TLDR; I want to start hacking now!
 
-Clone the repo, and the you can run these commands from within the `covsonar/` directory:
+Clone the repo, and then you can run these commands from within the `mpoxsonar/` directory:
 
 ```sh
-mamba create -n covsonar-dev python=3.9 poetry fortran-compiler nox pre-commit emboss=6.6.0
-mamba activate covsonar-dev  # needs to be activated for the following commands to work
+mamba create -n mpoxsonar-dev python=3.10 poetry fortran-compiler nox pre-commit emboss=6.6.0
+mamba activate mpoxsonar-dev  # needs to be activated for the following commands to work
 git config blame.ignoreRevsFile .git-blame-ignore-revs  # ignore black reformatting when doing git blame
 pre-commit install  # install pre-commit hooks for formatting and linting
-poetry install  # install current source of covsonar and its dependencies
-poetry run covsonar <args>  # run covsonar
+poetry install  # install current source of mpoxsonar and its dependencies
+poetry run sonar <args>  # run sonar
 nox  # run linting and pytest tests (add -r to reuse previously built environments)
 nox -rs zimports black  # auto format imports and code
 ```
@@ -25,37 +25,37 @@ To get an overview of the project itself, read the [README](README.md).
 
 ## Getting started
 
-covsonar is written in Python and tries to follow the excellent packaging guidelines ["Hypermodern Python" by Claudio Jolowicz](https://cjolowicz.github.io/posts/hypermodern-python-01-setup/). Nevertheless, there are some places where covsonar differs from those guidelines, and we have tried to outline those differences here wherever relevant. The main differences are caused by most work on covsonar happening in an environment where administrator access is not available (a shared Linux HPC), and also because we want our package to be installable via [conda](https://docs.conda.io/en/latest/index.html) or [mamba](https://github.com/mamba-org/mamba), from the [bioconda](https://bioconda.github.io/) channel in particular.
+mpoxsonar is written in Python and tries to follow the excellent packaging guidelines ["Hypermodern Python" by Claudio Jolowicz](https://cjolowicz.github.io/posts/hypermodern-python-01-setup/). Nevertheless, there are some places where mpoxsonar differs from those guidelines, and we have tried to outline those differences here wherever relevant. The main differences are caused by most work on mpoxsonar happening in an environment where administrator access is not available (a shared Linux HPC), and also because we want our package to be installable via [conda](https://docs.conda.io/en/latest/index.html) or [mamba](https://github.com/mamba-org/mamba), from the [bioconda](https://bioconda.github.io/) channel in particular.
 
 ### Setting up your development tools
 
-Some tooling needs to be set up before you can work on covsonar. To install this we use mamba, a faster replacement for the conda package manager, and place them in their own environment:
+Some tooling needs to be set up before you can work on mpoxsonar. To install this we use mamba, a faster replacement for the conda package manager, and place them in their own environment:
 
 ```sh
-mamba create -n covsonar-dev python=3 poetry fortran-compiler nox pre-commit
+mamba create -n mpoxsonar-dev python=3 poetry fortran-compiler nox pre-commit
 ```
 
 Then when you want to work on the project, or at the very least if you want to use poetry commands or run tests, you need to switch to this environment:
 
 ```sh
-mamba activate covsonar-dev
+mamba activate mpoxsonar-dev
 ```
 
-The rest of this document assumes that you have the covsonar-dev environment active.
+The rest of this document assumes that you have the mpoxsonar-dev environment active.
 
-Once you have that environment installed and activated, you can run covsonar:
+Once you have that environment installed and activated, you can run mpoxsonar:
 
 ```sh
-poetry run covsonar --help
+poetry run sonar --help
 ```
 
 ### Installing the package
 
-As you're developing, you can install what you have developed using poetry install into your covsonar-dev conda environment:
+As you're developing, you can install what you have developed using poetry install into your mpoxsonar-dev conda environment:
 
 ```sh
 poetry install
-covsonar --version
+sonar --version
 ```
 
 ### Testing
@@ -154,13 +154,13 @@ $ poetry env info
 Virtualenv
 Python:         3.10.4
 Implementation: CPython
-Path:           /home/<redacted>/.conda/envs/covsonar-dev
+Path:           /home/<redacted>/.conda/envs/mpoxsonar-dev
 Valid:          True
 
 System
 Platform: linux
 OS:       posix
-Python:   /home/<redacted>/.conda/envs/covsonar-dev
+Python:   /home/<redacted>/.conda/envs/mpoxsonar-dev
 ```
 
 If you decide to rename your conda development environment or have multiple projects and decide to use multiple conda environments, then you might have to switch the environment that poetry is using. This can be done by running the commands:

diff --git a/NCBI.downloader.py b/NCBI.downloader.py
@@ -12,27 +12,64 @@
 import datetime
 import logging
 import os
+import random
 import sys
 import time
 import traceback
 from urllib.error import HTTPError
+from urllib.parse import urlparse
 
 from Bio import Entrez
 from Bio import SeqIO
 import dateparser
 from dotenv import load_dotenv
+import mariadb
 
 load_dotenv()
 LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG")
-REF_LIST = [
-    "NC_063383.1",
-    "ON563414.3",
-    "MT903344.1",
+IGNORE_LIST = [
+    "NC_063383.1",  # REF
+    "ON563414.3",  # REF
+    "MT903344.1",  # REF
+    "KJ136820.1",
+    "FV537351.1",
+    "FV537352.1",
+    "OX044338.1",
+    "OX009124.1",
+    "NC_003310.1",  # from 1996
+    "8HG1_T",
+    "8HG1_P",
 ]
 
 Entrez.api_key = os.getenv("NCBI_API_KEY", "")
 Entrez.tool = os.getenv("NCBI_TOOL", "")
 Entrez.email = os.getenv("NCBI_EMAIL", "")  # Always tell NCBI who you are
+URI = urlparse(os.getenv("DB_URL", ""))
+# connection parameters
+
+
+def get_existing_sample_list():
+
+    database = URI.path.replace("/", "")
+    conn_params = {
+        "user": URI.username,
+        "password": URI.password,
+        "host": URI.hostname,
+        "port": URI.port,
+        "database": database,
+    }
+    # Establish a connection
+    connection = mariadb.connect(**conn_params)
+    cursor = connection.cursor()
+    # retrieve data
+    cursor.execute("SELECT name FROM sample;")
+    # collect the sample names (first column of each returned row)
+    db_sample_list = [item[0] for item in cursor.fetchall()]
+
+    # free resources
+    cursor.close()
+    connection.close()
+    return db_sample_list
 
 
 def download(save_path):  # noqa: C901
@@ -56,7 +93,7 @@ def download(save_path):  # noqa: C901
 
             record = Entrez.read(handle)
             total_count = record["Count"]
-            logging.info("Total sample to download: %s " % (total_count))
+            logging.info("All samples are found: %s " % (total_count))
 
             handle = Entrez.esearch(
                 db=DB,
@@ -67,16 +104,23 @@ def download(save_path):  # noqa: C901
             )
             record = Entrez.read(handle)
             # setup cache
-            time.sleep(1)
+            time.sleep(random.randint(3, 6))
             # print(record)
             id_list = record["IdList"]
+            db_sample_list = get_existing_sample_list()
+            id_list = list(set(id_list) - set(db_sample_list))
+            total_count = len(id_list)
+
+            logging.info("Remaining samples after check: %s " % (total_count))
             search_results = Entrez.read(Entrez.epost(DB, id=",".join(id_list)))
             webenv = search_results["WebEnv"]
             query_key = search_results["QueryKey"]
-            time.sleep(1)
+
             success = True
         except Exception as e:
             logging.error("Error at %s", "getting ID", exc_info=e)
+            logging.info("Reties to reconnect...")
+            time.sleep(random.randint(10, 20))
     handle.close()
     if attempt == 3 and not success:
         return False
@@ -85,6 +129,7 @@ def download(save_path):  # noqa: C901
         file_log_handler = open(os.path.join(save_path, ".download.log"), "w+")
     else:
         file_log_handler = open(os.path.join(save_path, ".download.log"), "r+")
+
     try:
         _start = int(file_log_handler.read())
         logging.info(f"Resume previous download: start at {_start}")
@@ -120,15 +165,16 @@ def download(save_path):  # noqa: C901
                 if 500 <= err.code <= 599:
                     logging.warning(f"Received error from server {err}")
                     logging.warning("Attempt {attempt} of 3")
-                    time.sleep(5)
+                    time.sleep(random.randint(30, 60))
                 if 400 == err.code:
                     logging.warning(f"Received error from server {err}")
                     logging.warning("Attempt {attempt} of 3")
-                    time.sleep(5)
+                    time.sleep(random.randint(30, 60))
                 else:
                     raise
             except Exception as e:
                 logging.error("Error at %s", "download sample", exc_info=e)
+                time.sleep(random.randint(3, 6))
 
         if attempt == 3 and not success:
             fetch_handle.close()
@@ -145,7 +191,7 @@ def download(save_path):  # noqa: C901
         file_log_handler.seek(0)
         file_log_handler.write(str(end))
         file_log_handler.truncate()
-        time.sleep(2)
+        time.sleep(random.randint(3, 6))
 
     with open(os.path.join(save_path, ".download.success"), "w") as f:
         f.writelines("done")
@@ -161,7 +207,6 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
         if x.endswith(".GB"):
             # Prints only text file present in My Folder
             list_of_GB.append(os.path.join(save_download_path, x))
-    # TODO: remove reference genome from the list.
 
     # fasta & meta
     fasta_out_handler = open(os.path.join(save_final_path, "seq.fasta"), "w")
@@ -175,14 +220,17 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
         "RELEASE_DATE",
         "COLLECTION_DATE",
         "SEQ_TECH",
+        "HOST",
+        "GENOME_COMPLETENESS",
     ]
     meta_out_handler.write("\t".join(header) + "\n")  # Write the header line
     try:
         for _file in list_of_GB:
             logging.info("Load:" + _file)
             seq_GBrecords = list(SeqIO.parse(_file, "genbank"))
             for seq_record in seq_GBrecords:
-                if seq_record.id in REF_LIST:
+                # skip reference genomes and any other accession in IGNORE_LIST.
+                if seq_record.id in IGNORE_LIST:
                     continue
 
                 _isolate = ""
@@ -191,6 +239,8 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
                 _NCBI_release_date = ""
                 _collection_date = ""
                 _seq_tech = ""
+                _nuc_completeness = ""
+                _host = ""
                 # print("Dealing with GenBank record %s" % seq_record.id)
 
                 fasta_out_handler.write(
@@ -199,6 +249,12 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
                 )
 
                 # assume all keys are exit.
+                if "partial" in seq_record.description:
+                    _nuc_completeness = "partial"
+                elif "complete" in seq_record.description:
+                    _nuc_completeness = "complete"
+                if "host" in seq_record.features[0].qualifiers:
+                    _host = seq_record.features[0].qualifiers["host"][0]
                 if "isolate" in seq_record.features[0].qualifiers:
                     _isolate = seq_record.features[0].qualifiers["isolate"][0]
                 if "country" in seq_record.features[0].qualifiers:
@@ -210,8 +266,10 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
                     _collection_date = seq_record.features[0].qualifiers[
                         "collection_date"
                     ][0]
-                    # 1.) need to fix date Nov-2017 -> 2017-11-01, 09-Nov-2017 -> 2017-11-09
-                    # 1995 -> 1995-01-01 set default value with first day of
+                    # Step
+                    # 1.) Fix date;
+                    # * Nov-2017 -> 2017-11-01, 09-Nov-2017 -> 2017-11-09
+                    # * 1995 -> 1995-01-01 set default value with first day of
                     # the month and first month of the year
                     # 2.) Year needs to be present in the format.
 
@@ -238,15 +296,15 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
 
                 if "date" in seq_record.annotations:
                     _NCBI_release_date = seq_record.annotations["date"]
-                    # need to fix date 18-NOV-2022 -> 2022-11-18
+                    # Fix date; 18-NOV-2022 -> 2022-11-18
                     d = dateparser.parse(
                         _NCBI_release_date,
                         settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": "YMD"},
                     )
                     _NCBI_release_date = d.strftime("%Y-%m-%d")
 
                 meta_out_handler.write(
-                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
+                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                     % (
                         seq_record.id,
                         _isolate,
@@ -256,6 +314,8 @@ def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901
                         _NCBI_release_date,
                         _collection_date,
                         _seq_tech,
+                        _host,
+                        _nuc_completeness,
                     )
                 )
     except Exception:
@@ -295,7 +355,7 @@ def run(args):
             logging.StreamHandler(),
         ],
     )
-    logging.info("Script version: 1")
+    logging.info("Script version: 1.1")
     logging.info("Save output to:" + SAVE_PATH)
 
     save_download_path = os.path.join(SAVE_PATH, "GB")
@@ -318,16 +378,16 @@ def run(args):
 
     # 3
     logging.info("--- Convert GeneBank to fasta and meta file ---")
-    if not os.path.exists(os.path.join(SAVE_PATH, ".success")):
-        if generate_outputfiles(save_download_path, save_final_path):
+    # if not os.path.exists(os.path.join(SAVE_PATH, ".success")):
+    if generate_outputfiles(save_download_path, save_final_path):
 
-            logging.info("Processing completed")
-        else:
-            logging.error("Process stop before it is finished")
-            sys.exit("Please rerun it again later.")
+        logging.info("Processing completed")
+    else:
+        logging.error("Process stop before it is finished")
+        sys.exit("Please rerun it again later.")
 
-        with open(os.path.join(SAVE_PATH, ".success"), "w+") as f:
-            f.write("done")
+    with open(os.path.join(SAVE_PATH, ".success"), "w+") as f:
+        f.write("done")
     logging.info("--- Done ---")