Squashed 'pages/libs/mpxsonar/' changes from 36581b5..0ffa7eb

0ffa7eb fix match command
199b4a9 manual merge note/dev
397c013 Untrack files in .gitignore
cf24653 update NCBI.downloader and pipeline.sh, fix minor bugs
c385f653 add multithreads to support paranoid test
5906b20 fix bugs
4817043 patch bug on matching
8315835 add ncbi download script
700db00 update to version 0.3.8
5cde748 fix bugs

git-subtree-dir: pages/libs/mpxsonar
git-subtree-split: 0ffa7eba073b416835c7b196b20c79d9345b9dd6
1 parent f4be82d · commit 2744ff3 · showing 14 changed files with 1,430 additions and 326 deletions.
.gitignore
@@ -164,3 +164,9 @@ cython_debug/

# Test data
!tests/data/cache-test/var

# NCBI
epost.dtd

# pipeline
pipeline.sh
NCBI download script (new file)
@@ -0,0 +1,345 @@
#!/usr/bin/env python3
# Author: K2.
# Steps in this file:
# 1. Search the query term, then get the total count.
# 2. Fetch in batches of BATCH_SIZE and save into temporary files.
# 3. Parse the records and save them as TSV and FASTA.
# ----
# 4. A log file keeps track of the process.
# 5. A .success file indicates this folder is ready to import.
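
# Example invocation (a sketch; the filename NCBI.downloader.py is assumed
# from the commit log, the flags are the ones defined in __main__ below):
#   python3 NCBI.downloader.py --date 2022-11-18 --output /data/mpx/2022-11-18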

import argparse
import datetime
import logging
import os
import sys
import time
import traceback
from urllib.error import HTTPError

from Bio import Entrez
from Bio import SeqIO
import dateparser
from dotenv import load_dotenv

load_dotenv()
LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG")
REF_LIST = [
    "NC_063383.1",
    "ON563414.3",
    "MT903344.1",
]
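# Reference genomes: generate_outputfiles() below skips any record whose
# accession appears in this list.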

Entrez.api_key = os.getenv("NCBI_API_KEY", "")
Entrez.tool = os.getenv("NCBI_TOOL", "")
Entrez.email = os.getenv("NCBI_EMAIL", "")  # Always tell NCBI who you are
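# A minimal .env consumed by load_dotenv() above could look like this
# (the variable names are the ones this script reads; values are placeholders):
#   NCBI_API_KEY=0123456789abcdef
#   NCBI_TOOL=mpxsonar
#   NCBI_EMAIL=you@example.org
#   LOG_LEVEL=INFO
#   SAVE_PATH=/data/mpx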


def download(save_path):  # noqa: C901
    # nucleotide nuccore
    DB = "nucleotide"
    QUERY = "Monkeypox virus[Organism] AND complete[prop]"
    BATCH_SIZE = 10
    # 1
    # The first esearch (default retmax) is only used for the total count;
    # the second one requests the full ID list.
    attempt = 0
    success = False
    handle = None
    while attempt < 3 and not success:
        attempt += 1
        try:
            handle = Entrez.esearch(
                db=DB,
                term=QUERY,
                usehistory="y",
                idtype="acc",
            )

            record = Entrez.read(handle)
            total_count = record["Count"]
            logging.info("Total samples to download: %s" % (total_count))

            handle = Entrez.esearch(
                db=DB,
                term=QUERY,
                retmax=total_count,
                usehistory="y",
                idtype="acc",
            )
            record = Entrez.read(handle)
            # setup cache
            time.sleep(1)
            # print(record)
            id_list = record["IdList"]
            search_results = Entrez.read(Entrez.epost(DB, id=",".join(id_list)))
            webenv = search_results["WebEnv"]
            query_key = search_results["QueryKey"]
            time.sleep(1)
            success = True
        except Exception as e:
            logging.error("Error at %s", "getting ID", exc_info=e)
        if handle is not None:  # guard: esearch itself may have failed
            handle.close()
    if attempt == 3 and not success:
        return False

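    # The WebEnv/QueryKey pair from epost above parks the accession list on
    # NCBI's history server, so efetch in the loop below can pull the same
    # result set batch by batch instead of re-sending every ID.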
    if not os.path.exists(os.path.join(save_path, ".download.log")):
        file_log_handler = open(os.path.join(save_path, ".download.log"), "w+")
    else:
        file_log_handler = open(os.path.join(save_path, ".download.log"), "r+")
    try:
        _start = int(file_log_handler.read())
        logging.info(f"Resume previous download: start at {_start}")
    except Exception:
        logging.warning("Cannot resume previous download; starting a new one.")
        _start = 0
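    # .download.log holds a single integer: the end offset of the last fully
    # written batch (e.g. "120"), which is where a rerun resumes.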
    # 2
    for start in range(_start, int(total_count), BATCH_SIZE):
        end = min(int(total_count), start + BATCH_SIZE)
        logging.info("Going to download record %i to %i" % (start + 1, end))

        save_filename = os.path.join(save_path, f"MPX.{start}-{end}.GB")
        file_handler = open(save_filename, "w")
        attempt = 0
        success = False
        fetch_handle = None
        while attempt < 3 and not success:
            attempt += 1
            try:
                fetch_handle = Entrez.efetch(
                    db=DB,
                    rettype="gb",
                    retmode="text",
                    retstart=start,
                    retmax=BATCH_SIZE,
                    webenv=webenv,
                    query_key=query_key,
                    idtype="acc",
                )
                success = True
            except HTTPError as err:
                # Retry on server errors (5xx) and bad requests (400);
                # re-raise anything else.
                if 500 <= err.code <= 599 or err.code == 400:
                    logging.warning(f"Received error from server {err}")
                    logging.warning(f"Attempt {attempt} of 3")
                    time.sleep(5)
                else:
                    raise
            except Exception as e:
                logging.error("Error at %s", "download sample", exc_info=e)

        if attempt == 3 and not success:
            if fetch_handle is not None:  # guard: efetch may never have succeeded
                fetch_handle.close()
            file_handler.close()
            file_log_handler.close()
            return False

        data = fetch_handle.read()
        fetch_handle.close()

        file_handler.write(data)
        file_handler.close()
        # record the end offset in .download.log
        file_log_handler.seek(0)
        file_log_handler.write(str(end))
        file_log_handler.truncate()
        time.sleep(2)
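        # NCBI E-utilities allow roughly 3 requests/s without an API key
        # (10/s with one); the sleeps keep this loop well under that limit.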

    with open(os.path.join(save_path, ".download.success"), "w") as f:
        f.write("done")

    file_log_handler.close()
    return True
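
# On success the download folder holds the MPX.<start>-<end>.GB batches plus
# the .download.log and .download.success markers; run() below checks the
# .download.success marker to skip finished downloads.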


def generate_outputfiles(save_download_path, save_final_path):  # noqa: C901

    list_of_GB = []
    for x in os.listdir(save_download_path):
        if x.endswith(".GB"):
            # collect only the .GB batch files in the download folder
            list_of_GB.append(os.path.join(save_download_path, x))
    # TODO: remove reference genome from the list.

    # fasta & meta
    fasta_out_handler = open(os.path.join(save_final_path, "seq.fasta"), "w")
    meta_out_handler = open(os.path.join(save_final_path, "meta.tsv"), "w")
    header = [
        "ID",
        "ISOLATE",
        "LENGTH",
        "COUNTRY",
        "GEO_LOCATION",
        "RELEASE_DATE",
        "COLLECTION_DATE",
        "SEQ_TECH",
    ]
    meta_out_handler.write("\t".join(header) + "\n")  # Write the header line
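    # A data row then looks like (tab-separated; made-up, illustrative values):
    # XX000001.1  MPXV/example/1  197000  USA  USA: Texas  2022-11-18  2022-06-01  Illumina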
    try:
        for _file in list_of_GB:
            logging.info("Load:" + _file)
            seq_GBrecords = list(SeqIO.parse(_file, "genbank"))
            for seq_record in seq_GBrecords:
                if seq_record.id in REF_LIST:
                    continue

                _isolate = ""
                _country = ""
                _geo_location = ""
                _NCBI_release_date = ""
                _collection_date = ""
                _seq_tech = ""
                # print("Dealing with GenBank record %s" % seq_record.id)

                fasta_out_handler.write(
                    ">%s |%s\n%s\n"
                    % (seq_record.id, seq_record.description, seq_record.seq)
                )

                # Not every record carries these qualifiers, so guard each lookup.
                if "isolate" in seq_record.features[0].qualifiers:
                    _isolate = seq_record.features[0].qualifiers["isolate"][0]
                if "country" in seq_record.features[0].qualifiers:
                    _geo_location = seq_record.features[0].qualifiers["country"][0]
                    # extract country only
                    _country = _geo_location.split(":")[0]

                if "collection_date" in seq_record.features[0].qualifiers:
                    _collection_date = seq_record.features[0].qualifiers[
                        "collection_date"
                    ][0]
                    # 1.) Normalize dates: Nov-2017 -> 2017-11-01, 09-Nov-2017 -> 2017-11-09,
                    # 1995 -> 1995-01-01; missing parts default to the first day
                    # of the month and the first month of the year.
                    # 2.) The year must be present in the input.

                    d = dateparser.parse(
                        _collection_date,
                        settings={
                            "PREFER_DAY_OF_MONTH": "first",
                            "DATE_ORDER": "YMD",
                            "REQUIRE_PARTS": ["year"],
                            "RELATIVE_BASE": datetime.datetime(2022, 1, 1),
                        },
                    )
                    if d is not None:  # guard: dateparser returns None on failure
                        _collection_date = d.strftime("%Y-%m-%d")

                if (
                    "structured_comment" in seq_record.annotations
                    and "Assembly-Data" in seq_record.annotations["structured_comment"]
                    and "Sequencing Technology"
                    in seq_record.annotations["structured_comment"]["Assembly-Data"]
                ):
                    _seq_tech = seq_record.annotations["structured_comment"][
                        "Assembly-Data"
                    ]["Sequencing Technology"]

                if "date" in seq_record.annotations:
                    _NCBI_release_date = seq_record.annotations["date"]
                    # normalize the release date: 18-NOV-2022 -> 2022-11-18
                    d = dateparser.parse(
                        _NCBI_release_date,
                        settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": "YMD"},
                    )
                    if d is not None:
                        _NCBI_release_date = d.strftime("%Y-%m-%d")

                meta_out_handler.write(
                    "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                    % (
                        seq_record.id,
                        _isolate,
                        len(seq_record),
                        _country,
                        _geo_location,
                        _NCBI_release_date,
                        _collection_date,
                        _seq_tech,
                    )
                )
    except Exception:
        fasta_out_handler.close()
        meta_out_handler.close()
        logging.exception(f"Error in {_file} !!")
        traceback.print_exc()
        return False

    fasta_out_handler.close()
    meta_out_handler.close()
    return True


def run(args):
    # logging.info("api_key:" + Entrez.api_key)
    # logging.info("tool:" + Entrez.tool)
    # logging.info("email:" + Entrez.email)
    if args.date:
        # args.date is a datetime (see the argparse type below); normalize it
        # back to a YYYY-MM-DD string so the path join below works.
        TODAY_DATE = args.date.strftime("%Y-%m-%d")
    else:
        TODAY_DATE = datetime.datetime.now().strftime("%Y-%m-%d")

    if args.output:
        SAVE_PATH = args.output
    else:
        SAVE_PATH = os.path.join(os.getenv("SAVE_PATH", ""), TODAY_DATE)
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)

    logging.basicConfig(
        format="NCBI-DL:%(asctime)s %(levelname)-4s: %(message)s",
        level=LOG_LEVEL,
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[
            logging.FileHandler(os.path.join(SAVE_PATH, "debug.log")),
            logging.StreamHandler(),
        ],
    )
    logging.info("Script version: 1")
    logging.info("Save output to: " + SAVE_PATH)

    save_download_path = os.path.join(SAVE_PATH, "GB")
    if not os.path.exists(save_download_path):
        os.makedirs(save_download_path)

    save_final_path = os.path.join(SAVE_PATH, "output")
    if not os.path.exists(save_final_path):
        os.makedirs(save_final_path)
    #
    logging.info("--- Download samples ---")
    if not os.path.exists(os.path.join(SAVE_PATH, "GB", ".download.success")):
        if download(save_download_path):
            logging.info("Download completed")
        else:
            logging.error("Download stopped before it finished")
            sys.exit("Please rerun later.")
    else:
        logging.info("Download already completed; continuing with the GenBank files.")

    # 3
    logging.info("--- Convert GenBank to fasta and meta file ---")
    if not os.path.exists(os.path.join(SAVE_PATH, ".success")):
        if generate_outputfiles(save_download_path, save_final_path):
            logging.info("Processing completed")
        else:
            logging.error("Processing stopped before it finished")
            sys.exit("Please rerun later.")

        with open(os.path.join(SAVE_PATH, ".success"), "w+") as f:
            f.write("done")
    logging.info("--- Done ---")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-d",
        "--date",
        type=lambda s: datetime.datetime.strptime(s, "%Y-%m-%d"),
        help="Date used for the download (YYYY-MM-DD).",
    )
    parser.add_argument("-o", "--output", type=str, help="Output directory.")
    args = parser.parse_args()
    run(args)