Squashed 'pages/libs/mpxsonar/' changes from 36581b5..0ffa7eb
0ffa7eb fix match command
199b4a9 manual merge note/dev
397c013 Untrack files in .gitignore
cf24653 update NCBI.downloader and pipeline.sh, fix minor bugs
c385ff3 add multithreads to support paranoid test
5906b20 fix bugs
4817043 patch bug on matching
8315835 add ncbi download script
700db00 update to version 0.3.8
5cde748 fix bugs

git-subtree-dir: pages/libs/mpxsonar
git-subtree-split: 0ffa7eba073b416835c7b196b20c79d9345b9dd6
silenus092 committed Nov 28, 2022
1 parent f4be82d commit 2744ff3
Showing 14 changed files with 1,430 additions and 326 deletions.
11 changes: 10 additions & 1 deletion .env.template
@@ -8,6 +8,15 @@ DEBUG=True
# https://USERNAME:PASSWORD@IP:PORT/DBNAME
DB_URL="https://super_user:123456@localhost:3306/mpx"

# Logging
# Logging for MPXSonar
# DEBUG, INFO, WARNING, ERROR, CRITICAL
LOG_LEVEL=DEBUG

# NCBI API-KEY
# To get an API key: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
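# NCBI asks E-utilities clients to identify themselves via tool name and
# contact e-mail; requests without them may be blocked if they cause problems.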
NCBI_API_KEY=""
NCBI_TOOL="MPXSonar"
NCBI_EMAIL=""

# For NCBI downloader
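# A date-named subfolder is created inside this directory by the script.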
SAVE_PATH=""
6 changes: 6 additions & 0 deletions .gitignore
@@ -164,3 +164,9 @@ cython_debug/

# Test data
!tests/data/cache-test/var

# NCBI
epost.dtd

# pipeline
pipeline.sh
Empty file added 197120
Empty file.
345 changes: 345 additions & 0 deletions NCBI.downloader.py
@@ -0,0 +1,345 @@
#!/usr/bin/env python3
# Author: K2.
# Steps in this file:
# 1. Search the query term and get the total record count.
# 2. Download records in batches and save them into temporary files.
# 3. Parse the records and write them out as TSV and FASTA.
# ----
# 4. A .download.log file keeps track of download progress.
# 5. A .success file indicates the folder is ready to import.

import argparse
import datetime
import logging
import os
import sys
import time
import traceback
from urllib.error import HTTPError

from Bio import Entrez
from Bio import SeqIO
import dateparser
from dotenv import load_dotenv

load_dotenv()
LOG_LEVEL = os.getenv("LOG_LEVEL", "DEBUG")
REF_LIST = [
"NC_063383.1",
"ON563414.3",
"MT903344.1",
]

Entrez.api_key = os.getenv("NCBI_API_KEY", "")
Entrez.tool = os.getenv("NCBI_TOOL", "")
Entrez.email = os.getenv("NCBI_EMAIL", "") # Always tell NCBI who you are
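# Note: with an API key NCBI allows up to 10 requests/second; without one the
# limit is 3 requests/second.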


def download(save_path): # noqa: C901
    # Entrez database: "nucleotide" (an alias of "nuccore")
DB = "nucleotide"
QUERY = "Monkeypox virus[Organism] AND complete[prop]"
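    # Number of records fetched per efetch request.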
BATCH_SIZE = 10
    # 1
    # The first esearch call only retrieves the total record count; a second
    # call with retmax=total_count then fetches the full ID list.
attempt = 0
success = False
while attempt < 3 and not success:
attempt += 1
try:
handle = Entrez.esearch(
db=DB,
term=QUERY,
usehistory="y",
idtype="acc",
)

record = Entrez.read(handle)
total_count = record["Count"]
logging.info("Total sample to download: %s " % (total_count))

handle = Entrez.esearch(
db=DB,
term=QUERY,
retmax=total_count,
usehistory="y",
idtype="acc",
)
record = Entrez.read(handle)
# setup cache
time.sleep(1)
# print(record)
id_list = record["IdList"]
search_results = Entrez.read(Entrez.epost(DB, id=",".join(id_list)))
webenv = search_results["WebEnv"]
query_key = search_results["QueryKey"]
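            # epost stores the ID list on NCBI's history server; WebEnv and
            # QueryKey let efetch page through that list in batches below.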
time.sleep(1)
success = True
        except Exception as e:
            logging.error("Error at %s", "getting ID", exc_info=e)
        finally:
            # "handle" may be unset if the very first esearch call failed.
            if "handle" in locals():
                handle.close()
        if attempt == 3 and not success:
            return False

if not os.path.exists(os.path.join(save_path, ".download.log")):
file_log_handler = open(os.path.join(save_path, ".download.log"), "w+")
else:
file_log_handler = open(os.path.join(save_path, ".download.log"), "r+")
try:
_start = int(file_log_handler.read())
logging.info(f"Resume previous download: start at {_start}")
except Exception:
logging.warning(
"Cannot resume previous download, we will start a new download."
)
_start = 0
# 2
for start in range(_start, int(total_count), BATCH_SIZE):
end = min(int(total_count), start + BATCH_SIZE)
logging.info("Going to download record %i to %i" % (start + 1, end))

save_filename = os.path.join(save_path, f"MPX.{start}-{end}.GB")
file_handler = open(save_filename, "w")
attempt = 0
success = False
while attempt < 3 and not success:
attempt += 1
try:
fetch_handle = Entrez.efetch(
db=DB,
rettype="gb",
retmode="text",
retstart=start,
retmax=BATCH_SIZE,
webenv=webenv,
query_key=query_key,
idtype="acc",
)
success = True
            except HTTPError as err:
                # Retry on server errors (5xx) and bad requests (400);
                # re-raise anything unexpected.
                if 500 <= err.code <= 599 or err.code == 400:
                    logging.warning(f"Received error from server {err}")
                    logging.warning(f"Attempt {attempt} of 3")
                    time.sleep(5)
                else:
                    raise
except Exception as e:
logging.error("Error at %s", "download sample", exc_info=e)

        if attempt == 3 and not success:
            # "fetch_handle" may be unset if every efetch attempt failed.
            if "fetch_handle" in locals():
                fetch_handle.close()
            file_handler.close()
            file_log_handler.close()
            return False

data = fetch_handle.read()
fetch_handle.close()

file_handler.write(data)
file_handler.close()
        # Record the end of this batch so an interrupted run can resume here.
file_log_handler.seek(0)
file_log_handler.write(str(end))
file_log_handler.truncate()
time.sleep(2)

    with open(os.path.join(save_path, ".download.success"), "w") as f:
        f.write("done")

file_log_handler.close()
return True


def generate_outputfiles(save_download_path, save_final_path): # noqa: C901

list_of_GB = []
for x in os.listdir(save_download_path):
if x.endswith(".GB"):
            # Collect only the downloaded .GB (GenBank) files.
list_of_GB.append(os.path.join(save_download_path, x))
# TODO: remove reference genome from the list.

# fasta & meta
fasta_out_handler = open(os.path.join(save_final_path, "seq.fasta"), "w")
meta_out_handler = open(os.path.join(save_final_path, "meta.tsv"), "w")
header = [
"ID",
"ISOLATE",
"LENGTH",
"COUNTRY",
"GEO_LOCATION",
"RELEASE_DATE",
"COLLECTION_DATE",
"SEQ_TECH",
]
meta_out_handler.write("\t".join(header) + "\n") # Write the header line
try:
for _file in list_of_GB:
logging.info("Load:" + _file)
seq_GBrecords = list(SeqIO.parse(_file, "genbank"))
for seq_record in seq_GBrecords:
if seq_record.id in REF_LIST:
continue

_isolate = ""
_country = ""
_geo_location = ""
_NCBI_release_date = ""
_collection_date = ""
_seq_tech = ""
# print("Dealing with GenBank record %s" % seq_record.id)

fasta_out_handler.write(
">%s |%s\n%s\n"
% (seq_record.id, seq_record.description, seq_record.seq)
)

                # Qualifier keys are optional, so check each one before reading it.
if "isolate" in seq_record.features[0].qualifiers:
_isolate = seq_record.features[0].qualifiers["isolate"][0]
if "country" in seq_record.features[0].qualifiers:
_geo_location = seq_record.features[0].qualifiers["country"][0]
# extract country only
_country = _geo_location.split(":")[0]

if "collection_date" in seq_record.features[0].qualifiers:
_collection_date = seq_record.features[0].qualifiers[
"collection_date"
][0]
                    # 1.) Normalize dates: Nov-2017 -> 2017-11-01, 09-Nov-2017 -> 2017-11-09,
                    #     1995 -> 1995-01-01 (a missing day or month defaults to the first).
                    # 2.) The year must be present in the input.

d = dateparser.parse(
_collection_date,
settings={
"PREFER_DAY_OF_MONTH": "first",
"DATE_ORDER": "YMD",
"REQUIRE_PARTS": ["year"],
"RELATIVE_BASE": datetime.datetime(2022, 1, 1),
},
)
                    if d:  # dateparser.parse returns None for unparseable dates
                        _collection_date = d.strftime("%Y-%m-%d")

if (
"structured_comment" in seq_record.annotations
and "Assembly-Data" in seq_record.annotations["structured_comment"]
and "Sequencing Technology"
in seq_record.annotations["structured_comment"]["Assembly-Data"]
):
_seq_tech = seq_record.annotations["structured_comment"][
"Assembly-Data"
]["Sequencing Technology"]

if "date" in seq_record.annotations:
_NCBI_release_date = seq_record.annotations["date"]
                    # Normalize the release date, e.g. 18-NOV-2022 -> 2022-11-18.
                    d = dateparser.parse(
                        _NCBI_release_date,
                        settings={"PREFER_DAY_OF_MONTH": "first", "DATE_ORDER": "YMD"},
                    )
                    if d:  # guard against unparseable dates
                        _NCBI_release_date = d.strftime("%Y-%m-%d")

meta_out_handler.write(
"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
% (
seq_record.id,
_isolate,
len(seq_record),
_country,
_geo_location,
_NCBI_release_date,
_collection_date,
_seq_tech,
)
)
except Exception:
fasta_out_handler.close()
meta_out_handler.close()
logging.exception(f"Error in {seq_record.id} !!")
traceback.print_exc()
return False

fasta_out_handler.close()
meta_out_handler.close()
return True


def run(args):
# logging.info("api_key:" + Entrez.api_key)
# logging.info("tool:" + Entrez.tool)
# logging.info("email:" + Entrez.email)
    if args.date:
        # argparse hands us a datetime; convert it back to the YYYY-MM-DD
        # string that names the download folder.
        TODAY_DATE = args.date.strftime("%Y-%m-%d")
    else:
        TODAY_DATE = datetime.datetime.now().strftime("%Y-%m-%d")

if args.output:
SAVE_PATH = args.output
else:
SAVE_PATH = os.path.join(os.getenv("SAVE_PATH", ""), TODAY_DATE)
if not os.path.exists(SAVE_PATH):
os.makedirs(SAVE_PATH)

logging.basicConfig(
format="NCBI-DL:%(asctime)s %(levelname)-4s: %(message)s",
level=LOG_LEVEL,
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[
logging.FileHandler(os.path.join(SAVE_PATH, "debug.log")),
logging.StreamHandler(),
],
)
logging.info("Script version: 1")
logging.info("Save output to:" + SAVE_PATH)

save_download_path = os.path.join(SAVE_PATH, "GB")
if not os.path.exists(save_download_path):
os.makedirs(save_download_path)

save_final_path = os.path.join(SAVE_PATH, "output")
if not os.path.exists(save_final_path):
os.makedirs(save_final_path)
#
logging.info("--- Download samples ---")
if not os.path.exists(os.path.join(SAVE_PATH, "GB", ".download.success")):
if download(save_download_path):
logging.info("Download completed")
else:
logging.error("Download stop before it is finished")
sys.exit("Please rerun it again later.")
else:
logging.info("Download completed, continue to process on GeneBank files.")

# 3
logging.info("--- Convert GeneBank to fasta and meta file ---")
if not os.path.exists(os.path.join(SAVE_PATH, ".success")):
        if generate_outputfiles(save_download_path, save_final_path):
            logging.info("Processing completed")
        else:
            logging.error("Processing stopped before it finished")
            sys.exit("Please rerun later.")

with open(os.path.join(SAVE_PATH, ".success"), "w+") as f:
f.write("done")
logging.info("--- Done ---")


if __name__ == "__main__":
parser = argparse.ArgumentParser()

parser.add_argument(
"-d",
"--date",
type=lambda s: datetime.datetime.strptime(s, "%Y-%m-%d"),
help="Date use for download.",
)
parser.add_argument("-o", "--output", type=str, help="Output")
args = parser.parse_args()
run(args)
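
A minimal usage sketch for the script above (the date and output path are
placeholder values; without -o, SAVE_PATH from .env plus the date is used):

    # Download into an explicitly named folder
    python3 NCBI.downloader.py --date 2022-11-28 --output ./mpx-2022-11-28

    # Or rely on SAVE_PATH from .env and today's date
    python3 NCBI.downloader.py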
14 changes: 13 additions & 1 deletion example-dash/app_controller.py
@@ -105,7 +105,19 @@ def list_prop(db=None):
# output = output + "\n"
# output = output + "DATE FORMAT" + "\n"
output = pd.DataFrame(rows, columns=cols)
    print(output)
    # Temporary fix: hide internal/profile columns from the property list.
output = output[
~output["name"].isin(
[
"AA_PROFILE",
"AA_X_PROFILE",
"NUC_N_PROFILE",
"NUC_PROFILE",
"IMPORTED",
"MODIFIED",
]
)
]
return output


