diff --git a/GCP RAPT.md b/GCP RAPT.md index f6f2997..9c26a5b 100644 --- a/GCP RAPT.md +++ b/GCP RAPT.md @@ -20,7 +20,7 @@ Here are instructions to execute RAPT once your system is set up. Additional ins 3. Download the latest release by executing the following commands: ``` - ~$ curl -sSLo rapt.tar.gz https://github.com/ncbi/rapt/releases/download/v1.2.2/rapt-v1.2.2.tar.gz + ~$ curl -sSLo rapt.tar.gz https://github.com/ncbi/rapt/releases/download/v0.5.1/rapt-v0.5.1.tar.gz ~$ tar -xzf rapt.tar.gz && rm -f rapt.tar.gz ``` 4. Run `run_rapt_gcp.sh help` to see the *GCP RAPT* usage information. diff --git a/RAPT_context2.png b/RAPT_context2.png deleted file mode 100644 index 5948ac8..0000000 Binary files a/RAPT_context2.png and /dev/null differ diff --git a/RAPT_context4.png b/RAPT_context4.png new file mode 100644 index 0000000..9d0f1d0 Binary files /dev/null and b/RAPT_context4.png differ diff --git a/README.md b/README.md index f476b07..9b5e60d 100644 --- a/README.md +++ b/README.md @@ -2,17 +2,17 @@ RAPT is a NCBI pipeline designed for assembling and annotating short genomic sequencing reads obtained from bacterial or archaeal isolates. RAPT consists of two major components, [SKESA](https://github.com/ncbi/SKESA) and [PGAP](https://github.com/ncbi/pgap). SKESA is a *de novo* assembler for microbial genomes based on DeBruijn graphs. PGAP is a prokaryotic genome annotation pipeline that combines *ab initio* gene prediction algorithms with homology-based methods. RAPT takes an SRA run or a fasta or fastq file of Illumina reads as input and produces an assembled and annotated genome. -If you are new to RAPT, please visit our [wiki page](https://github.com/ncbi/rapt/wiki) for detailed information. +If you are new to RAPT, please visit our [wiki page](https://github.com/ncbi/rapt/wiki) for detailed information, and watch a [short webinar](https://www.youtube.com/watch?v=7trM1pKAVXQ). 
-![RAPT](RAPT_context2.png) +![RAPT](RAPT_context4.png) To use the latest version, download the RAPT command-line interface with the following commands: ``` -~$ curl -sSLo rapt.tar.gz https://github.com/ncbi/rapt/releases/download/v1.2.2/rapt-v1.2.2.tar.gz +~$ curl -sSLo rapt.tar.gz https://github.com/ncbi/rapt/releases/download/v0.5.1/rapt-v0.5.1.tar.gz ~$ tar -xzf rapt.tar.gz && rm -f rapt.tar.gz ``` -There should be two scripts in your directory now, `run_rapt_gcp.sh` and `run_rapt.py`, corresponding to two variations of RAPT: Google Cloud Platform (GCP) RAPT and Standalone RAPT. [GCP RAPT](GCP%20RAPT.md) is designed to run on GCP and is for users with GCP accounts (please note this is different from a gmail account), while [Stand-alone RAPT](Standalone%20RAPT.md) can run on any computing environments meeting a few pre-requisites. +There should be two scripts in your directory now, `run_rapt_gcp.sh` and `run_rapt.py`, corresponding to two variations of RAPT: Google Cloud Platform (GCP) RAPT and Standalone RAPT. [GCP RAPT](https://github.com/ncbi/rapt/wiki/GCP%20RAPT%20In-depth%20Documentation%20and%20Examples) is designed to run on GCP and is for users with GCP accounts (please note this is different from a Gmail account), while [Stand-alone RAPT](https://github.com/ncbi/rapt/wiki/Standalone%20RAPT%20In-depth%20Documentation%20and%20Recommendations) can run on any computing environment meeting a few pre-requisites. -For instructions on running RAPT, please go to their respective documentation pages: [GCP RAPT](GCP%20RAPT.md) or [Stand-alone RAPT](Standalone%20RAPT.md). +For instructions on running RAPT, please go to their respective documentation pages: [GCP RAPT](https://github.com/ncbi/rapt/wiki/GCP%20RAPT%20In-depth%20Documentation%20and%20Examples) or [Stand-alone RAPT](https://github.com/ncbi/rapt/wiki/Standalone%20RAPT%20In-depth%20Documentation%20and%20Recommendations).
diff --git a/Standalone RAPT.md b/Standalone RAPT.md index 99ab545..85f0bac 100644 --- a/Standalone RAPT.md +++ b/Standalone RAPT.md @@ -31,7 +31,7 @@ Here are instructions to execute RAPT once your system is set up. Additional ins 2. Download the latest release by executing the following commands:
``` - ~$ curl -sSLo rapt.tar.gz https://github.com/ncbi/rapt/releases/download/v1.2.2/rapt-v1.2.2.tar.gz + ~$ curl -sSLo rapt.tar.gz https://github.com/ncbi/rapt/releases/download/v0.5.1/rapt-v0.5.1.tar.gz ~$ tar -xzf rapt.tar.gz && rm -f rapt.tar.gz ``` 3. Run `./run_rapt.py -h` to see the *Stand-alone RAPT* usage information
diff --git a/dist/CHANGELOG.md b/dist/CHANGELOG.md index b67307c..5b59048 100644 --- a/dist/CHANGELOG.md +++ b/dist/CHANGELOG.md @@ -1,4 +1,53 @@ -### Release v1.2.2 +### Release v0.5.1 +* PGAP at 2022-02-10.build5872 + +### Release v0.5.0 +- updated PGAPX to 2021-11-29.build574 +- use dedicated ```prefetch``` binary instead of curl to retrieve SRA data + + +### Release v0.4.2 +- upgrade to booster 1.76 for SKESA + +### Release v0.4.1 +* Fix double quoted string syntax error +* try to use fewer words of genus_species string and retry if taxcheck fails +* try to run GCP RAPT even if network connection detection failed. + +### Release v0.4.0 +* Update SKESA to 2.5.0 and PGAPX to 2021-07-01.build5508 +* Remove container upon exit with '--rm' (docker and podman) +* Refactoring to use config loader +* Download reference data from Google storage if run on GCP (run_rapt_gcp.sh) +* Improved organism name parsing algorithm +* Distinguish different flavors of RAPT in applog +* Some bug fixes + +### Release v0.3.2 +- change default machine type from n1-highmem-16 to n1-highmem-8 +- attach ncbi_user to pinger events if run by web-rapt + +### Release v0.3.1 +- Added --network command line to specify custom network for container +- Some exit codes merged and combined +- Exclude dummy strings for taxonomy names +- Remove redundant errors.xml file from pgap output +- Updated SKESA version and ngs/vdb libs +- Added docker daemon status check + +### Release v0.3.0 +- new PGAP version +- sends email notifications +- accepts forward and reverse reads in two files +- monitors user quotas +- verifies taxonomic data +- accepts an argument to stop upon taxonomic disagreement +- bug fixes, including invalid SRA index +- improved logging +- improved error messaging +- cleaned up output files + +### Release v1.2.2 - GCP-RAPT: added `--project` option to specify custom project.
- GCP-RAPT: log file names are fixed to concise.log and verbose.log - GCP-RAPT: log files are included in the output archive diff --git a/dist/README.txt b/dist/README.txt index 7cb75bf..feda682 100644 --- a/dist/README.txt +++ b/dist/README.txt @@ -1,4 +1,4 @@ -Read Assembly and Annotation Pipeline Tool (RAPT) v1.2.2 +Read Assembly and Annotation Pipeline Tool (RAPT) v0.5.1 RAPT is a NCBI pipeline designed for assembling and annotating Illumina genome sequencing reads obtained from bacterial or archaeal isolates. RAPT consists of two major NCBI components, SKESA and PGAP. SKESA is a de-novo assembler for microbial genomes based on DeBruijn graphs. PGAP is a prokaryotic genome annotation pipeline that combines ab initio gene prediction algorithms with homology based methods. RAPT takes an Illumina SRA run or a fasta file as input and produces an assembled and annotated genome. diff --git a/dist/release-notes.txt b/dist/release-notes.txt index f0c40fd..aa2138c 100644 --- a/dist/release-notes.txt +++ b/dist/release-notes.txt @@ -1,12 +1,9 @@ -RELEASE: v1.2.2 -DATE: 12-29-2020 -BUILD: rapt-30372431 -SKESA: 2.4.0 -PGAPX: 2020-09-24.build4894 +RELEASE: v0.5.1 +DATE: 03-18-2022 +BUILD: rapt-37347638 +SKESA: 2.5.0 +PGAPX: 2022-02-10.build5872 DESCRIPTION: -DESCRIPTION: - GCP-RAPT now displays job status in "joblist" command as "Done" and "Failed" instead of "Finished" and "Aborted" for more clarity; and more information has been added to the output of "jobdetails" command for easier problem identification. Log files are now included in the result archive "output.tar.gz" so that it is the only file to download under one jobid. - - This release also provides a python script, run_rapt.py, to run RAPT agnostically on customer's own system with python and docker-compatible container system installed. Refer to the documentation for details. 
+This release updates PGAP to 2022-02-10.build5872 diff --git a/dist/run_rapt.py b/dist/run_rapt.py index 6dbafe4..29b017a 100755 --- a/dist/run_rapt.py +++ b/dist/run_rapt.py @@ -6,41 +6,57 @@ import subprocess import uuid import platform +import shutil from distutils.spawn import find_executable ##to be compatible with python2 from abc import ABCMeta, abstractmethod -IMAGE_URI="us.gcr.io/ncbi-seqplus-rapt-build/rapt/rapt:RC-0.2.2-30372431" +IMAGE_URI="ncbi/rapt:v0.5.1" -RAPT_VERSION="rapt-30372431" +RAPT_VERSION="rapt-37347638" + +DEFAULT_REF_DIR = '.rapt_refdata' ACT_FUNC_TEST = 'functest' ACT_VERSION = 'version' FLG_SKESA_ONLY = 'skesa_only' FLG_NO_REPORT = 'no_report' +FLG_STOP_ON_ERRORS = 'stop_on_errors' CONCISE_LOG='concise.log' VERBOSE_LOG='verbose.log' +RAPT_FLAVOR='raptdocker' ################################################################## # Environment variable names used ################################################################## +ENV_RAPT_NCBI_APP='rapt_ncbi_app' + ENV_UUID = 'rapt_uuid' ##uuid, mainly for pgap log. ENV_JOBID = 'rapt_jobid' ##jobid is related to uuid, but still receive from wrapper script so that we do not dup the algorithm that compute jobid from uuid ENV_LOG = 'rapt_log' ##concise log location. ENV_VLOG_DST = 'rapt_vlog_dst' ##verbose log location. Different interpretation in GCP and non-GCP ##actions. 
+ENV_REFDATA_SRC = 'rapt_refsrc' EVN_ACT = 'rapt_act' ENV_OPTS = 'rapt_opts' ENV_MEM_AVAIL = 'rapt_mem' ENV_SRR = 'rapt_srr' ENV_FASTQ = 'rapt_fastq' +ENV_FASTQ_FWD = 'rapt_fastq_fwd' +ENV_FASTQ_REV = 'rapt_fastq_rev' + + ENV_GEN_SP = 'rapt_fastq_org' ENV_STRAIN = 'rapt_fastq_strain' +# Entering pre-downloaded tarball mount path +ENV_PGAP_REF = 'pgap_ref' +ENV_ANI_REF = 'ani_ref' + ARGDEST_ACXN = 'acxn' ARGDEST_FASTQ = 'fastq' ARGDEST_ACT = 'act' @@ -49,11 +65,13 @@ ARGDEST_FLAGS = 'flags' ARGDEST_OUTDIR = 'outdir' ARGDEST_REFDATA = 'refdata_hub' -ARGDEST_JOBID = 'jobid' +ARGDEST_PGAP_REF = 'pgap_ref' +ARGDEST_ANI_REF = 'ani_ref' ARGDEST_DOCKER = 'dockerbin' ARGDEST_ITUSER = 'it_user' ARGDEST_MAXMEM = 'maxmem' ARGDEST_MAXCPU = 'maxcpu' +ARGDEST_NETWORK = 'docker_network' ARGDEST_DO_VERBOSE_STD = 'verbose_std' META_CURR_USR = '__curr_user__' @@ -65,6 +83,7 @@ REF_DATA_MOUNT = '/dkm_ref_data' INPUT_MOUNT = '/dkm_input_dir' +INPUT_MOUNT_PAIR = '/dkm_pair_input_dir' OUTPUT_MOUNT = '/dkm_output_dir' def get_arg(args, n): @@ -109,6 +128,8 @@ def __init__(self, bin_path, args, parser): ##internal use + self.add_env(ENV_RAPT_NCBI_APP, RAPT_FLAVOR) # applog rapt flavor + self.add_env(ENV_REFDATA_SRC, 's3') # explicit: download refdata from gcs self.verbose_out = get_arg(args, ARGDEST_DO_VERBOSE_STD) if not self.verbose_out: ##The lengthy output to console are all on stderr, by the stream handler of python logger. We leave stdout open in case we need to output some message from inside the image. 
@@ -135,10 +156,66 @@ def __init__(self, bin_path, args, parser): else: fastq = get_arg(args, ARGDEST_FASTQ) if fastq: - if not os.path.exists(fastq): - eprint('FASTQ input file {} does not exist.'.format(fastq)) + # added to handle two paird files: /path/to/file_1,/path/to/file_2 + fq_files = fastq.split(',') + nfiles = len(fq_files) + + # if path contains ~, must expand + pathparse = fq_files[0].split(os.sep) + if '~' == pathparse[0][0:1]: + # if user does not exist, it returns the original string(treat ~ as literal) + pathparse[0] = os.path.expanduser(pathparse[0]) + fq_files[0] = os.sep.join(pathparse) + + if not os.path.exists(fq_files[0]): + eprint('FASTQ input file {} does not exist.'.format(fq_files[0])) self.rc = 1 return + + absfastq = os.path.abspath(fq_files[0]) + fq_path = os.path.dirname(absfastq) + + self.add_mount(fq_path, INPUT_MOUNT) + fq_files[0] = os.path.join(INPUT_MOUNT, os.path.basename(fq_files[0])) + + # handle paired file + if nfiles > 1: + fq_pair_path = os.path.dirname(fq_files[1]) + + pair_basename = os.path.basename(fq_files[1]) + + # if no path in the second file, try the same path first + if not fq_pair_path: + abspair_path = os.path.join(fq_path, pair_basename) + if os.path.exists(abspair_path): + fq_files[1] = abspair_path + else: + abspair_path = os.path.abspath(fq_files[1]) + else: # has path + pathparse = fq_files[1].split(os.sep) + if '~' == pathparse[0][0:1]: + # if user does not exist, it returns the original string(treat ~ as literal) + pathparse[0] = os.path.expanduser(pathparse[0]) + fq_files[1] = os.sep.join(pathparse) + + abspair_path = os.path.abspath(fq_files[1]) + + if not os.path.exists(abspair_path): + eprint('FASTQ input file {} does not exist.'.format(fq_files[1])) + self.rc = 1 + return + fq_pair_path = os.path.dirname(abspair_path) + + if fq_pair_path != fq_path: + self.add_mount(fq_pair_path, INPUT_MOUNT_PAIR) + fq_files[1] = os.path.join(INPUT_MOUNT_PAIR, pair_basename) + else: + fq_files[1] = 
os.path.join(INPUT_MOUNT, pair_basename) + self.add_env(ENV_FASTQ_FWD, fq_files[0]) + self.add_env(ENV_FASTQ_REV, fq_files[1]) + else: # single file + self.add_env(ENV_FASTQ, fq_files[0]) + orga = get_arg(args, ARGDEST_ORGA) if not orga: eprint('For FASTQ input, \'--organism ""\' is required') @@ -146,10 +223,6 @@ def __init__(self, bin_path, args, parser): return self.is_null=False - absfastq = os.path.abspath(fastq) - - self.add_mount(os.path.dirname(absfastq), INPUT_MOUNT) - self.add_env(ENV_FASTQ, os.path.basename(absfastq)) self.add_env(ENV_GEN_SP, orga) strain = get_arg(args, ARGDEST_STRAIN) @@ -174,39 +247,70 @@ def __init__(self, bin_path, args, parser): if maxcpu: self.set_maxcpus(maxcpu) + dk_network = get_arg(args, ARGDEST_NETWORK) + if dk_network: + self.set_network(dk_network) + run_uuid = get_uuid() jobid = uuid2jobid(run_uuid) - usrtag = get_arg(args, ARGDEST_JOBID) - if usrtag: - jobid += '_' + usrtag - self.add_env(ENV_UUID, run_uuid) self.add_env(ENV_JOBID, jobid) - outdir = get_arg(args, ARGDEST_OUTDIR) + if not self.is_version: # only create output directory when action is not version + outdir = get_arg(args, ARGDEST_OUTDIR) - if not outdir: - outdir = os.getcwd() - else: - ##for python2 compatibility - if not os.path.exists(outdir): - try: - os.makedirs(outdir, 0o755) - except Exception as e: - eprint('Unable to create output directory {}: {}'.format(outdir, e)) - self.rc = 1 - return + if not outdir: # If user did not specify, we create one. + outdir = os.path.join(os.getcwd(), 'raptout_{}'.format(jobid)) outdir = os.path.abspath(outdir) + ##for python2 compatibility + if os.path.exists(outdir): + shutil.rmtree(outdir) # clear old data + try: + os.makedirs(outdir, 0o755) + except Exception as e: + eprint('Unable to create output directory {}: {}'.format(outdir, e)) + self.rc = 1 + return + + self.add_mount(outdir, OUTPUT_MOUNT) + self.prog_msg = 'RAPT is now running, it may take a long time to finish. 
To see the progress, track the verbose log file {}/{}.'.format(outdir, VERBOSE_LOG) + + # Support using pre-downloaded refdata tarball, but no way to match pgap build, assume user get it right. + abs_predl_dir = None + mount_point = '/predl_dir' + + pgap_ref = get_arg(args, ARGDEST_PGAP_REF) + if pgap_ref: + abs_predl_pgap = os.path.abspath(pgap_ref) + abs_predl_dir = os.path.dirname(abs_predl_pgap) + self.add_mount(abs_predl_dir, mount_point) + self.add_env(ENV_PGAP_REF, os.path.join(mount_point, os.path.basename(abs_predl_pgap))) + + ani_ref = get_arg(args, ARGDEST_ANI_REF) + if ani_ref: + asb_predl_ani = os.path.abspath(ani_ref) + abs_predl_ani_dir = os.path.dirname(asb_predl_ani) + if abs_predl_ani_dir != abs_predl_dir: # different dir, need mount + mount_point = '/predl_ani_dir' + self.add_mount(abs_predl_ani_dir, mount_point) + + self.add_env(ENV_ANI_REF, os.path.join(mount_point, os.path.basename(asb_predl_ani))) + refdir = get_arg(args, ARGDEST_REFDATA) + if not refdir: + refdir = os.path.join(os.getcwd(), DEFAULT_REF_DIR) - self.prog_msg = 'RAPT is now running, it may take a long time to finish. To see the progress, track the verbose log file {}/raptout_{}/{}.'.format(outdir, jobid, VERBOSE_LOG) - self.add_mount(outdir, OUTPUT_MOUNT) + refdir = os.path.abspath(refdir) + if not os.path.exists(refdir): + try: + os.makedirs(refdir, 0o755) + except Exception as e: + eprint('Unable to create reference data directory {}: {}'.format(refdir, e)) + self.rc = 1 + return - refdir = get_arg(args, ARGDEST_REFDATA) - if refdir: - refdir = os.path.abspath(refdir) - self.add_mount(refdir, REF_DATA_MOUNT) + self.add_mount(refdir, REF_DATA_MOUNT) ##we do not need to specify log files anymore, they are always created inside the output dir. 
@@ -222,7 +326,7 @@ def run(self): print(self.prog_msg) subp = self.run_container() subp_std = subp.communicate() - + if 0 != subp.returncode and self.std_err: # we have suppressed stdderr err_msgs = subp_std[1] try: # try python3 first @@ -234,11 +338,11 @@ def run(self): total_lines = len(err_msgs) if total_lines > 10: err_msgs = err_msgs[-10:] - + for m in err_msgs: eprint(m) return subp.returncode - + @abstractmethod def add_env(self, name, val): @@ -256,6 +360,10 @@ def set_maxmem(self, maxmem): def set_maxcpus(self, maxcpus): pass + @abstractmethod + def set_network(self, network): + pass + @abstractmethod def set_runmode(self, user, is_it=False): pass @@ -271,6 +379,7 @@ class DockerCompatibleRunner(ContainerRunner): def __init__(self, bin_path, args, parser): super(DockerCompatibleRunner, self).__init__(bin_path, args, parser) + self.clean_up = '--rm' def add_env(self, name, val): self.envs.extend([DockerCompatibleRunner.ENV_SWITCH, '{}={}'.format(name, val)]) @@ -285,13 +394,19 @@ def set_maxcpus(self, maxcpus): ##if 'Windows' == platform.system() self.runmode.extend(['--cpu-count' if 'Windows' == platform.system() else '--cpus', maxcpus]) + def set_network(self, network): + self.runmode.extend(['--network', network]) + def set_runmode(self, user, is_it=False): if is_it: + self.clean_up = None self.runmode.extend(['-it']) self.cmdltail.append('/bin/bash') def run_container(self): cmdl=[self.bin_path, DockerCompatibleRunner.RUN_CMD] + if self.clean_up: + cmdl.append(self.clean_up) cmdl.extend(self.runmode) cmdl.extend(self.envs) cmdl.extend(self.mounts) @@ -307,6 +422,14 @@ class DockerRunner(DockerCompatibleRunner): RUN_BINARY = 'docker' def __init__(self, bin_path, args, parser): super(DockerRunner, self).__init__(bin_path, args, parser) + # test run docker, see if it is accessible + test_run = subprocess.Popen([bin_path, 'info'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + test_run.wait() + if test_run.returncode != 0: + 
eprint('===============================================\nIt seems the docker daemon is not running. Try to start the docker service\n(by running "sudo systemctl start docker" or "sudo service docker start"\ndepends on your system) and add your user id to the docker group (by running\n"sudo usermod -a -G docker $USER"), then log out and log back in. If you do not\nhave superuser privilege, ask your system admins for help. \n===============================================') + eprint('Error message:\n{}'.format(test_run.stdout.read().decode('utf-8'))) + self.rc = 1 + def set_runmode(self, user, is_it=False): super(DockerRunner, self).set_runmode(user, is_it) @@ -342,6 +465,9 @@ def set_maxmem(self, maxmem): def set_maxcpus(self, maxcpus): pass + def set_network(self, network): + self.runmode.extend(['--net', '--network', network]) + def set_runmode(self, user, is_it=False): if is_it: self.run_cmd = SingularityRunner.RUN_CMD_IT @@ -395,7 +521,7 @@ def detect_real_prog(ploc): real_prog = tproc.stdout.read().decode('utf-8').split()[0].lower() for r in VALID_RUNNERS: - if r.RUN_BINARY == real_prog: + if real_prog.startswith(r.RUN_BINARY): return r eprint('WARNING: {} support as {} alternative has not been tested'.format(real_prog, DockerRunner.RUN_BINARY)) @@ -412,7 +538,6 @@ def main(args, parser): dockerbin = get_arg(args, ARGDEST_DOCKER) - if dockerbin: bin_name = os.path.basename(dockerbin) ploc = os.path.dirname(dockerbin) @@ -448,7 +573,7 @@ def main(args, parser): excl_1.add_argument('-a', '--submitacc', dest=ARGDEST_ACXN, help='Run RAPT on an SRA run accession (sra_acxn).') - excl_1.add_argument('-q', '--submitfastq', dest=ARGDEST_FASTQ, help='Run RAPT on Illumina reads in FASTQ or FASTA format. The file must be readable from the computer that runs RAPT. 
The --organism argument is mandatory for this type of input, while the --strain argument is optional.') + excl_1.add_argument('-q', '--submitfastq', dest=ARGDEST_FASTQ, help='Run RAPT on Illumina reads in FASTQ or FASTA format. The file must be readable from the computer that runs RAPT. If forward and reverse readings are in two separate files, specify as "path/to/forward.fastq,path/to/reverse.fastq", or "path/to/forward.fastq,reverse.fastq" if they are in the same directory. The --organism argument is mandatory for this type of input, while the --strain argument is optional.') excl_1.add_argument('-v', '--version', dest=ARGDEST_ACT, action='store_const', const=ACT_VERSION, help='Display the current RAPT version') @@ -463,19 +588,25 @@ def main(args, parser): parser.add_argument('--no-usage-reporting', dest=ARGDEST_FLAGS, action='append_const', const=FLG_NO_REPORT, help='Prevents usage report back to NCBI. By default, RAPT sends usage information back to NCBI for statistical analysis. The information collected are a unique identifier for the RAPT process, the machine IP address, the start and end time of RAPT, and its three modules: SKESA, taxcheck and PGAP. No personal or project-specific information (such as the input data) are collected') + parser.add_argument('--stop-on-errors', dest=ARGDEST_FLAGS, action='append_const', const=FLG_STOP_ON_ERRORS, help='Do not run PGAP annotation pipeline when the genome sequence is misassigned or contaminated') + parser.add_argument('-o', '--output-dir', dest=ARGDEST_OUTDIR, help='Directory to store results and logs. If omitted, use current directory') ##general switches parser.add_argument('--refdata-dir', dest=ARGDEST_REFDATA, help='Specify a location to store reference data used by RAPT. If omitted, use output directory') + parser.add_argument('--pgap-ref', dest=ARGDEST_PGAP_REF, help='Full path to pre-downloaded PGAP reference data tarball, if applicable. 
File is usually named like input-.prod.tgz') + + parser.add_argument('--ani-ref', dest=ARGDEST_ANI_REF, help='Full path to pre-downloaded ANI reference data tarball, if applicable. File is usually named like input-.prod.ani.tgz') + parser.add_argument('-c', '--cpus', dest=ARGDEST_MAXCPU, help='Specify the maximal CPU cores the container should use.') parser.add_argument('-m', '--memory', dest=ARGDEST_MAXMEM, help='Specify the maximal memory (number in GB) the container should use.') - parser.add_argument('--tag', dest=ARGDEST_JOBID, help='Specify a custom string to tag the job') - parser.add_argument('-D', '--docker', dest=ARGDEST_DOCKER, choices=[r.RUN_BINARY for r in VALID_RUNNERS], help='Use specified docker compatible program to run RAPT image') + parser.add_argument('-n', '--network', dest=ARGDEST_NETWORK, help='Specify the network the container should use. Note: this parameter is passed directly to the --network parameter to the container. RAPT does not check the validity of the argument.') + ##special action for -it class ItAct(argparse.Action): def __init__(self, option_strings, dest, nargs=None, const=None, default=None, type=None, choices=None, required=False, help=None, metavar=None): diff --git a/dist/run_rapt_gcp.sh b/dist/run_rapt_gcp.sh index 3bf3681..ac6ecf6 100755 --- a/dist/run_rapt_gcp.sh +++ b/dist/run_rapt_gcp.sh @@ -1,12 +1,12 @@ #!/usr/bin/env bash ###############################* Global Constants *################################## -IMAGE_URI="us.gcr.io/ncbi-seqplus-rapt-build/rapt/rapt:RC-0.2.2-30372431" -RAPT_VERSION="rapt-30372431" +IMAGE_URI="ncbi/rapt:v0.5.1" +RAPT_VERSION="rapt-37347638" GCP_LOGS_VIEWER="https://console.cloud.google.com/logs/viewer" -DEFAULT_VM="n1-highmem-16" +DEFAULT_VM="n1-highmem-8" DEFAULT_BDISKSIZE=128 DEFAULT_FORMAT=table DEFAULT_JOB_TIMEOUT="86400s" ##24 hours @@ -43,6 +43,7 @@ OPT_JOBTIMEOUT="--timeout" ##flags FLG_SKESA_ONLY="--skesa-only" FLG_NOREPORT="--no-usage-reporting" 
+FLG_STOPONERRORS="--stop-on-errors" FLG_USE_CSV="--csv" ####################################### Utilities #################################### @@ -71,27 +72,31 @@ Usage: ${script_name} [options] Job creation commands: ${CMD_ACXN} <${OPT_BUCKET}|${OPT_BUCKET_L} URL> [${OPT_LABEL} LABEL] - [${FLG_SKESA_ONLY}] [${FLG_NOREPORT}] [${OPT_VMTYPE} TYPE] [${OPT_BDSIZE} NUM] - [${OPT_JOBTIMEOUT} SECONDS] + [${FLG_SKESA_ONLY}] [${FLG_NOREPORT}] [${FLG_STOPONERRORS}] [${OPT_VMTYPE} TYPE] + [${OPT_BDSIZE} NUM] [${OPT_JOBTIMEOUT} SECONDS] Submit a job to run RAPT on an SRA run accession (sra_acxn). ${CMD_FASTQ} <${OPT_ORG} "Genus species"> [${OPT_STRAIN} "ATCC xxxx"] <${OPT_BUCKET}|${OPT_BUCKET_L} URL> [${OPT_LABEL} LABEL] [${FLG_SKESA_ONLY}] - [${FLG_NOREPORT}] [${OPT_VMTYPE} TYPE] [${OPT_BDSIZE} NUM] + [${FLG_NOREPORT}] [${FLG_STOPONERRORS}] [${OPT_VMTYPE} TYPE] [${OPT_BDSIZE} NUM] [${OPT_JOBTIMEOUT} SECONDS] Submit a job to run RAPT on Illumina reads in FASTQ or FASTA format. - fastq_uri is expected to point to a google cloud storage (bucket). + fastq_uri is expected to point to a google cloud storage (bucket). If forward + and reverse readings are in two separate files, put them in the same storage + bucket and delimit their names with a comma (no space!) in fastq_uri: + + ${CMD_FASTQ} gs://mybucket/forward.fastq,reverse.fastq The ${OPT_ORG} argument is mandatory. It is the binomial name or, if the species is unknown, the genus for the sequenced organism. This identifier must be valid in NCBI Taxonomy. The ${OPT_STRAIN} argument is optional. ${CMD_TEST} <${OPT_BUCKET}|${OPT_BUCKET_L}> [${OPT_LABEL} LABEL] [${FLG_SKESA_ONLY}] - [${FLG_NOREPORT}] + [${FLG_NOREPORT}] [${FLG_STOPONERRORS}] - Run a test suite. When RAPT does not produce the expected results, it may be + Run a test suite. When RAPT does not produce the expected results, it may be helpful to use this command to ensure RAPT is functioning normally. 
Common options: @@ -114,25 +119,30 @@ Job creation commands: ${FLG_NOREPORT} - Optional. Prevents usage report back to NCBI. By default, RAPT sends usage - information back to NCBI for statistical analysis. The information collected + Optional. Prevents usage report back to NCBI. By default, RAPT sends usage + information back to NCBI for statistical analysis. The information collected are a unique identifier for the RAPT process, the machine IP address, the start and end time of RAPT, and its three modules: SKESA, taxcheck and PGAP. No personal or project-specific information (such as the input data) are collected. + ${FLG_STOPONERRORS} + + Optional. Do not run PGAP annotation pipeline when the genome sequence is + misassigned or contaminated. + ${OPT_REGIONS} Optional, comma-separated. Specify in which GCP region(s) RAPT should run. Note: it should be regions in which you have sufficient CPU quotas (verify at https://console.cloud.google.com/iam-admin/quotas/details). Default is - a single region, us-east4. + a single region, ${DEFAULT_REGION}. ${OPT_VMTYPE} TYPE Optional. Specify the type of google cloud virtual machine to run this job (see Google documentation, https://cloud.google.com/compute/docs/machine-types). - Default is "n1-highmem-16", which is suitable for most jobs. + Default is "${DEFAULT_VM}", which is suitable for most jobs. ${OPT_BDSIZE} NUM @@ -218,7 +228,7 @@ verify_prerequisites() [[ -z $(command -v gsutil 2>/dev/null) ]] && errexit "gsutil is required. See https://cloud.google.com/storage/docs/gsutil_install for help." 
GCP_ACCOUNT=$(${GCLOUD} config get-value account 2>/dev/null) - + } verify_bucket() @@ -274,7 +284,7 @@ parse_opts() dst_bkt="${opt#*=}" ;; ${OPT_PROJECT}) - + GCP_PROJECT="$1" shift ;; @@ -292,7 +302,7 @@ parse_opts() ${OPT_REGIONS}) gcp_regions="$1" shift - ;; + ;; ${OPT_REGIONS}=*) gcp_regions="${opt#*=}" ;; @@ -303,7 +313,7 @@ parse_opts() ;; ${OPT_VMTYPE}=*) vm_type="${opt#*=}" - ;; + ;; ${OPT_BDSIZE}) bd_size="$1" @@ -345,6 +355,10 @@ parse_opts() flags+=("no_report") ;; + ${FLG_STOPONERRORS}) + flags+=("stop_on_errors") + ;; + ${FLG_USE_CSV}) format="csv" ;; @@ -371,14 +385,14 @@ parse_opts() ##validate [[ ${job_timeout} =~ ^[0-9]+$ ]] && job_timeout="${job_timeout}s" - + if [[ -z ${GCP_PROJECT} ]] ##no project specified then GCP_PROJECT=$(gcloud config get-value project 2>/dev/null) [[ -z ${GCP_PROJECT} ]] && errexit "GCP project not set. Refer to 'gcloud init' for help to initiate a project." fi ##add project to all gcloud commands - + GCLOUD="${GCLOUD} --project=${GCP_PROJECT}" } @@ -390,7 +404,7 @@ cleanup() trap cleanup EXIT ##job related -env_params=() +env_params=("rapt_refsrc=gcs" "rapt_ncbi_app=rapt") finputs=() labels=("app=rapt" "rapt_version=${RAPT_VERSION}" "user=${USER}" "host=${HOSTNAME}" "image_tag=$(normalize_val ${IMAGE_URI##*:})") @@ -608,13 +622,27 @@ ${CMD_FASTQ}) [[ ! -z ${strain} ]] && env_params+=("rapt_fastq_strain=${strain}") - finputs+=("--inputs" "rapt_fastq=${fastq_uri}") + fq_fwd=${fastq_uri%%\,*} + if [[ ${fq_fwd} == ${fastq_uri} ]] # only one file + then + finputs+=("--inputs" "rapt_fastq=${fastq_uri}") + else + fq_rev=${fastq_uri##*\,} + fq_rev_base=${fq_rev##*\/} + if [[ ${fq_rev_base} == ${fq_rev} ]] # second file no path + then + # assume path always exists + fq_rev=${fq_fwd%\/*}/${fq_rev} + fi + + finputs+=("--inputs" "rapt_fastq_fwd=${fq_fwd},rapt_fastq_rev=${fq_rev}") + fi + create_job ;; ${CMD_JOBLST}) parse_opts "$@" - list_jobs ;;