Skip to content

Commit

Permalink
update REFSEQ and GENBANK functions.
Browse files Browse the repository at this point in the history
  • Loading branch information
pchaumeil committed Mar 10, 2020
1 parent 420e816 commit 15c7a01
Show file tree
Hide file tree
Showing 5 changed files with 849 additions and 11 deletions.
76 changes: 66 additions & 10 deletions bin/gtdb_migration_tk
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,18 @@ def print_help():
version() + ' :::...''')
print('''\
NCBI folder to GTDB folder:
update_refseq -> Update Refseq genomes.
update_genbank -> Update Genbank genomes.
Information from Nomenclatural resources:
lpsn -> Process steps for LPSN.
bacdive -> Process steps for BacDive. [In Dev]
strains -> Set of tools to combined information from LPSN,DSMZ and Straininfo.
Miscellaneous commands:
list_genomes -> Produce file indicating the directory of each genome.
Test suite for data validation:
overview -> Compare the Metadata file from the previous version with the new one.
compare_field -> Compare a specific metadata field between to metadata files.
Expand All @@ -79,6 +85,56 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(add_help=False)
subparsers = parser.add_subparsers(help="--", dest='subparser_name')

# Misc commands
# 'list_genomes' takes two positionals (input base directory, output file)
# and an optional --silent flag.
genome_dir_parser = subparsers.add_parser(
    'list_genomes',
    formatter_class=CustomHelpFormatter,
    description='Produce file indicating the directory of each genome.')
genome_dir_parser.add_argument(
    'genome_dir',
    help='base directory leading to NCBI archaeal and bacterial genome assemblies')
genome_dir_parser.add_argument(
    'output_file',
    help='output metadata file')
genome_dir_parser.add_argument(
    '--silent',
    action='store_true',
    help="suppress output")

# Steps to move NCBI genomes to GTDB folders
# 'update_refseq': every path/metadata option is required; --cpus defaults
# to 1 and --silent is an opt-in flag.
update_rfq_parser = subparsers.add_parser(
    'update_refseq', formatter_class=CustomHelpFormatter, description='Update Refseq genomes.')
update_rfq_parser.add_argument('--ftp_refseq_directory', dest="ftp_refseq", required=True,
                               help='base directory leading to the FTP repository for refseq')
update_rfq_parser.add_argument('--new_refseq_directory', dest="output_dir",
                               required=True, help='base directory leading to the new repository for refseq')
update_rfq_parser.add_argument('--ftp_genome_dirs_file', dest="ftp_genome_dirs", required=True,
                               help='metadata file listing all directories for the FTP folder (generated by genome_dirs.py)')
update_rfq_parser.add_argument('--old_genome_dirs_file', dest="old_genome_dirs", required=True,
                               help='metadata file listing all directories from the previous NCBI update date (generated by genome_dirs.py)')
# No dest= given, so argparse stores these under the option name
# (args.arc_assembly_summary / args.bac_assembly_summary).
update_rfq_parser.add_argument('--arc_assembly_summary', required=True,
                               help='metadata file downloaded from ncbi.')
update_rfq_parser.add_argument('--bac_assembly_summary', required=True,
                               help='metadata file downloaded from ncbi.')
update_rfq_parser.add_argument('--cpus', type=int, default=1,
                               help='Number of cpus')
update_rfq_parser.add_argument(
    '--silent', help="suppress output", action='store_true')

# 'update_genbank': mirrors the 'update_refseq' options, plus a required
# listing of the RefSeq genome directories.
update_gbk_parser = subparsers.add_parser(
    'update_genbank', formatter_class=CustomHelpFormatter, description='Update Genbank genomes.')
update_gbk_parser.add_argument('--ftp_genbank_directory', dest="ftp_genbank", required=True,
                               help='base directory leading to the FTP repository for genbank')
update_gbk_parser.add_argument('--new_genbank_directory', dest="output_dir",
                               required=True, help='base directory leading to the new repository for genbank')
update_gbk_parser.add_argument('--ftp_genbank_genome_dirs_file', dest="ftp_genbank_genome_dirs", required=True,
                               help='metadata file listing all directories for the FTP folder (generated by ncbi_genome_dirs.py)')
update_gbk_parser.add_argument('--old_genbank_genome_dirs_file', dest="old_genbank_genome_dirs", required=True,
                               help='metadata file listing all directories from the previous NCBI update date (generated by genome_dirs.py)')
# NOTE(review): the original help here was a verbatim copy of the
# --old_genbank_genome_dirs_file text; reworded from the option name to refer
# to the new RefSeq repository -- confirm against the consumer of
# args.new_refseq_genome_dirs.
update_gbk_parser.add_argument('--new_refseq_genome_dirs_file', dest="new_refseq_genome_dirs", required=True,
                               help='metadata file listing all directories for the new refseq repository (generated by genome_dirs.py)')
update_gbk_parser.add_argument('--arc_assembly_summary', required=True,
                               help='Genbank metadata file downloaded from ncbi.')
update_gbk_parser.add_argument('--bac_assembly_summary', required=True,
                               help='Genbank metadata file downloaded from ncbi.')
update_gbk_parser.add_argument('--cpus', type=int, default=1,
                               help='Number of cpus')
update_gbk_parser.add_argument(
    '--silent', help="suppress output", action='store_true')

# Steps to update LPSN Metadata
lpsn_parser = subparsers.add_parser('lpsn',
formatter_class=CustomHelpFormatter,
Expand Down Expand Up @@ -192,25 +248,25 @@ if __name__ == '__main__':
formatter_class=CustomHelpFormatter,
help='Compare the Metadata file from the previous version with the new one.')
overview_parser.add_argument(
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.',required=True)
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.', required=True)
overview_parser.add_argument(
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.',required=True)
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.', required=True)
overview_parser.add_argument(
'--only_ncbi', help='Output file.', action='store_true')
overview_parser.add_argument(
'--silent', help="suppress output", action='store_true')

metafield_parser = subparsers.add_parser('compare_field',
formatter_class=CustomHelpFormatter,
help='Compare a specific metadata field between to metadata files.')
formatter_class=CustomHelpFormatter,
help='Compare a specific metadata field between to metadata files.')
metafield_parser.add_argument(
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.',required=True)
'--previous_metadata_file', help='file indicating metadata of each genome in previous GTDB version.', required=True)
metafield_parser.add_argument(
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.',required=True)
'--new_metadata_file', help='file indicating metadata of each genome in latest GTDB version.', required=True)
metafield_parser.add_argument(
'--field_of_interest', help='common field to compare between files.',required=True)
'--field_of_interest', help='common field to compare between files.', required=True)
metafield_parser.add_argument(
'--output_file', help='Output file.',required=True)
'--output_file', help='Output file.', required=True)
metafield_parser.add_argument(
'--only_ncbi', help='Output file.', action='store_true')
metafield_parser.add_argument(
Expand Down
Loading

0 comments on commit 15c7a01

Please sign in to comment.