Skip to content

Commit

Permalink
Nucleotide annotation fix update
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmeaton committed Sep 5, 2019
1 parent fc7b7f9 commit e881005
Show file tree
Hide file tree
Showing 7 changed files with 1,653 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
191 changes: 191 additions & 0 deletions NCBImeta/config/test_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# ---------------------------------------------------------------------------
# Run-level settings for this test configuration.
# ---------------------------------------------------------------------------

# Tables selected for this configuration, and the search term for each one.
# Only "Nucleotide" is listed in both.
TABLES = ["Nucleotide"]
SEARCH_TERMS = {
    # Adjacent literals concatenate into one single-line query string.
    "Nucleotide": (
        "(plague OR pestis OR Yersinia pestis) "
        "AND WGS_MASTER[KYWD] "
        "AND nucleotide assembly[Filter]"
    ),
}

# Output directory and database file name.
# NOTE(review): presumably DATABASE is created inside OUTPUT_DIR -- confirm
# against the code that consumes this config.
OUTPUT_DIR = "NCBImeta/output"
DATABASE = "yersinia_pestis_db.sqlite"

# NOTE(review): presumably the contact address sent with NCBI requests --
# confirm against the code that consumes this config.
EMAIL = "eatonk3@mcmaster.ca"

# ---------------------------------------------------------------------------
# Column definitions, grouped by table name.
#
# Each table maps to an ordered list of single-entry dicts of the form
#     {column_name: source}
# where `source` is either one string or a list of strings.
#
# NOTE(review): presumably `source` names the field (or a path of nested
# fields/attributes) to pull from the NCBI record, and "NullValue" marks a
# column that is left empty for later manual comments -- confirm against the
# code that consumes this config.
# ---------------------------------------------------------------------------
TABLE_COLUMNS = {
    "Assembly": [
        {"AssemblyAccession": "AssemblyAccession"},
        {"AssemblyBioSampleAccession": "BioSampleAccn"},
        {"AssemblyBioSampleID": "BioSampleId"},
        {"AssemblyGenbankBioprojectAccession": ["GB_BioProjects", "BioprojectAccn"]},
        {"AssemblyGenbankID": "GbUid"},
        {"AssemblyRefseqBioprojectAccession": ["RS_BioProjects", "BioprojectAccn"]},
        {"AssemblyRefSeqCategory": "RefSeq_category"},
        {"AssemblyRefSeqID": "RsUid"},
        {"AssemblyWGSAccession": "WGS"},
        {"AssemblyInfraspecies": ["InfraspeciesList", "Biosource", "Sub_value"]},
        {"AssemblyIsolate": "Isolate"},
        {"AssemblyOrganism": "Organism"},
        {"AssemblySpeciesTaxonomicID": "SpeciesTaxid"},
        {"AssemblySpeciesName": "SpeciesName"},
        {"AssemblyTaxonomicID": "Taxid"},
        {"AssemblyName": "AssemblyName"},
        {"AssemblyStatus": "AssemblyStatus"},
        {"AssemblyType": "AssemblyType"},
        {"AssemblyCoverage": "Coverage"},
        {"AssemblyChromosomes": ["Stat", "chromosome_count", "category"]},
        {"AssemblyContigCount": ["Stat", "contig_count", "category"]},
        {"AssemblyContigN50": ["Stat", "contig_n50", "category"]},
        {"AssemblyContigL50": ["Stat", "contig_l50", "category"]},
        {
            "AssemblyNonChromosomalReplicons": [
                "Stat",
                "non_chromosome_replicon_count",
                "category",
            ]
        },
        {"AssemblyReplicons": ["Stat", "replicon_count", "category"]},
        {"AssemblyScaffolds": ["Stat", "scaffold_count", "category"]},
        {"AssemblyScaffoldN50": ["Stat", "scaffold_n50", "category"]},
        {"AssemblyScaffoldL50": ["Stat", "scaffold_l50", "category"]},
        {"AssemblyTotalLength": ["Stat", "total_length", "category"]},
        {"AssemblyUngappedLength": ["Stat", "ungapped_length", "category"]},
        {"AssemblySubmitterOrganization": "SubmitterOrganization"},
        {"AssemblySubmissionDate": "SubmissionDate"},
        {"AssemblyReleaseDate": "SeqReleaseDate"},
        {"AssemblyFTPAssemblyReport": "FtpPath_Assembly_rpt"},
        {"AssemblyFTPGenbank": "FtpPath_GenBank"},
        {"AssemblyFTPRefSeq": "FtpPath_RefSeq"},
        {"AssemblyFTPStatsReport": "FtpPath_Stats_rpt"},
        {"AssemblyComment": "NullValue"},
    ],
    "BioSample": [
        {"BioSampleAccession": "Accession"},
        {"BioSampleAccessionSecondary": "NullValue"},
        {"BioSampleBioProjectAccession": ["Link", "label"]},
        {"BioSampleSRAAccession": ["Id", "SRA", "db"]},
        {"BioSampleTitle": "Title"},
        {"BioSampleName": ["Id", "Sample name", "db_label"]},
        {"BioSampleType": ["Attribute", "sample_type", "harmonized_name"]},
        {"BioSamplePackage": "Package"},
        {"BioSampleInfraspecies": "Infraspecies"},
        {"BioSampleOrganism": "OrganismName"},
        {"BioSampleOrganismAlt": ["Organism", "taxonomy_name"]},
        {"BioSampleSubSpecies": ["Attribute", "sub_species", "harmonized_name"]},
        {"BioSampleStrain": ["Attribute", "strain", "harmonized_name"]},
        {"BioSampleTaxonomyID": "Taxonomy"},
        {"BioSampleBiovar": ["Attribute", "biovar", "harmonized_name"]},
        {"BioSampleCollectionDate": ["Attribute", "collection_date", "harmonized_name"]},
        {"BioSampleGeographicLocation": ["Attribute", "geo_loc_name", "harmonized_name"]},
        {"BioSampleHost": ["Attribute", "host", "harmonized_name"]},
        {"BioSampleHostDisease": ["Attribute", "host_disease", "harmonized_name"]},
        {"BioSampleIsolateNameAlias": ["Attribute", "isolate_name_alias", "harmonized_name"]},
        {"BioSampleIsolationSource": ["Attribute", "isolation_source", "harmonized_name"]},
        {"BioSampleLat": ["Attribute", "latitude", "harmonized_name"]},
        {"BioSampleLatLon": ["Attribute", "lat_lon", "harmonized_name"]},
        {"BioSampleLon": ["Attribute", "longitude", "harmonized_name"]},
        {"BioSampleDate": "Date"},
        {"BioSampleModificationDate": "ModificationDate"},
        {"BioSamplePublicationDate": "PublicationDate"},
        {"BioSampleOrganization": "Organization"},
        {"BioSampleComment": "NullValue"},
    ],
    "BioProject": [
        {"BioProjectAccession": "Project_Acc"},
        {"BioProjectDataType": "Project_Data_Type"},
        {"BioProjectDescription": "Project_Description"},
        {"BioProjectMethodType": "Project_MethodType"},
        {"BioProjectName": "Project_Name"},
        {"BioProjectTargetCapture": "Project_Target_Capture"},
        {"BioProjectTargetMaterial": "Project_Target_Material"},
        {"BioProjectTargetScope": "Project_Target_Scope"},
        {"BioProjectTitle": "Project_Title"},
        {"BioProjectType": "Project_Type"},
        {"BioProjectOrganismLabel": "Organism_Label"},
        {"BioProjectOrganismStrain": "Organism_Strain"},
        {"BioProjectSupergroup": "Supergroup"},
        {"BioProjectTaxonomicID": "TaxId"},
        {"BioProjectRegistrationDate": "Registration_Date"},
        {"BioProjectRelevanceMedical": "Relevance_Medical"},
        {"BioProjectSequencingStatus": "Sequencing_Status"},
        {"BioProjectSubmitterOrganization": "Submitter_Organization"},
        {"BioProjectComment": "NullValue"},
    ],
    "Nucleotide": [
        # Columns whose sources are "GBSeq_*" / "GBAltSeq*" / "GBReference_*"
        # names.
        {"NucleotideAccession": "GBSeq_primary-accession"},
        {"NucleotideAccessionVersion": "GBSeq_accession-version"},
        {"NucleotideBioSampleAccession": "NucleotideBioSample"},
        {"NucleotideBioProjectAccession": "GBSeq_project"},
        {
            "NucleotideFirstAccession": [
                "GBAltSeqData_items",
                "GBSeq_alt-seq",
                "GBAltSeqItem_first-accn",
            ]
        },
        {
            "NucleotideLastAccession": [
                "GBAltSeqData_items",
                "GBSeq_alt-seq",
                "GBAltSeqItem_last-accn",
            ]
        },
        {"NucleotideOrganism": "GBSeq_organism"},
        {"NucleotideTaxonomy": "GBSeq_taxonomy"},
        {"NucleotideDefinition": "GBSeq_definition"},
        {"NucleotideDivision": "GBSeq_division"},
        {"NucleotideJournal": ["GBSeq_references", "GBReference_journal"]},
        {"NucleotideLength": "GBSeq_length"},
        {"NucleotideMoleculeType": "GBSeq_moltype"},
        {"NucleotideReferenceTitle": ["GBSeq_references", "GBReference_title"]},
        {"NucleotideSeqDataName": ["GBSeq_alt-seq", "GBAltSeqData_name"]},
        {"NucleotideSource": "GBSeq_source"},
        {"NucleotideStrandedness": "GBSeq_strandedness"},
        {"NucleotideTopology": "GBSeq_topology"},
        {"NucleotideCreateDate": "GBSeq_create-date"},
        {"NucleotideUpdateDate": "GBSeq_update-date"},
        {"NucleotideGenBankComment": "GBSeq_comment"},
        # Columns whose sources are human-readable labels.
        # NOTE(review): presumably these labels come from the structured
        # annotation/assembly section of the GenBank comment -- confirm.
        # NOTE(review): many keys below start with "Nucleotidet" (extra "t"),
        # and "Softwarerevision" has a lowercase "r". These look like typos,
        # but the keys are kept byte-for-byte because other code or an existing
        # database may depend on them. "NucleotidetRNAs" (source "tRNAs") may
        # be intentional.
        {"NucleotideAnnotationDate": "Annotation Date"},
        {"NucleotideAnnotationMethod": "Annotation Method"},
        {"NucleotideAnnotationPipeline": "Annotation Pipeline"},
        {"NucleotideAnnotationProvider": "Annotation Provider"},
        {"NucleotideAnnotationSoftwarerevision": "Annotation Software revision"},
        {"NucleotideAssemblyDate": "Assembly Date"},
        {"NucleotideAssemblyMethod": "Assembly Method"},
        {"NucleotideAssemblyName": "Assembly Name"},
        {"NucleotidetCDS": "CDS (total)"},
        {"NucleotidetCDSCoding": "CDS (coding)"},
        {"NucleotidetCRISPRArrays": "CRISPR Arrays"},
        {"NucleotidetExpectedFinalVersion": "Expected Final Version"},
        {"NucleotidetFeaturesAnnotated": "Features Annotated"},
        {"NucleotidetGenes": "Genes (total)"},
        {"NucleotidetGenesCoding": "Genes (coding)"},
        {"NucleotidetGenesRNA": "Genes (RNA)"},
        {"NucleotidetGenomeCoverage": "Genome Coverage"},
        {"NucleotidetGenomeRepresentation": "Genome Representation"},
        {"NucleotidetncRNAs": "ncRNAs"},
        {"NucleotidetPseudoGenes": "Pseudo Genes (total)"},
        {"NucleotidetPseudoGenesAmbResidues": "Pseudo Genes (ambiguous residues)"},
        {"NucleotidetPseudoGenesFrameshifted": "Pseudo Genes (frameshifted)"},
        {"NucleotidetPseudoGenesIncomplete": "Pseudo Genes (incomplete)"},
        {"NucleotidetPseudoGenesInternalStop": "Pseudo Genes (internal stop)"},
        {"NucleotidetPseudoGenesMultipleProblems": "Pseudo Genes (multiple problems)"},
        {"NucleotidetrRNAs": "rRNAs"},
        {"NucleotidetrRNAsComplete": "complete rRNAs"},
        {"NucleotidetrRNAsPartial": "partial rRNAs"},
        {"NucleotideSequencingTechnology": "Sequencing Technology"},
        {"NucleotidetRNAs": "tRNAs"},
        {"NucleotideComment": "NullValue"},
    ],
    "SRA": [
        {"SRABioProjectAccession": "Bioproject"},
        {"SRABioSampleAccession": "Biosample"},
        {"SRAExperimentAccession": ["Experiment", "acc"]},
        {"SRARunAccession": ["Run", "acc"]},
        {"SRASampleAccession": ["Sample", "acc"]},
        {"SRAExperimentName": ["Experiment", "name"]},
        {"SRAExperimentStatus": ["Experiment", "status"]},
        {"SRAExperimentVersion": ["Experiment", "ver"]},
        {"SRAIsPublic": ["Run", "is_public"]},
        {"SRASampleName": ["Sample", "name"]},
        {"SRAStaticDataAvailable": ["Run", "static_data_available"]},
        {"SRAStudyAcc": ["Study", "acc"]},
        # NOTE(review): "SRAStudName" looks like a typo for "SRAStudyName";
        # kept as-is because it is a runtime key.
        {"SRAStudName": ["Study", "name"]},
        {"SRATitle": "Title"},
        {"SRAOrganismName": ["Organism", "CommonName"]},
        {"SRAOrganismTaxID": ["Organism", "taxid"]},
        {"SRAClusterName": ["Statistics", "cluster_name"]},
        {"SRAInstrumentModel": ["Platform", "instrument_model"]},
        {"SRALibraryName": "LIBRARY_NAME"},
        {"SRALibraryLayout": "LIBRARY_LAYOUT"},
        {"SRALibrarySelection": "LIBRARY_SELECTION"},
        {"SRALibrarySource": "LIBRARY_SOURCE"},
        {"SRALibraryStrategy": "LIBRARY_STRATEGY"},
        {"SRAPlatform": "Platform"},
        {"SRATotalBases": ["Statistics", "total_bases"]},
        {"SRATotalSize": ["Statistics", "total_size"]},
        {"SRATotalSpots": ["Statistics", "total_spots"]},
        {"SRATotalRuns": ["Statistics", "total_runs"]},
        {"SRACreateDate": "CreateDate"},
        {"SRAUpdateDate": "UpdateDate"},
        {"SRACenterName": ["Submitter", "center_name"]},
        {"SRAContactName": ["Submitter", "contact_name"]},
        {"SRALabName": ["Submitter", "lab_name"]},
        {"SRASubmitterAccession": ["Submitter", "acc"]},
        {"SRAComment": "NullValue"},
    ],
}
Binary file modified NCBImeta/output/yersinia_pestis_db.sqlite
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit e881005

Please sign in to comment.