From 51771350a073c82bec6140d99bdf26e789cc804b Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Thu, 25 May 2017 08:59:00 -0700 Subject: [PATCH 1/7] schema changes for fields --- metadb/models/db_array.py | 1 + metadb/models/field.py | 14 +++++++++++++- metadb/models/field_set.py | 35 +++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 metadb/models/field_set.py diff --git a/metadb/models/db_array.py b/metadb/models/db_array.py index 6b3a8b5..29e5cc4 100644 --- a/metadb/models/db_array.py +++ b/metadb/models/db_array.py @@ -31,6 +31,7 @@ class DBArray(_Base): id = sa.Column(BigInteger, primary_key=True) guid = sa.Column(sa.String(36), nullable=False, unique=True) reference_set_id = sa.Column(BigInteger, sa.ForeignKey('reference_set.id'), nullable=False) + field_set_id = sa.Column(BigInteger, sa.ForeignKey('field_set.id'), nullable=False) workspace_id = sa.Column(BigInteger, sa.ForeignKey('workspace.id'), nullable=False) # num_rows that exist in a given array - must be incremented after a new row is added # When creating a new array, by default no rows exist - hence, num_rows=0 diff --git a/metadb/models/field.py b/metadb/models/field.py index 18d6fca..0283d57 100644 --- a/metadb/models/field.py +++ b/metadb/models/field.py @@ -27,4 +27,16 @@ class Field(_Base): __tablename__ = "field" id = sa.Column(BigInteger, primary_key=True) - field = sa.Column(sa.Text, nullable=False) + guid = sa.Column(sa.String(36), nullable=False, unique=True) + name = sa.Column(sa.Text, nullable=False) + field_set_id = sa.Column(BigInteger, sa.ForeignKey('field_set.id'), nullable=False) + md5_checksum = sa.Column(sa.String(32)) + # Unique constraint on (field_set_id, name) + __table_args__ = ( + sa.UniqueConstraint('field_set_id', 'name', + name='unique_name_per_field_set_constraint'), + ) + type = sa.Column(sa.String(6), nullable=False) + vcf_field_class = sa.Column(sa.Text) + length = sa.Column(sa.String(4), nullable=False) + vcf_field_combine_operation = sa.Column(sa.String(20)) diff --git a/metadb/models/field_set.py b/metadb/models/field_set.py new file mode 100644 index 0000000..3f90a55 --- /dev/null +++ b/metadb/models/field_set.py @@ -0,0 +1,35 @@ +""" + The MIT License (MIT) + Copyright (c) 2016 Intel Corporation + + Permission is hereby granted, free of charge, to any person obtaining a copy of + this software and associated documentation files (the "Software"), to deal in + the Software without restriction, including without limitation the rights to + use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of + the Software, and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+""" + +from ..models import _Base, BigInteger +import sqlalchemy as sa +from sqlalchemy.orm import relationship, backref + + +class FieldSet(_Base): + __tablename__ = "field_set" + id = sa.Column(BigInteger, primary_key=True) + guid = sa.Column(sa.String(36), nullable=False, unique=True) + md5_checksum = sa.Column(sa.String(32)) + description = sa.Column(sa.Text) + arrays = relationship('DBArray', backref='field_set') + fields = relationship('Field', backref='field_set') diff --git a/requirements.txt b/requirements.txt index f8c4a23..e007fb4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,3 +26,4 @@ alembic==0.8.2 psycopg2==2.6.1 PyVCF==0.6.8 pysam==0.9.0 +sqlalchemy_schemadisplay=1.3 From 52771a349397f0905a80d099b620255a38cd5116 Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Thu, 25 May 2017 09:03:35 -0700 Subject: [PATCH 2/7] schema changes for fields --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e007fb4..ec551db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,4 +26,4 @@ alembic==0.8.2 psycopg2==2.6.1 PyVCF==0.6.8 pysam==0.9.0 -sqlalchemy_schemadisplay=1.3 +sqlalchemy_schemadisplay==1.3 From 5d4256b5063acd5d96894d4a9d2c6a2d9658d585 Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Mon, 5 Jun 2017 16:01:24 -0700 Subject: [PATCH 3/7] schema enhanced for fields --- .../4f93dc7aa8e8_create_field_set_table.py | 57 +++++++++++ metadb/api/dbimport.py | 94 ++++++++++++++++++- metadb/api/query.py | 1 + metadb/models/__init__.py | 1 + metadb/models/field.py | 19 ++-- metadb/models/field_set.py | 3 +- requirements.txt | 1 + utils/helper.py | 10 +- utils/vcf_importer.py | 7 +- 9 files changed, 175 insertions(+), 18 deletions(-) create mode 100644 metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py diff --git a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py new file mode 100644 index 0000000..99a3d95 --- /dev/null +++ b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py @@ -0,0 +1,57 @@ +"""create field_set table + +Revision ID: 4f93dc7aa8e8 +Revises: 4631177f4ecc +Create Date: 2017-05-25 11:22:32.344826 + +""" + +# revision identifiers, used by Alembic. +revision = '4f93dc7aa8e8' +down_revision = '4631177f4ecc' +branch_labels = None +depends_on = None + +from alembic import op +import sqlalchemy as sa + + +def upgrade(): + # making ontology terms strings for now + # leaving out externalId, diseases, pheno, etc. 
mappings for now
+    op.drop_table('field')
+    op.create_table(
+        'field_set',
+        sa.Column('id', sa.BigInteger, primary_key=True),
+        sa.Column('guid', sa.String(36), nullable=False, unique=True),
+        sa.Column('assembly_id', sa.String(100), nullable=False),
+        sa.Column('md5_checksum', sa.String(32)),
+        sa.Column('description', sa.Text)
+    )
+    op.create_table(
+        'field',
+        sa.Column('id', sa.String(32), primary_key=True),
+        sa.Column('guid', sa.String(36), nullable=False, unique=True),
+        sa.Column('field', sa.String(32), nullable=False),
+        sa.Column('field_set_id', sa.BigInteger, sa.ForeignKey('field_set.id'), nullable=False),
+        sa.Column('md5_checksum', sa.String(32)),
+        sa.Column('type', sa.Enum('Integer', 'String', 'Float', 'Flag', name='type_enum')),
+        sa.Column('is_filter', sa.Boolean, nullable=False),
+        sa.Column('is_format', sa.Boolean, nullable=False),
+        sa.Column('is_info', sa.Boolean, nullable=False),
+        sa.Column('length_type', sa.Enum('A', 'R', 'G', 'VAR', 'NUM', name='length_enum')),
+        sa.Column('length_intval', sa.Integer)
+    )
+    op.create_unique_constraint('unique_name_per_field_set_constraint', 'field', ['field_set_id', 'field'])
+    op.add_column(u'db_array', sa.Column('field_set_id', sa.BigInteger, sa.ForeignKey('field_set.id'), nullable=False))
+
+def downgrade():
+    # drop the constraint created in upgrade(); it is named per field_set, not per reference_set
+    op.drop_constraint('unique_name_per_field_set_constraint', 'field', type_='unique')
+    op.drop_table('field')
+    op.drop_table('field_set')
+    op.create_table(
+        'field',
+        sa.Column('id', sa.BigInteger, primary_key=True),
+        sa.Column('field', sa.Text, nullable=False)
+    )
+    op.drop_column(u'db_array', 'field_set_id')
diff --git a/metadb/api/dbimport.py b/metadb/api/dbimport.py
index e190f9e..3e8f625 100644
--- a/metadb/api/dbimport.py
+++ b/metadb/api/dbimport.py
@@ -31,6 +31,8 @@
 from metadb.models import Individual
 from metadb.models import Reference
 from metadb.models import ReferenceSet
+from metadb.models import Field
+from metadb.models import FieldSet
 from metadb.models import Sample
 from metadb.models import VariantSet
 from metadb.models import Workspace
@@ -62,6 +64,7 @@ class Import():
 
     def __init__(self, db):
         self.db = db
+        self.length_type_map = {-1:'A', -2:'G', -3:'R', None:'VAR'}
 
     def __enter__(self):
         self.session = self.db.Session()
@@ -134,6 +137,93 @@ def registerReference(self, guid, reference_set_id, name, length):
 
         return reference
 
+    def registerFieldSet(self, guid, reader, assembly_id, description=None):
+        fieldSet = self.session.query(FieldSet).filter(
+            or_(FieldSet.assembly_id == assembly_id, FieldSet.guid == guid))\
+            .first()
+        fieldMap = {}
+
+        if fieldSet is None:
+            try:
+                fieldSet = FieldSet(guid=guid, assembly_id=assembly_id, description=description)
+                self.session.add(fieldSet)
+                self.session.commit()
+
+            except exc.DataError as e:
+                self.session.rollback()
+                raise ValueError("{0} : {1} ".format(str(e), guid))
+
+        if ((reader.filters != None) and (len(reader.filters) > 0)):
+            for filter_name, filter_item in reader.filters.items():
+                item_list = []
+                item_list.append(filter_item)
+                fieldMap[filter_name] = item_list
+
+        if ((reader.formats != None) and (len(reader.formats) > 0)):
+            for format_name, format_item in reader.formats.items():
+                if (format_name in fieldMap):
+                    fieldMap[format_name].append(format_item)
+                else:
+                    item_list = []
+                    item_list.append(format_item)
+                    fieldMap[format_name] = item_list
+
+        if ((reader.infos != None) and (len(reader.infos) > 0)):
+            for info_name, info_item in reader.infos.items():
+                if (info_name in fieldMap):
+                    fieldMap[info_name].append(info_item)
+                else:
item_list = [] + item_list.append(info_item) + fieldMap[info_name] = item_list + + if (len(fieldMap) > 0): + for field_name, item_list in fieldMap.items(): + self.registerField(str(uuid.uuid4()), fieldSet.id, field_name, item_list) + + return fieldSet + + def registerField(self, guid, field_set_id, field_name, field_item_list): + field = self.session.query(Field).filter( + and_(Field.field_set_id == field_set_id, Field.field == field_name)).first() + + if field is None: + try: + field = Field() + field.is_filter = False + field.is_format = False + field.is_info = False + field.id = field_name + field.guid = guid + field.field = field_name + field.field_set_id = field_set_id + + for field_item in field_item_list: + if ('type' in field_item.__dict__.keys()): + field.type = field_item.type + if ('Filter' in str(type(field_item))): + field.is_filter = True + if ('Format' in str(type(field_item))): + field.is_format = True + if ('Info' in str(type(field_item))): + field.is_info = True + if ('num' in field_item.__dict__.keys()): + number = field_item.num + if ((None == number) or (number < 0)): + field.length_type = self.length_type_map[number] + else: + field.length_type = 'NUM' + field.length_intval = number + + self.session.add(field) + self.session.commit() + + except exc.DataError as e: + self.session.rollback() + raise ValueError("{0} : {1} ".format(str(e), guid)) + + return field + def registerWorkspace(self, guid, name): """ Registers a workspace. @@ -164,7 +254,7 @@ def registerWorkspace(self, guid, name): return workspace - def registerDBArray(self, guid, reference_set_id, workspace_id, name): + def registerDBArray(self, guid, reference_set_id, field_set_id, workspace_id, name): """ Registers a DBArray. An array is unique named folder in a unique workspace path and a given reference id. 
@@ -175,6 +265,7 @@ def registerDBArray(self, guid, reference_set_id, workspace_id, name): dbarray = self.session.query(DBArray) .filter( and_(DBArray.reference_set_id == reference_set_id,\ DBArray.workspace_id == workspace_id,\ + DBArray.field_set_id == field_set_id,\ DBArray.name == name))\ .first() @@ -183,6 +274,7 @@ def registerDBArray(self, guid, reference_set_id, workspace_id, name): dbarray = DBArray( guid=guid, reference_set_id=reference_set_id, + field_set_id=field_set_id, workspace_id=workspace_id, name=name ) diff --git a/metadb/api/query.py b/metadb/api/query.py index 9f14386..8a135f8 100644 --- a/metadb/api/query.py +++ b/metadb/api/query.py @@ -28,6 +28,7 @@ from itertools import chain from metadb.models import Field +from metadb.models import FieldSet from metadb.models import Individual from metadb.models import Workspace from metadb.models import DBArray diff --git a/metadb/models/__init__.py b/metadb/models/__init__.py index e873290..4f36c4c 100644 --- a/metadb/models/__init__.py +++ b/metadb/models/__init__.py @@ -42,6 +42,7 @@ def get_tiledb_padded_reference_length_string_default(reference_length_str): from .inc_counter import BigInteger from .inc_counter import autoinc_handler from .field import Field +from .field_set import FieldSet from .reference_set import ReferenceSet from .reference import Reference from .source_accession import SourceAccession diff --git a/metadb/models/field.py b/metadb/models/field.py index 0283d57..1d1d48f 100644 --- a/metadb/models/field.py +++ b/metadb/models/field.py @@ -22,21 +22,24 @@ from ..models import _Base, BigInteger import sqlalchemy as sa - +import enum class Field(_Base): __tablename__ = "field" - id = sa.Column(BigInteger, primary_key=True) + id = sa.Column(sa.String(32), primary_key=True) guid = sa.Column(sa.String(36), nullable=False, unique=True) - name = sa.Column(sa.Text, nullable=False) + field = sa.Column(sa.String(32), nullable=False) field_set_id = sa.Column(BigInteger, sa.ForeignKey('field_set.id'), nullable=False) md5_checksum = sa.Column(sa.String(32)) # Unique constraint on (field_set_id, name) __table_args__ = ( - sa.UniqueConstraint('field_set_id', 'name', + sa.UniqueConstraint('field_set_id', 'field', name='unique_name_per_field_set_constraint'), ) - type = sa.Column(sa.String(6), nullable=False) - vcf_field_class = sa.Column(sa.Text) - length = sa.Column(sa.String(4), nullable=False) - vcf_field_combine_operation = sa.Column(sa.String(20)) + # int/float/char/flag + type = sa.Column(sa.Enum('Integer', 'String', 'Float', 'Flag', name='type_enum')) + is_filter = sa.Column(sa.Boolean, nullable=False) + is_format = sa.Column(sa.Boolean, nullable=False) + is_info = sa.Column(sa.Boolean, nullable=False) + length_type = sa.Column(sa.Enum('A', 'R', 'G', 'VAR', 'NUM', name='length_enum')) + length_intval = sa.Column(sa.Integer) diff --git a/metadb/models/field_set.py b/metadb/models/field_set.py index 3f90a55..1703481 100644 --- a/metadb/models/field_set.py +++ b/metadb/models/field_set.py @@ -29,7 +29,8 @@ class FieldSet(_Base): __tablename__ = "field_set" id = sa.Column(BigInteger, primary_key=True) guid = sa.Column(sa.String(36), nullable=False, unique=True) - md5_checksum = sa.Column(sa.String(32)) description = sa.Column(sa.Text) + md5_checksum = sa.Column(sa.String(32)) + assembly_id = sa.Column(sa.String(100)) arrays = relationship('DBArray', backref='field_set') fields = relationship('Field', backref='field_set') diff --git a/requirements.txt b/requirements.txt index ec551db..23b30af 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ psycopg2==2.6.1 PyVCF==0.6.8 pysam==0.9.0 sqlalchemy_schemadisplay==1.3 +enum34==1.1.6 diff --git a/utils/helper.py b/utils/helper.py index 9baa4fc..967961d 100644 --- a/utils/helper.py +++ b/utils/helper.py @@ -150,10 +150,11 @@ def writeVIDMappingFile(DB_URI, reference_set_id, output_file, fields_dict=Const writeJSON2File(vid_mapping, output_file) -def registerWithMetadb(config, vcf=False, references=OrderedDict()): +def registerWithMetadb(config, reader, vcf=False): """ Registers parent object of a callset in metadb for both MAF and VCF importing. """ + if not vcf: # set MAF specific vars with open(config.TileDBAssembly) as config_file: @@ -180,18 +181,21 @@ def registerWithMetadb(config, vcf=False, references=OrderedDict()): rs = metadb.registerReferenceSet( str(uuid.uuid4()), assembly, - references=references) + references=reader.contigs) + fs = metadb.registerFieldSet( + str(uuid.uuid4()), reader, assembly) dba = metadb.registerDBArray( guid=str(uuid.uuid4()), name=array, reference_set_id=rs.id, + field_set_id=fs.id, workspace_id=ws.id) vs = metadb.registerVariantSet( guid=str(uuid.uuid4()), reference_set_id=rs.id, dataset_id=os.path.basename(workspace)) - return dba, vs, rs + return dba, vs, rs, fs def createMappingFiles(outputDir, callset_mapping, rs_id, DB_URI, array, loader_config=None): diff --git a/utils/vcf_importer.py b/utils/vcf_importer.py index f2f358b..02cae86 100644 --- a/utils/vcf_importer.py +++ b/utils/vcf_importer.py @@ -81,9 +81,7 @@ def __enter__(self): self.source_idx = conf.get('source_idx', 0) self.target_idx = conf.get('target_idx', 1) - self.array, self.variantset, self.referenceset = helper.registerWithMetadb( - conf, vcf=True, references=self.reader.contigs) - + self.array, self.variantset, self.referenceset, self.fieldset = helper.registerWithMetadb(conf, self.reader, vcf=True) return self def __exit__(self, exc_type, exc_value, traceback): @@ -260,7 +258,6 @@ def poolImportVCF(file_info): return (-1, inputFile) return (0, inputFile, vc.callset_mapping) - def parallelGen(config_file, inputFileList, outputDir, callset_file=None, loader_config=None): """ Spawns the Pool of VCF objects to work on each input VCF @@ -273,7 +270,7 @@ def parallelGen(config_file, inputFileList, outputDir, callset_file=None, loader # if no contig information in header, assume it's already registered with open(inputFileList[0], 'rb') as vcf_init: reader = vcf.Reader(vcf_init) - dba, vs, rs = helper.registerWithMetadb(config, vcf=True, references=reader.contigs) + dba, vs, rs, fs = helper.registerWithMetadb(config, reader, vcf=True) # sort and index vcfs, and # build arguments for parallel gen From 0163a70904beaec8ac5ec86166cde20e18c99a27 Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Wed, 7 Jun 2017 17:17:10 -0700 Subject: [PATCH 4/7] fixed schema for field and field_set --- .../4f93dc7aa8e8_create_field_set_table.py | 5 +-- metadb/api/dbimport.py | 34 +++++++++++++--- metadb/models/field.py | 4 +- metadb/models/field_set.py | 2 - utils/helper.py | 40 ++++++++++--------- utils/vcf_importer.py | 1 + 6 files changed, 52 insertions(+), 34 deletions(-) diff --git a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py index 99a3d95..02fbf8e 100644 --- a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py +++ b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py @@ -24,17 +24,14 @@ def upgrade(): 'field_set', sa.Column('id', 
sa.BigInteger, primary_key=True),
         sa.Column('guid', sa.String(36), nullable=False, unique=True),
-        sa.Column('assembly_id', sa.String(100), nullable=False),
-        sa.Column('md5_checksum', sa.String(32)),
         sa.Column('description', sa.Text)
     )
     op.create_table(
         'field',
-        sa.Column('id', sa.String(32), primary_key=True),
+        sa.Column('id', sa.BigInteger, primary_key=True),
         sa.Column('guid', sa.String(36), nullable=False, unique=True),
         sa.Column('field', sa.String(32), nullable=False),
         sa.Column('field_set_id', sa.BigInteger, sa.ForeignKey('field_set.id'), nullable=False),
-        sa.Column('md5_checksum', sa.String(32)),
         sa.Column('type', sa.Enum('Integer', 'String', 'Float', 'Flag', name='type_enum')),
         sa.Column('is_filter', sa.Boolean, nullable=False),
         sa.Column('is_format', sa.Boolean, nullable=False),
diff --git a/metadb/api/dbimport.py b/metadb/api/dbimport.py
index 3e8f625..47a807a 100644
--- a/metadb/api/dbimport.py
+++ b/metadb/api/dbimport.py
@@ -137,15 +137,13 @@ def registerReference(self, guid, reference_set_id, name, length):
 
         return reference
 
-    def registerFieldSet(self, guid, reader, assembly_id, description=None):
-        fieldSet = self.session.query(FieldSet).filter(
-            or_(FieldSet.assembly_id == assembly_id, FieldSet.guid == guid))\
-            .first()
+    def registerFieldSet(self, guid, reader, description=None):
+        fieldSet = self.session.query(FieldSet).filter(FieldSet.guid == guid).first()
         fieldMap = {}
 
         if fieldSet is None:
             try:
-                fieldSet = FieldSet(guid=guid, assembly_id=assembly_id, description=description)
+                fieldSet = FieldSet(guid=guid, description=description)
                 self.session.add(fieldSet)
                 self.session.commit()
 
@@ -193,7 +191,6 @@ def registerField(self, guid, field_set_id, field_name, field_item_list):
                 field.is_filter = False
                 field.is_format = False
                 field.is_info = False
-                field.id = field_name
                 field.guid = guid
                 field.field = field_name
                 field.field_set_id = field_set_id
@@ -254,6 +251,31 @@ def registerWorkspace(self, guid, name):
 
         return workspace
 
+    def queryDBArray(self, workspace_name, dbarray_name):
+        dba = None
+        vs = None
+        rs = None
+        fs = None
+        try:
+            workspace = self.session.query(Workspace).filter(Workspace.name == workspace_name).first()
+            if workspace is not None:
+                wsid = workspace.id
+                dba = self.session.query(DBArray).filter(
+                    and_(DBArray.workspace_id == wsid,\
+                         DBArray.name == dbarray_name))\
+                    .first()
+                if dba is not None:
+                    dbid = dba.id
+                    rsid = dba.reference_set_id
+                    fsid = dba.field_set_id
+                    rs = self.session.query(ReferenceSet).filter(ReferenceSet.id == rsid).first()
+                    fs = self.session.query(FieldSet).filter(FieldSet.id == fsid).first()
+                    vs = self.session.query(VariantSet).filter(VariantSet.reference_set_id == rsid).first()
+        except exc.DataError as e:
+            self.session.rollback()
+            raise ValueError("{0} : {1} : {2} ".format(str(e), workspace_name, dbarray_name))
+        return dba, vs, rs, fs
+
     def registerDBArray(self, guid, reference_set_id, field_set_id, workspace_id, name):
         """
         Registers a DBArray.
diff --git a/metadb/models/field.py b/metadb/models/field.py index 1d1d48f..8faf509 100644 --- a/metadb/models/field.py +++ b/metadb/models/field.py @@ -26,17 +26,15 @@ class Field(_Base): __tablename__ = "field" - id = sa.Column(sa.String(32), primary_key=True) + id = sa.Column(sa.BigInteger, primary_key=True) guid = sa.Column(sa.String(36), nullable=False, unique=True) field = sa.Column(sa.String(32), nullable=False) field_set_id = sa.Column(BigInteger, sa.ForeignKey('field_set.id'), nullable=False) - md5_checksum = sa.Column(sa.String(32)) # Unique constraint on (field_set_id, name) __table_args__ = ( sa.UniqueConstraint('field_set_id', 'field', name='unique_name_per_field_set_constraint'), ) - # int/float/char/flag type = sa.Column(sa.Enum('Integer', 'String', 'Float', 'Flag', name='type_enum')) is_filter = sa.Column(sa.Boolean, nullable=False) is_format = sa.Column(sa.Boolean, nullable=False) diff --git a/metadb/models/field_set.py b/metadb/models/field_set.py index 1703481..7081c66 100644 --- a/metadb/models/field_set.py +++ b/metadb/models/field_set.py @@ -30,7 +30,5 @@ class FieldSet(_Base): id = sa.Column(BigInteger, primary_key=True) guid = sa.Column(sa.String(36), nullable=False, unique=True) description = sa.Column(sa.Text) - md5_checksum = sa.Column(sa.String(32)) - assembly_id = sa.Column(sa.String(100)) arrays = relationship('DBArray', backref='field_set') fields = relationship('Field', backref='field_set') diff --git a/utils/helper.py b/utils/helper.py index 967961d..981d0e2 100644 --- a/utils/helper.py +++ b/utils/helper.py @@ -175,25 +175,27 @@ def registerWithMetadb(config, reader, vcf=False): dbimport = DBImport(config['dburi']) with dbimport.getSession() as metadb: - # register workspace, referenceset, array, and variantset - ws = metadb.registerWorkspace( - str(uuid.uuid4()), workspace) - rs = metadb.registerReferenceSet( - str(uuid.uuid4()), - assembly, - references=reader.contigs) - fs = metadb.registerFieldSet( - str(uuid.uuid4()), reader, assembly) - dba = metadb.registerDBArray( - guid=str(uuid.uuid4()), - name=array, - reference_set_id=rs.id, - field_set_id=fs.id, - workspace_id=ws.id) - vs = metadb.registerVariantSet( - guid=str(uuid.uuid4()), - reference_set_id=rs.id, - dataset_id=os.path.basename(workspace)) + dba, vs, rs, fs = metadb.queryDBArray(workspace, array) + if dba is None: + # register workspace, referenceset, array, and variantset + ws = metadb.registerWorkspace( + str(uuid.uuid4()), workspace) + rs = metadb.registerReferenceSet( + str(uuid.uuid4()), + assembly, + references=reader.contigs) + fs = metadb.registerFieldSet( + str(uuid.uuid4()), reader, assembly) + dba = metadb.registerDBArray( + guid=str(uuid.uuid4()), + name=array, + reference_set_id=rs.id, + field_set_id=fs.id, + workspace_id=ws.id) + vs = metadb.registerVariantSet( + guid=str(uuid.uuid4()), + reference_set_id=rs.id, + dataset_id=os.path.basename(workspace)) return dba, vs, rs, fs diff --git a/utils/vcf_importer.py b/utils/vcf_importer.py index 02cae86..ce2c180 100644 --- a/utils/vcf_importer.py +++ b/utils/vcf_importer.py @@ -293,6 +293,7 @@ def parallelGen(config_file, inputFileList, outputDir, callset_file=None, loader pool = Pool() failed = list() + for returncode in pool.imap_unordered(poolImportVCF, function_args): if returncode[0] == -1: failed.append(returncode[1]) From 810b3cbcbc4dd87f50f29c222ca72057632aea6b Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Thu, 15 Jun 2017 15:21:33 -0700 Subject: [PATCH 5/7] changes for field_combine_op column --- 
metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py | 3 ++- metadb/api/dbimport.py | 2 ++ metadb/models/field.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py index 02fbf8e..21317bc 100644 --- a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py +++ b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py @@ -37,7 +37,8 @@ def upgrade(): sa.Column('is_format', sa.Boolean, nullable=False), sa.Column('is_info', sa.Boolean, nullable=False), sa.Column('length_type', sa.Enum('A', 'R', 'G', 'VAR', 'NUM', name='length_enum')), - sa.Column('length_intval', sa.Integer) + sa.Column('length_intval', sa.Integer), + sa.Column('field_combine_op', sa.Enum('sum', 'mean', 'median', 'move_to_FORMAT', 'element_wise_sum', 'concatenate', name='field_combine_optype')) ) op.create_unique_constraint('unique_name_per_field_set_constraint', 'field', ['field_set_id', 'field']) op.add_column(u'db_array', sa.Column('field_set_id', sa.BigInteger, sa.ForeignKey('field_set.id'), nullable=False)) diff --git a/metadb/api/dbimport.py b/metadb/api/dbimport.py index 47a807a..e8441c8 100644 --- a/metadb/api/dbimport.py +++ b/metadb/api/dbimport.py @@ -211,6 +211,8 @@ def registerField(self, guid, field_set_id, field_name, field_item_list): else: field.length_type = 'NUM' field.length_intval = number + if ('VCF_field_combine_operation' in field_item.__dict__.keys()): + field.field_combine_op = field_item.VCF_field_combine_operation self.session.add(field) self.session.commit() diff --git a/metadb/models/field.py b/metadb/models/field.py index 8faf509..eab1952 100644 --- a/metadb/models/field.py +++ b/metadb/models/field.py @@ -41,3 +41,4 @@ class Field(_Base): is_info = sa.Column(sa.Boolean, nullable=False) length_type = sa.Column(sa.Enum('A', 'R', 'G', 'VAR', 'NUM', name='length_enum')) length_intval = sa.Column(sa.Integer) + field_combine_op = sa.Column(sa.Enum('sum', 'mean', 'median', 'move_to_FORMAT', 'element_wise_sum', 'concatenate', name='field_combine_optype')) From cd837a457ed94ac1b86cc6e207fa8bab6129b3c1 Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Mon, 19 Jun 2017 14:40:23 -0700 Subject: [PATCH 6/7] suggestions in code review --- .../4f93dc7aa8e8_create_field_set_table.py | 2 +- metadb/api/dbimport.py | 26 ++++++++++--------- metadb/models/field.py | 2 +- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py index 21317bc..eeecfb7 100644 --- a/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py +++ b/metadb/alembic/versions/4f93dc7aa8e8_create_field_set_table.py @@ -37,7 +37,7 @@ def upgrade(): sa.Column('is_format', sa.Boolean, nullable=False), sa.Column('is_info', sa.Boolean, nullable=False), sa.Column('length_type', sa.Enum('A', 'R', 'G', 'VAR', 'NUM', name='length_enum')), - sa.Column('length_intval', sa.Integer), + sa.Column('length_intval', sa.Integer, default=0, server_default=sa.text('1')), sa.Column('field_combine_op', sa.Enum('sum', 'mean', 'median', 'move_to_FORMAT', 'element_wise_sum', 'concatenate', name='field_combine_optype')) ) op.create_unique_constraint('unique_name_per_field_set_constraint', 'field', ['field_set_id', 'field']) diff --git a/metadb/api/dbimport.py b/metadb/api/dbimport.py index e8441c8..16c08b7 100644 --- a/metadb/api/dbimport.py +++ 
b/metadb/api/dbimport.py @@ -137,27 +137,29 @@ def registerReference(self, guid, reference_set_id, name, length): return reference - def registerFieldSet(self, guid, reader, description=None): + def registerFieldSet(self, guid, reader=None, description=None): fieldSet = self.session.query(FieldSet).filter(FieldSet.guid == guid).first() + if (fieldSet is not None): + return fieldSet fieldMap = {} - if fieldSet is None: - try: - fieldSet = FieldSet(guid=guid, description=description) - self.session.add(fieldSet) - self.session.commit() + try: + fieldSet = FieldSet(guid=guid, description=description) + self.session.add(fieldSet) + self.session.commit() - except exc.DataError as e: - self.session.rollback() - raise ValueError("{0} : {1} ".format(str(e), guid)) + except exc.DataError as e: + self.session.rollback() + raise ValueError("{0} : {1} ".format(str(e), guid)) - if ((reader.filters != None) and (len(reader.filters) > 0)): + if (reader is not None): + if ((reader.filters is not None) and (len(reader.filters) > 0)): for filter_name, filter_item in reader.filters.items(): item_list = [] item_list.append(filter_item) fieldMap[filter_name] = item_list - if ((reader.formats != None) and (len(reader.formats) > 0)): + if ((reader.formats is not None) and (len(reader.formats) > 0)): for format_name, format_item in reader.formats.items(): if (format_name in fieldMap): fieldMap[format_name].append(format_item) @@ -166,7 +168,7 @@ def registerFieldSet(self, guid, reader, description=None): item_list.append(format_item) fieldMap[format_name] = item_list - if ((reader.infos != None) and (len(reader.infos) > 0)): + if ((reader.infos is not None) and (len(reader.infos) > 0)): for info_name, info_item in reader.infos.items(): if (info_name in fieldMap): fieldMap[info_name].append(info_item) diff --git a/metadb/models/field.py b/metadb/models/field.py index eab1952..df20a75 100644 --- a/metadb/models/field.py +++ b/metadb/models/field.py @@ -40,5 +40,5 @@ class Field(_Base): is_format = sa.Column(sa.Boolean, nullable=False) is_info = sa.Column(sa.Boolean, nullable=False) length_type = sa.Column(sa.Enum('A', 'R', 'G', 'VAR', 'NUM', name='length_enum')) - length_intval = sa.Column(sa.Integer) + length_intval = sa.Column(sa.Integer, default=1, server_default=sa.text('1')) field_combine_op = sa.Column(sa.Enum('sum', 'mean', 'median', 'move_to_FORMAT', 'element_wise_sum', 'concatenate', name='field_combine_optype')) From 705191680283198ddd50c72e0a5d906d902c5cb4 Mon Sep 17 00:00:00 2001 From: Ramesh Mantri Date: Wed, 21 Jun 2017 19:29:37 -0700 Subject: [PATCH 7/7] fixing travis build --- metadb/api/test/test_dbimport.py | 10 +++++----- utils/helper.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metadb/api/test/test_dbimport.py b/metadb/api/test/test_dbimport.py index 2387341..932ed31 100644 --- a/metadb/api/test/test_dbimport.py +++ b/metadb/api/test/test_dbimport.py @@ -292,20 +292,20 @@ def test_registerDBArray(self): # new array result = session.registerDBArray( - aguid, self.referenceset.id, self.workspace.id, self.array) + aguid, self.referenceset.id, self.fieldset.id, self.workspace.id, self.array) assert result.name == self.array assert result.guid == aguid # registered array reg_result = session.registerDBArray( - str(uuid.uuid4()), self.referenceset.id, self.workspace.id, self.array) + str(uuid.uuid4()), self.referenceset.id, self.fieldset.id, self.workspace.id, self.array) assert reg_result.name == self.array assert reg_result.guid == aguid # negative with 
pytest.raises(ValueError) as exec_info:
             neg_result = session.registerDBArray(
-                fguid, self.referenceset.id, self.workspace.id, "negative")
+                fguid, self.referenceset.id, self.fieldset.id, self.workspace.id, "negative")
         assert "DataError" in str(exec_info.value)
 
     def test_registerSample(self):
@@ -405,9 +405,9 @@ def setUpClass(self):
         self.workspace = session.registerWorkspace(
             str(uuid.uuid4()), "/test/dbimport/workspace")
         self.array = session.registerDBArray(
-            str(uuid.uuid4()), self.referenceset.id, self.workspace.id, "test")
+            str(uuid.uuid4()), self.referenceset.id, self.fieldset.id, self.workspace.id, "test")
         self.array2 = session.registerDBArray(
-            str(uuid.uuid4()), self.referenceset.id, self.workspace.id, "test2")
+            str(uuid.uuid4()), self.referenceset.id, self.fieldset.id, self.workspace.id, "test2")
         self.variantset = session.registerVariantSet(
             str(uuid.uuid4()), self.referenceset.id, "Dataset")
diff --git a/utils/helper.py b/utils/helper.py
index 981d0e2..96d5a37 100644
--- a/utils/helper.py
+++ b/utils/helper.py
@@ -150,7 +150,7 @@ def writeVIDMappingFile(DB_URI, reference_set_id, output_file, fields_dict=Const
     writeJSON2File(vid_mapping, output_file)
 
 
-def registerWithMetadb(config, reader, vcf=False):
+def registerWithMetadb(config, reader=None, vcf=False):
     """
     Registers parent object of a callset in metadb
     for both MAF and VCF importing.
     """
@@ -183,7 +183,7 @@ def registerWithMetadb(config, reader=None, vcf=False):
             rs = metadb.registerReferenceSet(
                 str(uuid.uuid4()),
                 assembly,
-                references=reader.contigs)
+                references=reader.contigs if reader is not None else None)
             fs = metadb.registerFieldSet(
                 str(uuid.uuid4()), reader, assembly)
             dba = metadb.registerDBArray(
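
Below is a minimal usage sketch (not part of the patches above) of the field-set registration flow this series introduces, assuming DBImport is importable from metadb.api the way utils/helper.py uses it; the database URI, workspace path, array name, assembly name, and VCF path are placeholders.

    import uuid
    import vcf  # PyVCF, as pinned in requirements.txt

    from metadb.api import DBImport  # assumed import path, mirroring utils/helper.py

    DB_URI = "postgresql://user:pass@localhost/metadb"      # placeholder URI
    with open("/path/to/sample.vcf", "rb") as vcf_stream:   # placeholder VCF path
        reader = vcf.Reader(vcf_stream)
        with DBImport(DB_URI).getSession() as metadb:
            # parent objects, as registerWithMetadb() does on a first import
            ws = metadb.registerWorkspace(str(uuid.uuid4()), "/test/workspace")
            rs = metadb.registerReferenceSet(str(uuid.uuid4()), "hg19",
                                             references=reader.contigs)
            # FILTER/FORMAT/INFO headers from the VCF become Field rows tied to one FieldSet
            fs = metadb.registerFieldSet(str(uuid.uuid4()), reader)
            dba = metadb.registerDBArray(guid=str(uuid.uuid4()),
                                         name="test_array",
                                         reference_set_id=rs.id,
                                         field_set_id=fs.id,
                                         workspace_id=ws.id)
            # on a later run the same objects can be looked up instead;
            # vs is None here because no variant set was registered in this sketch
            dba, vs, rs, fs = metadb.queryDBArray("/test/workspace", "test_array")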