Skip to content

Commit

Permalink
use pandas instead of pysam to read vcfs
Browse files Browse the repository at this point in the history
resolves: #265
  • Loading branch information
creisle committed Dec 29, 2021
1 parent f143720 commit 6af3454
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ junit
.tox
*eggs/
.mypy_cache
.snakemake

# aligners
blat
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def check_nonpython_dependencies():
'networkx==1.11.0',
'numpy>=1.13.1',
'pandas>=1.1, <2',
'pysam>=0.9, <=0.15.2',
'pysam',
'shortuuid>=0.5.0',
'svgwrite',
'mavis_config>=1.1.0, <2.0.0',
Expand Down
143 changes: 129 additions & 14 deletions src/mavis/tools/vcf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,56 @@
import logging
import re
from typing import Dict, List, Tuple
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd
from pysam import VariantFile
from typing_extensions import TypedDict

from ..constants import COLUMNS, ORIENT, SVTYPE
from ..util import DEVNULL
from .constants import SUPPORTED_TOOL

PANDAS_DEFAULT_NA_VALUES = [
'-1.#IND',
'1.#QNAN',
'1.#IND',
'-1.#QNAN',
'#N/A',
'N/A',
'NA',
'#NA',
'NULL',
'NaN',
'-NaN',
'nan',
'-nan',
]


class VcfInfoType(TypedDict, total=False):
SVTYPE: str
CHR2: str
CIPOS: Tuple[int, int]
CIEND: Tuple[int, int]
CT: str
END: Optional[int]
PRECISE: bool


@dataclass
class VcfRecordType:
id: str
pos: int
chrom: str
alts: List[Optional[str]]
info: VcfInfoType
ref: str

@property
def stop(self) -> Optional[int]:
return self.info.get('END', self.pos)


def parse_bnd_alt(alt: str) -> Tuple[str, int, str, str, str, str]:
"""
Expand Down Expand Up @@ -88,6 +132,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]:
- duplication: 5to3
"""
records = []

for alt in record.alts if record.alts else [None]:
info = {}
for key in record.info.keys():
Expand All @@ -106,7 +151,7 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]:
if record.id and record.id != 'N': # to account for NovoBreak N in the ID field
std_row['id'] = record.id

if info.get('SVTYPE', None) == 'BND':
if info.get('SVTYPE') == 'BND':
chr2, end, orient1, orient2, ref, alt = parse_bnd_alt(alt)
std_row[COLUMNS.break1_orientation] = orient1
std_row[COLUMNS.break2_orientation] = orient2
Expand Down Expand Up @@ -172,6 +217,82 @@ def convert_record(record, record_mapping={}, log=DEVNULL) -> List[Dict]:
return records


def convert_pandas_rows_to_variants(df):
def parse_info(info_field):
info = {}
for pair in info_field.split(';'):
if '=' in pair:
key, value = pair.split('=', 1)
info[key] = value
else:
info[pair] = True

# convert info types
for key in info:
if key in {'CIPOS', 'CIEND'}:
ci_start, ci_end = info[key].split(',')
info[key] = (int(ci_start), int(ci_end))
elif key == 'END':
info[key] = int(info[key])

return info

df['info'] = df['INFO'].apply(parse_info)
df['alts'] = df['ALT'].apply(lambda a: a.split(','))

rows = []
for _, row in df.iterrows():

rows.append(
VcfRecordType(
id=row['ID'],
pos=row['POS'],
info=VcfInfoType(row['info']),
chrom=row['CHROM'],
ref=row['REF'],
alts=row['alts'],
)
)
return rows


def pandas_vcf(input_file) -> Tuple[List[str], pd.DataFrame]:
"""
Read a standard vcf file into a pandas dataframe
"""
# read the comment/header information
header_lines = []
with open(input_file, 'r') as fh:
line = '##'
while line.startswith('##'):
header_lines.append(line)
line = fh.readline().strip()
header_lines = header_lines[1:]
# read the data
df = pd.read_csv(
input_file,
sep='\t',
skiprows=len(header_lines),
dtype={
'CHROM': str,
'POS': int,
'ID': str,
'INFO': str,
'FORMAT': str,
'REF': str,
'ALT': str,
},
na_values=PANDAS_DEFAULT_NA_VALUES + ['.'],
)
df = df.rename(columns={df.columns[0]: df.columns[0].replace('#', '')})
required_columns = ['CHROM', 'INFO', 'POS', 'REF', 'ALT', 'ID']
for col in required_columns:
if col not in df.columns:
raise KeyError(f'Missing required column: {col}')
# convert the format fields using the header
return header_lines, df


def convert_file(input_file: str, file_type: str, log):
"""process a VCF file
Expand All @@ -183,18 +304,12 @@ def convert_file(input_file: str, file_type: str, log):
err: [description]
"""
rows = []
vfile = VariantFile(input_file)
try:
vfile.header.info.add('END', number=1, type='Integer', description='End of the interval')
except ValueError:
pass

for vcf_record in vfile.fetch():
_, data = pandas_vcf(input_file)

for variant_record in convert_pandas_rows_to_variants(data):
try:
rows.extend(convert_record(vcf_record, log=log))
except Exception as err:
if file_type != SUPPORTED_TOOL.STRELKA:
raise err
else:
log('Ignoring', vcf_record)
rows.extend(convert_record(variant_record, log=log))
except NotImplementedError as err:
logging.warning(str(err))
return rows
4 changes: 2 additions & 2 deletions tests/data/manta_events.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -133,5 +133,5 @@
7 126098487 MantaINV:4:28281:28286:0:0:0 T <INV> . PASS END=126167443;SVTYPE=INV;SVLEN=68956;CIPOS=0,3;CIEND=-3,0;HOMLEN=3;HOMSEQ=ATG;INV5;SOMATIC;SOMATICSCORE=120 PR:SR 42,0:48,0 95,45:104,36
9 28031861 MantaINV:162252:2:3:0:0:0 A <INV> . PASS END=28034467;SVTYPE=INV;SVLEN=2606;CIPOS=0,1;CIEND=-1,0;HOMLEN=1;HOMSEQ=C;SVINSLEN=11;SVINSSEQ=TTTTCGGAATT;INV5;SOMATIC;SOMATICSCORE=104 PR:SR 45,0:42,0 41,19:26,19
X 31196943 MantaDEL:290420:0:1:0:0:0 A <DEL> . PASS END=31216210;SVTYPE=DEL;SVLEN=-19267;SVINSLEN=8;SVINSSEQ=ATGTAGTG;SOMATIC;SOMATICSCORE=124 PR:SR 35,0:25,0 43,32:32,31
1 17051724 MantaBND:207:0:1:0:0:0:0 C [1:234912188[GCCCCATC 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:1;SVINSLEN=7;SVINSSEQ=GCCCCAT;BND_DEPTH=5;MATE_BND_DEPTH=4 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . .
1 234912188 MantaBND:207:0:1:0:0:0:1 A [1:17051724[ATGGGGCA 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:0;SVINSLEN=7;SVINSSEQ=ATGGGGC;BND_DEPTH=4;MATE_BND_DEPTH=5 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . .
1 17051724 MantaBND:207:0:1:0:0:0:0 C [1:234912188[GCCCCATC 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:1;SVINSLEN=7;SVINSSEQ=GCCCCAT;BND_DEPTH=5;MATE_BND_DEPTH=4 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . .
1 234912188 MantaBND:207:0:1:0:0:0:1 A [1:17051724[ATGGGGCA 36 PASS SVTYPE=BND;MATEID=MantaBND:207:0:1:0:0:0:0;SVINSLEN=7;SVINSSEQ=ATGGGGC;BND_DEPTH=4;MATE_BND_DEPTH=5 GT:FT:GQ:PL:PR:SR 0/1:PASS:30:86,0,28:1,2:3,1 . . .
9 changes: 9 additions & 0 deletions tests/unit/test_tools_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from mavis.tools.vcf import pandas_vcf

from ..util import get_data


def test_read_vcf():
header, df = pandas_vcf(get_data('delly_events.vcf'))
assert len(header) == 63
assert df.shape[0] == 31

0 comments on commit 6af3454

Please sign in to comment.