Skip to content

Commit

Permalink
nt seqs dict using tripl lookup instead of fasta parser
Browse files Browse the repository at this point in the history
  • Loading branch information
Elias Harkins committed Sep 25, 2018
1 parent 028d10a commit 4ac5304
Showing 1 changed file with 5 additions and 10 deletions.
15 changes: 5 additions & 10 deletions bin/build_olmsted_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def get_args():
parser.add_argument('-s', '--sequences-out')
return parser.parse_args()


# Some generic data processing helpers

def comp(f, g):
Expand Down Expand Up @@ -74,7 +73,7 @@ def pull_datasets(t):
"cft.reconstruction:seqmeta": [{"tripl.csv:data": ["bio.seq:id", "cft.seq:cluster_multiplicity", "cft.seq:multiplicity"]}],
"cft.reconstruction:cluster_aa": [{"bio.seq:set": ["*"]}],
"cft.reconstruction:asr_tree": ["*"],
"cft.reconstruction:asr_seqs": ["*"],
"cft.reconstruction:asr_seqs": [{'bio.seq:set': ['bio.seq:id', 'bio.seq:seq']}],
"cft.reconstruction:cluster":
[
"db:ident",
Expand Down Expand Up @@ -157,12 +156,9 @@ def parse_tree_data(s, nt_seqs_dict, aa_seqs_dict, seqmeta_dict):
t = PhyloTree(s, format=1)
return create_node_records(t, nt_seqs_dict, aa_seqs_dict, seqmeta_dict)

def parse_fasta_string(fasta_string):
    """Parse FASTA-formatted text into a dict mapping header -> sequence.

    Each record starts at a '>' character. The header is everything up to
    the first newline of the record (used verbatim as the key), and the
    sequence is the remaining lines concatenated with line breaks removed.

    Args:
        fasta_string: the full contents of a FASTA file as one string.

    Returns:
        dict of {header: sequence}. Text before the first '>' is ignored.
        If a header occurs more than once, the last record wins.
    """
    seqs = {}
    # The chunk before the first '>' is not a record, hence the [1:].
    for record in fasta_string.split('>')[1:]:
        # Split on the first newline only, rather than substituting a ':'
        # marker and splitting on it: headers may legitimately contain ':'
        # and a record may consist of a header with no sequence lines.
        header, _, body = record.partition('\n')
        # Drop carriage returns too, so CRLF files do not leak '\r' into
        # ids or sequences.
        seqs[header.rstrip('\r')] = body.replace('\r', '').replace('\n', '')
    return seqs

def create_aa_seqs_dict(aa_seq_records):
def create_seqs_dict(seq_records):
d = dict()
for record in aa_seq_records:
for record in seq_records:
d[record["bio.seq:id"].pop()] = record["bio.seq:seq"].pop()
return d

Expand All @@ -177,10 +173,9 @@ def clean_clonal_family_record(d):
c = d.copy()
c['cft.reconstruction:cluster'] = c['cft.reconstruction:cluster'][0]
#c['cft.reconstruction:cluster']['cft.reconstruction:seqmeta'] = c['cft.reconstruction:seqmeta']
aa_seqs_dict = create_aa_seqs_dict(list(c['cft.reconstruction:cluster_aa'] )[0]['bio.seq:set'])
aa_seqs_dict = create_seqs_dict(list(c['cft.reconstruction:cluster_aa'])[0]['bio.seq:set'])
nt_seqs_dict = create_seqs_dict(list(c['cft.reconstruction:asr_seqs'])[0]['bio.seq:set'])
seqmeta_dict = create_seqmeta_dict(list(c['cft.reconstruction:seqmeta'])[0]['tripl.csv:data'])

nt_seqs_dict = parse_fasta_string(list(c['cft.reconstruction:asr_seqs'][0]['tripl.file:contents'])[0])
if(c['cft.reconstruction:asr_tree'][0].get('tripl.file:contents')):
c['cft.reconstruction:cluster']['cft.reconstruction:asr_tree'] = parse_tree_data(list(c['cft.reconstruction:asr_tree'][0]['tripl.file:contents'])[0], nt_seqs_dict, aa_seqs_dict, seqmeta_dict)
c = c['cft.reconstruction:cluster']
Expand Down

0 comments on commit 4ac5304

Please sign in to comment.