diff --git a/bin/build_olmsted_data.py b/bin/build_olmsted_data.py index ed01766..265d931 100755 --- a/bin/build_olmsted_data.py +++ b/bin/build_olmsted_data.py @@ -15,7 +15,6 @@ def get_args(): parser.add_argument('-s', '--sequences-out') return parser.parse_args() - # Some generic data processing helpers helpers def comp(f, g): @@ -74,7 +73,7 @@ def pull_datasets(t): "cft.reconstruction:seqmeta": [{"tripl.csv:data": ["bio.seq:id", "cft.seq:cluster_multiplicity", "cft.seq:multiplicity"]}], "cft.reconstruction:cluster_aa": [{"bio.seq:set": ["*"]}], "cft.reconstruction:asr_tree": ["*"], - "cft.reconstruction:asr_seqs": ["*"], + "cft.reconstruction:asr_seqs": [{'bio.seq:set': ['bio.seq:id', 'bio.seq:seq']}], "cft.reconstruction:cluster": [ "db:ident", @@ -157,12 +156,9 @@ def parse_tree_data(s, nt_seqs_dict, aa_seqs_dict, seqmeta_dict): t = PhyloTree(s, format=1) return create_node_records(t, nt_seqs_dict, aa_seqs_dict, seqmeta_dict) -def parse_fasta_string(fasta_string): - return dict([tuple(s.replace('\n', ':', 1).replace('\n','').split(':')) for s in fasta_string.split('>')[1:]]) - -def create_aa_seqs_dict(aa_seq_records): +def create_seqs_dict(seq_records): d = dict() - for record in aa_seq_records: + for record in seq_records: d[record["bio.seq:id"].pop()] = record["bio.seq:seq"].pop() return d @@ -177,10 +173,9 @@ def clean_clonal_family_record(d): c = d.copy() c['cft.reconstruction:cluster'] = c['cft.reconstruction:cluster'][0] #c['cft.reconstruction:cluster']['cft.reconstruction:seqmeta'] = c['cft.reconstruction:seqmeta'] - aa_seqs_dict = create_aa_seqs_dict(list(c['cft.reconstruction:cluster_aa'] )[0]['bio.seq:set']) + aa_seqs_dict = create_seqs_dict(list(c['cft.reconstruction:cluster_aa'])[0]['bio.seq:set']) + nt_seqs_dict = create_seqs_dict(list(c['cft.reconstruction:asr_seqs'])[0]['bio.seq:set']) seqmeta_dict = create_seqmeta_dict(list(c['cft.reconstruction:seqmeta'])[0]['tripl.csv:data']) - - nt_seqs_dict = parse_fasta_string(list(c['cft.reconstruction:asr_seqs'][0]['tripl.file:contents'])[0]) if(c['cft.reconstruction:asr_tree'][0].get('tripl.file:contents')): c['cft.reconstruction:cluster']['cft.reconstruction:asr_tree'] = parse_tree_data(list(c['cft.reconstruction:asr_tree'][0]['tripl.file:contents'])[0], nt_seqs_dict, aa_seqs_dict, seqmeta_dict) c = c['cft.reconstruction:cluster']