Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Editing datascript for Olmsted#17, Olmsted#11, scons for Olmsted#13 #249

Merged
merged 9 commits into from
Sep 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion SConstruct
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,8 @@ def add_cluster_analysis(w):
# orig/new names, joined on sequence from the other file
sources = {'--partis-seqmeta': c['partis_seqmeta'],
'--cluster-mapping': c['cluster_mapping'] if c['reconstruction']['prune_strategy'] == 'min_adcl' else None,
}
'--pruned-ids': c['pruned_ids'] if c['reconstruction']['prune_strategy'] == 'seed_lineage' else None,
}
sources = {k: v for k, v in sources.items() if v}
base_call = 'aggregate_minadcl_cluster_multiplicities.py '
for i, (k, v) in enumerate(sources.items()):
Expand Down
13 changes: 12 additions & 1 deletion bin/aggregate_minadcl_cluster_multiplicities.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import collections
#import itertools


def filter_by_ids(results, ids):
    """Keep only the rows of ``results`` whose 'unique_id' is in ``ids``.

    Args:
        results: iterable of mappings, each with a 'unique_id' key.
        ids: container supporting ``in``; the ids to keep.

    Returns:
        A new list holding the matching rows, in their original order.
    """
    kept = []
    for row in results:
        if row['unique_id'] in ids:
            kept.append(row)
    return kept

def aggregate_clusters(merge_results, cluster_mapping):
merge_results = {row['sequence']: row for row in merge_results}
Expand Down Expand Up @@ -69,11 +71,18 @@ def seqmeta_reader(filename):
'duplicates': d['duplicates'].split(':')})
for d in data]

def pruned_ids_reader(filename):
    """Read the ids listed in ``filename``, one per line, into a set.

    Trailing whitespace (including the newline) is stripped from every line.
    Lines that are empty after stripping are skipped, so a stray blank line
    does not add '' as an id and turn an otherwise empty id set truthy.

    Args:
        filename: path of the text file to read.

    Returns:
        A set of id strings; empty when the file lists no ids.
    """
    with open(filename) as f:
        stripped_lines = (line.rstrip() for line in f)
        # Build the set while the file is still open; the generator is lazy.
        return {seq_id for seq_id in stripped_lines if seq_id}

def get_args():
    """Parse this script's command-line arguments.

    Each optional flag is registered with a reader function as its argparse
    ``type``, so argparse calls that reader on the supplied string and the
    attribute holds the reader's return value rather than the raw string.
    The positional ``output`` is opened for writing by argparse.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    parser = argparse.ArgumentParser()
    # (flag, reader) pairs, registered in this order.
    reader_options = (
        ('--cluster-mapping', cluster_reader),
        ('--partis-seqmeta', seqmeta_reader),
        ('--pruned-ids', pruned_ids_reader),
    )
    for flag, reader in reader_options:
        parser.add_argument(flag, type=reader)
    parser.add_argument('output', type=argparse.FileType('w'))
    return parser.parse_args()
Expand All @@ -91,6 +100,8 @@ def main():
results = args.partis_seqmeta
if args.cluster_mapping:
results = aggregate_clusters(results, args.cluster_mapping)
if args.pruned_ids:
results = filter_by_ids(results, args.pruned_ids)
out_writer.writerows(format_results(results))
args.output.close()

Expand Down
42 changes: 32 additions & 10 deletions bin/build_olmsted_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ def get_args():
parser.add_argument('-s', '--sequences-out')
return parser.parse_args()


# Some generic data processing helpers

def comp(f, g):
Expand Down Expand Up @@ -71,9 +70,10 @@ def pull_datasets(t):

clonal_family_pull_pattern = [
{
#"cft.reconstruction:seqmeta": [{"tripl.csv:data": ["*"]}],
"cft.reconstruction:seqmeta": [{"tripl.csv:data": ["bio.seq:id", "cft.seq:cluster_multiplicity", "cft.seq:multiplicity"]}],
"cft.reconstruction:cluster_aa": [{"bio.seq:set": ["*"]}],
"cft.reconstruction:asr_tree": ["*"],
"cft.reconstruction:asr_seqs": [{'bio.seq:set': ['bio.seq:id', 'bio.seq:seq']}],
"cft.reconstruction:cluster":
[
"db:ident",
Expand Down Expand Up @@ -110,11 +110,20 @@ def pull_datasets(t):
}
]

def create_node_records(tree):
def create_node_records(tree, nt_seqs_dict, aa_seqs_dict, seqmeta_dict):
records = []
leaves_counter = 1
for n in tree.traverse('postorder'):
n.label = n.id = n.name
n.nt_seq = nt_seqs_dict[n.name]
n.aa_seq = aa_seqs_dict[n.name]
mult = None
cluster_mult = None
if n.name in seqmeta_dict.keys():
mult = seqmeta_dict[n.name]["cft.seq:multiplicity"].pop()
clust_mult = seqmeta_dict[n.name]["cft.seq:cluster_multiplicity"].pop()
n.multiplicity = int(mult) if mult else mult
n.cluster_multiplicity = int(clust_mult) if clust_mult else clust_mult
n.type = "node"
if n.is_leaf():
# get height for leaves
Expand All @@ -139,22 +148,36 @@ def create_node_records(tree):
n.parent = None
n.length = 0.0
n.distance = 0.0
records.append({'id': n.id, 'label': n.label, 'type': n.type, 'parent': n.parent, 'length': n.length, 'distance': n.distance, 'height': n.height})
records.append({'id': n.id, 'label': n.label, 'type': n.type, 'parent': n.parent, 'length': n.length, 'distance': n.distance, 'height': n.height, 'nt_seq': n.nt_seq, 'aa_seq': n.aa_seq, 'multiplicity': n.multiplicity, 'cluster_multiplicity': n.cluster_multiplicity})

return records

def parse_tree_data(s):
def parse_tree_data(s, nt_seqs_dict, aa_seqs_dict, seqmeta_dict):
t = PhyloTree(s, format=1)
return create_node_records(t)
return create_node_records(t, nt_seqs_dict, aa_seqs_dict, seqmeta_dict)

def create_seqs_dict(seq_records):
    """Map each record's sequence id to its sequence.

    NOTE: both values are taken with ``.pop()``, so the "bio.seq:id" and
    "bio.seq:seq" containers of every input record are mutated (one element
    is removed from each).

    Args:
        seq_records: iterable of mappings with "bio.seq:id" and
            "bio.seq:seq" entries, each a container supporting ``.pop()``.

    Returns:
        dict from popped id to popped sequence; a later record with the
        same id overwrites an earlier one.
    """
    seqs_by_id = {}
    for rec in seq_records:
        # Pop the sequence before the id, matching the evaluation order of
        # a subscript assignment (right-hand side first).
        sequence = rec["bio.seq:seq"].pop()
        seqs_by_id[rec["bio.seq:id"].pop()] = sequence
    return seqs_by_id

def create_seqmeta_dict(seqmeta_records):
    """Index sequence-metadata records by their sequence id.

    NOTE: the id is taken with ``.pop()``, so each record's "bio.seq:id"
    container is mutated; the record stored in the result is that same
    (now modified) object, not a copy.

    Args:
        seqmeta_records: iterable of mappings with a "bio.seq:id" entry
            that is a container supporting ``.pop()``.

    Returns:
        dict from popped id to its record; a later record with the same id
        overwrites an earlier one.
    """
    return {rec["bio.seq:id"].pop(): rec for rec in seqmeta_records}

def clean_clonal_family_record(d):
c = d.copy()
c['cft.reconstruction:cluster'] = c['cft.reconstruction:cluster'][0]
#c['cft.reconstruction:cluster']['cft.reconstruction:asr_tree'] = c['cft.reconstruction:asr_tree'][0]
#c['cft.reconstruction:cluster']['cft.reconstruction:seqmeta'] = c['cft.reconstruction:seqmeta']
c['cft.reconstruction:cluster']['cft.reconstruction:cluster_aa'] = list(c['cft.reconstruction:cluster_aa'] )[0]['bio.seq:set']
aa_seqs_dict = create_seqs_dict(list(c['cft.reconstruction:cluster_aa'])[0]['bio.seq:set'])
nt_seqs_dict = create_seqs_dict(list(c['cft.reconstruction:asr_seqs'])[0]['bio.seq:set'])
seqmeta_dict = create_seqmeta_dict(list(c['cft.reconstruction:seqmeta'])[0]['tripl.csv:data'])
if(c['cft.reconstruction:asr_tree'][0].get('tripl.file:contents')):
c['cft.reconstruction:cluster']['cft.reconstruction:asr_tree'] = parse_tree_data(list(c['cft.reconstruction:asr_tree'][0]['tripl.file:contents'])[0])
c['cft.reconstruction:cluster']['cft.reconstruction:asr_tree'] = parse_tree_data(list(c['cft.reconstruction:asr_tree'][0]['tripl.file:contents'])[0], nt_seqs_dict, aa_seqs_dict, seqmeta_dict)
c = c['cft.reconstruction:cluster']
try:
del c['cft.cluster:unique_ids']
Expand All @@ -168,7 +191,6 @@ def pull_clonal_families(t):
#result[0]['cft.reconstruction:cluster']['cft.reconstruction:asr_tree'] = parse_tree_data(list(result['cft.reconstruction:cluster']['cft.reconstruction:asr_tree']['tripl.file:contents'])[0])
return result


def write_out(data, filename, args):
with open(filename, 'w') as fh:
if args.csv:
Expand Down
2 changes: 1 addition & 1 deletion site_scons/software_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def tripl_version():
'muscle': 'muscle -version',
'seqmagick': 'seqmagick --version',
'FastTree': None,
'prank': 'prank -v',
#'prank': 'prank -v',
'tripl': tripl_version,
#'nestly': lambda: nestly.__version__,
'ete3': lambda: ete3.__version__,
Expand Down