-
Notifications
You must be signed in to change notification settings - Fork 1
/
metaG_mix.py
executable file
·186 lines (148 loc) · 7.02 KB
/
metaG_mix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/usr/bin/env python3
import sys
import argparse
import os
import random
import shutil
import reads_generator
def parse_arguments():
# Parse the command line
parser = argparse.ArgumentParser(description="Use a bunch of fasta file to create a metagenomic synthetic sample based on a lognorm distribution.")
parser.add_argument('--genome_directory', '-d', required=True, help="A directory containing fasta files that will be used for the sample creation.")
parser.add_argument('--num_genomes', '-n', type=int, required=True, help="Number of genomes that must be includded")
parser.add_argument('--threads', '-t', type=int, default=4, help="The number of thred used to compute reads.")
parser.add_argument('--coverage', '-c', type=int, required=True, help="Average coverage.")
parser.add_argument('--outdir', '-o', default='.', help="Output directory")
parser.add_argument('--seed', '-s', type=int, help="The random seed (use it only for reproducibility purpose).")
parser.add_argument('--length_distribution', '-l', help="Read length distribution")
parser.add_argument('--only_distribution', '-od', action='store_true', help="Stop the execution after creating a distribution file.")
args = parser.parse_args()
# Verification of directory correctness
path = args.genome_directory
if not os.path.isdir(path):
print(f"{path} is not a directory", file=sys.stderr)
exit(1)
if path[-1] == "/":
args.genome_directory = path[:-1]
path = args.outdir
if not os.path.isdir(path):
os.mkdir(path)
if path[-1] == "/":
args.outdir = path[:-1]
# Conflicting parameters
if (not args.only_distribution) and (not args.length_distribution):
print("A csv length distribution must be provided", file=sys.stderr)
exit(1)
# Fix random seed
if args.seed:
random.seed(args.seed)
# print("TODO: add threading support", file=sys.stderr)
return args
def select_genomes(directory, n):
""" Select n genomes from the directory.
@args directory A directory containing fasta, fa or fna files.
@args n The number of genome to select
@except Raise an exception when not enought files (regarding n) are in directory.
@return A list of n filenames.
"""
# Select files from the directory, filetering out the wrong extentions
supported_ententions = ["fa", "fasta", "fna"]
files = [f for f in os.listdir(directory) if f.split('.')[-1] in supported_ententions]
# Sub-select n files
if len(files) < n:
raise Exception(f"Impossible to find {n} distinct genomes in {directory}.")
return []
return random.sample(files, n)
def create_mix(files):
""" Compute a relative abundance of each genome using a log-norm distribution.
The distribution is centered on 1 with a standard deviation of 2 (from CAMISIM).
A relative abundance of 1 significate that the genome associated must have
a coverage equivalent to the average coverage.
@args files All the files to compute abundances.
@return A dictionnary associating each filename to a relative abundance.
"""
relative_dist = {}
for f in files:
relative_dist[f] = random.lognormvariate(1, 1.5)
# Normalise to center on 1
vals = relative_dist.values()
avg = sum(vals) / len(vals)
relative_dist = {k:v/avg for k, v in relative_dist.items()}
return relative_dist
# def generate(filename, len_generator, depth, paired=False, circular=False, subsample_ratio=1, outprefix=None)
from datetime import datetime
def create_out_tree(outdir):
# Create output directory tree
sample_dir = outdir + '/' + str(datetime.now()).replace(' ', '_').replace(':', '_')
os.mkdir(sample_dir)
shutil.move(outdir + '/abundances.csv', sample_dir + '/abundances.csv')
out_gen_dir = sample_dir + '/genomes'
os.mkdir(out_gen_dir)
reads_dir = sample_dir + '/reads'
os.mkdir(reads_dir)
return sample_dir
from multiprocessing import Pool
# Function only needed for parallelism
def genome_reads_compute(args):
genome, abundance, gen_dir, len_dist, sample_dir = args
# Copy the complete genome
shutil.copy(f"{gen_dir}/{genome}", f"{sample_dir}/genomes/{genome}")
# Generate reads
rnd_gens = reads_generator.init_random_generators(len_dist)
reads_generator.generate(
f"{sample_dir}/genomes/{genome}",
rnd_gens,
abundance,
paired=False,
subsample_ratio=0.5,
outprefix=f"{sample_dir}/reads/{'.'.join(genome.split('.')[:-1])}"
)
def create_sequences_parallel(abundances, gen_dir, len_dist, sample_dir, threads=4):
""" Create one read file for each selected genome regarding the abundance in the sample.
The used genomes will be copied into a genome folder into the sample directory.
The reads will be copied into a reads folder.
@args abundances A dictionary associating genomes to abundances
@args gen_dir The directory containing the genome fasta files
@args len_dist A read length distribution must be provided. See reads_generator for more details.
@args sample_dir The directory where everything will be outputed.
"""
def gen_args():
for genome, abundance in abundances.items():
yield (genome, abundance, gen_dir, len_dist, sample_dir)
with Pool(processes=threads)as pool:
pool.map(genome_reads_compute, gen_args())
def pull_files(files, sample_dir):
""" Take all the content of each file from files and pull them together.
TODO: The reads must be shuffled.
@args files the list of file to concatenate
@args sample_dir the directory of the sample to pull
"""
with open(f"{sample_dir}/pulled_reads.fa", "w") as pulled:
for f in files:
core_name = ".".join(f.split(".")[:-1])
with open(f"{sample_dir}/reads/{core_name}.fasta") as fr:
for line in fr:
pulled.write(line)
def main():
args = parse_arguments()
print("1 - Selecting genomes", file=sys.stderr)
files = select_genomes(args.genome_directory, args.num_genomes)
print("2 - Generate abundances", file=sys.stderr)
# Compute relative distribution
relative_abundances = create_mix(files)
abundances = {k:v*args.coverage for k,v in relative_abundances.items()}
# Save distribution
with open(f"{args.outdir}/abundances.csv", "w") as ab_fw:
ab_fw.write("genome,abundance\n")
for gen, ab in abundances.items():
ab_fw.write(f"{gen},{ab}\n")
if args.only_distribution:
return
print("3 - Create reads", file=sys.stderr)
sample_dir = create_out_tree(args.outdir)
create_sequences_parallel(abundances, args.genome_directory, args.length_distribution, sample_dir, threads=args.threads)
print("4 - Pull reads together", file=sys.stderr)
pull_files(files, sample_dir)
print("TODO: Shuffle the concatenate reads", file=sys.stderr)
if __name__ == "__main__":
main()