-
Notifications
You must be signed in to change notification settings - Fork 1
/
PrepareGenomes_OrthoMCL.py
78 lines (56 loc) · 2.83 KB
/
PrepareGenomes_OrthoMCL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/local/bin/python
#Created on 2/12/13
__author__ = 'Juan Ugalde'
def read_genome_list(input_file):
"""
Function used to read the genome list. The output is a dictionary with the genome list
(fasta_prefix -> new_prefix) and the total number of genomes in the list
"""
genome_count = 0
genome_info = {}
for line in open(input_file, 'r'):
line = line.rstrip()
element = line.split("\t")
genome_info[element[0]] = element[2]
genome_count += 1
return genome_info, genome_count
def modify_fasta(fasta_file, new_fasta_prefix, output_directory):
"""
The input is a fasta file. This will rename the file with the new ID,
and modify each ID in each entry of the fasta file
"""
from Bio import SeqIO # Import from tools to read fasta, from Biopython
genome_output = open(output_directory + "/" + new_fasta_prefix + "." + "fasta", "w")
#Rename the fasta
for record in SeqIO.parse(fasta_file, "fasta"):
genome_output.write(">" + new_fasta_prefix + "|" + record.id + "\n")
genome_output.write(str(record.seq) + "\n")
genome_output.close()
#For standalone use
if __name__ == '__main__':
###This script takes a folder with fasta files (DNA) and a list of the genomes in the following format:
###Prefix_Fasta_File\tGenome_Name\tNewID
###The idea is to replace the prefix on each fasta file, with the NewID, to be used for the OrthoMCL analysis
import argparse
import os
program_description = "This script prepares a set of fasta files for running OrthoMCL"
parser = argparse.ArgumentParser(description=program_description)
parser.add_argument("-l", "--genome_list_index", type=str,
help="File with the genome Taxoid, species name and the ID to replace", required=True)
parser.add_argument("-f", "--fasta_directory", type=str,
help="Folder with the fasta files (downloaded from JGI)", required=True)
parser.add_argument("-e", "--file_extension", type=str, help="Extension of the files (usually faa or fna)",
required=True)
parser.add_argument("-o", "--output_directory", type=str,
help="Output folder for the modified fasta files", required=True)
args = parser.parse_args()
#Create the output directory
if not os.path.exists(args.output_directory):
os.makedirs(args.output_directory)
#Read the genome list, and create the dictionary
genome_dictionary, total_genome_count = read_genome_list(args.genome_list_index)
for genome in genome_dictionary:
fasta_file = args.fasta_directory + "/" + genome + "." + args.file_extension
modify_fasta(fasta_file, genome_dictionary.get(genome), args.output_directory)
#Print output
print "A total of %d genomes were renamed" % total_genome_count