# genbank_to_fasta.py

#! /usr/bin/env python
#This work is copyright Cedar McKay and Gabrielle Rocap, University of Washington.
#This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. 
#To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send a 
#letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.


#To install, move this file to your bin directory, adjust the above path to python, and install
#Biopython.


import sys
import os.path
from optparse import OptionParser
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import InsdcIO


#### Collect Input ####
#######################
#Describe the command line interface with optparse and parse the arguments the
#script was started with. The parsed values end up as attributes of 'options'
#(named by each option's 'dest'); positional arguments are collected in 'args'.
usage = """Takes a GenBank or EMBL format file as input, and outputs a fasta file.
usage: %prog -i FILE [options]"""


parser = OptionParser(usage=usage, version="%prog 1.1")


#Input and output files
parser.add_option("-i", "--in_file", metavar="FILE", dest="in_file", default=None,
	help="Specify the input FILE that you wish to convert")

parser.add_option("-m", "--file_format", metavar="FORMAT", dest="file_format", default='genbank',
	help="Specify the input file format. Specify 'genbank' or 'embl'. "
	"Default is genbank.")

parser.add_option("-o", "--out_file", metavar="FILE", dest="out_file", default=None,
	help="Specify the path and name of the output fasta file you wish to create. "
	"Default will be the same as the in_file, but with a 'fasta' suffix.")

#What to extract
parser.add_option("-s", "--sequence_type", dest="sequence_type", default="aa",
	help="Specify the kind of sequence you would like to extract. Options are 'aa' "
	"(feature amino acids), 'nt' (feature nucleotides), 'whole' (the entire "
	"sequence, not just sequence corresponding to features) and 'taa' (amino acids "
	"translated on the fly, which generates amino acid sequence by translating the "
	"nucleotide sequence rather than extracting from the feature table). "
	"Default is 'aa'.")

parser.add_option("-f", "--feature_type", dest="feature_type", default="CDS",
	help="Specify the type of feature that you would like to extract. This option "
	"accepts arbitrary text, and will fail if you input a non-existent feature name. "
	"Common options are 'CDS', 'rRNA', 'tRNA', or 'gene'. Default is 'CDS'.")

#How to build the fasta header line
parser.add_option("-d", "--delimiter", dest="delimiter", default="spacepipe",
	help="Specify the character you wish to use to separate header elements. Options "
	"are 'tab', 'space', 'spacepipe', 'pipe', 'dash', or 'underscore'. "
	"Default is 'spacepipe'.")

parser.add_option("-q", "--qualifiers", dest="qualifiers", default="locus_tag,gene,product,location",
	help="Specify which qualifiers should make up the fasta header line. Takes comma "
	"separated list. Will accept any qualifier that appears in your genbank "
	"file, (e.g. 'note', 'protein_id', etc). Qualifiers appear in the header line in "
	"the order you list them. Use 'location_long' for the exact location information as it "
	"appears in the input file. Default is 'locus_tag,gene,product,location'.")

#The option default stays None so that later code can tell whether the user set
#--annotations at all; the documented default of 'organism' is filled in after parsing.
parser.add_option("-a", "--annotations", dest="annotations", default=None,
	help="Specify which record annotation should make up the header line. Takes "
	"comma separated list. Will accept any annotation that appears in your genbank "
	"file, (e.g. 'comment', 'taxonomy', 'accessions', etc). Only used with "
	"--sequence_type = whole. Default is 'organism'.")

parser.add_option("-u", "--user_header", dest="user_header", default=None,
	help="If you prefer to specify your own completely custom header line, you may "
	"specify it here. Should be specified in single quotes. Only used with "
	"--sequence_type = whole.")

(options, args) = parser.parse_args()


#### Variables and Names ####
#############################
#Turn the parsed options into the module level names that the functions and the
#main loop read, and refuse option combinations that don't make sense. Problems
#are reported with parser.error(), which prints the usage and the message to
#stderr and exits with a non-zero status, so a calling script can tell that
#nothing was converted.

#Figure out some names and paths
if not options.in_file:
	parser.error("You must specify an in_file. Use '-h' for help.")
in_file = os.path.abspath(options.in_file)

(in_filePath, in_fileWholeName) = os.path.split(in_file)
(in_fileBase, in_fileExt) = os.path.splitext(in_fileWholeName)

#Figure out what our out_file is. A relative --out_file is joined onto the
#directory of the in_file; os.path.join leaves an absolute --out_file as it is.
if options.out_file:
	out_file = os.path.join(in_filePath, options.out_file)
else:
	out_file = os.path.join(in_filePath, in_fileBase + '.fasta')
out_file = os.path.abspath(out_file)

#Figure out what the user really wanted from delimiter. A name that is not in
#the table falls back to ' | ', the same as 'spacepipe'.
delimiter_choices = {
	'space': ' ',
	'spacepipe': ' | ',
	'pipe': '|',
	'dash': '-',
	'underscore': '_',
	'tab': '\t',
}
delimiter = delimiter_choices.get(options.delimiter, ' | ')

#Get the header_line user input, split on commas and turn into a list.
qualifier_list = options.qualifiers.split(',')
if options.annotations:
	annotation_list = options.annotations.split(',')
else:
	annotation_list = ['organism']

#Gather Remaining options
sequence_type = options.sequence_type
feature_type = options.feature_type
user_header = options.user_header
file_format = options.file_format


#Make sure no specified options conflict, or don't make sense
if user_header and sequence_type != 'whole':
	parser.error("It doesn't make sense to set the user_header unless you are using the 'whole' "
		"sequence_type. Use '-h' for help.")

if file_format not in ['genbank', 'embl']:
	parser.error("Must specify either 'genbank' or 'embl' format for the in_file. Use '-h' for help.")

if options.annotations and sequence_type != 'whole':
	parser.error("It doesn't make sense to set the annotations unless you are using the 'whole' "
		"sequence_type. Use '-h' for help.")

#Catch a bad sequence_type here, before the out_file is opened for writing.
if sequence_type not in ['aa', 'nt', 'taa', 'whole']:
	parser.error("Unrecognized sequence_type. Use '-h' for help.")

	
#### Functions ####
###################
def build_header(feature, qualifier_list):
	"""Build the fasta header line for one feature.

	feature: a feature of a parsed record. Its 'qualifiers' mapping is read for
		every ordinary item; 'location' and 'strand' are read for the special
		item 'location'.
	qualifier_list: the items that make up the header, in output order.

	Special items:
		'location' becomes '<start>:<end> <strand text>', with 1 added to the
			start position to make it human readable.
		'location_long' becomes whatever
			InsdcIO._insdc_feature_location_string() returns for the feature;
			the sequence length is taken from the module level 'record'.
		'gene', when the feature has no such qualifier, falls back to
			'locus_tag' if present and to 'db_xref' otherwise.
		'locus_tag', when the feature has no such qualifier, falls back to
			'db_xref'.

	Any other item is looked up in feature.qualifiers and its first value is
	used. An empty value is written as 'None', a missing qualifier as
	'missing_<item>_qualifer'.

	Returns the parts joined with the module level 'delimiter'.
	"""
	header = []
	for item in qualifier_list:
		#Start with the special cases
		if item == 'location':
			#Go through some pain to make location human readable by adding 1 to first position
			location = "%d:%s" % (int(feature.location.nofuzzy_start) + 1,
				feature.location.nofuzzy_end)
			if feature.strand == 1:
				location = location + ' Forward'
			elif feature.strand == -1:
				location = location + ' Reverse'
			else:
				location = location + ' Could not determine strand'
			header.append(location)
			continue
		if item == 'location_long':
			header.append(InsdcIO._insdc_feature_location_string(feature, len(record.seq)))
			continue
		qualifiers = feature.qualifiers
		if item not in qualifiers:
			if item == 'gene':
				if 'locus_tag' in qualifiers:
					item = 'locus_tag'
				else:
					item = 'db_xref'
			elif item == 'locus_tag':
				#This has to be an assignment: a comparison here would leave item
				#untouched and the db_xref fallback would never be used.
				item = 'db_xref'
		#Finished with the special cases, now just getting plain old qualifiers
		if item in qualifiers:
			header_part = qualifiers[item][0]
			#Catch improper newline character in the middle of features.
			header_part = header_part.replace("\n", " ")
			#Catch improper run of whitespace in the middle of a feature and replace it with
			#a single space.
			#NOTE(review): the whitespace run below is kept exactly as found in this file;
			#confirm it matches what the parser really leaves in qualifier values.
			header_part = header_part.replace("						 ", " ")
			header.append(header_part or 'None')
		else:
			header.append('missing_%s_qualifer' % item)
	return delimiter.join(header)


def _location_is_fuzzy(location):
	"""Tell whether either end of 'location' prints differently from its nofuzzy value."""
	return str(location.nofuzzy_start) != str(location.start) or \
		str(location.nofuzzy_end) != str(location.end)


def get_nt_seq(feature):
	"""Return the nucleotide sequence of 'feature', cut out of the module level 'record'.

	feature: a feature of the record currently being converted. Its
		'sub_features', 'location', 'strand', 'location_operator' and
		'qualifiers' attributes are read.

	A feature without sub_features is a single slice of record.seq. A feature
	with sub_features (exons) is built by joining the slice of every
	sub_feature in order; a sub_feature whose strand differs from the
	feature's strand is reverse complemented before it is added.

	Once the whole sequence has been assembled it is reverse complemented if
	feature.strand is -1. Any other strand value (1, or a strand that could
	not be determined) returns the sequence as read from the record.

	Warnings about fuzzy locations and about the 'order' location operator are
	printed to stdout.
	"""
	#Get a name to call this thing
	if 'locus_tag' in feature.qualifiers:
		name = feature.qualifiers['locus_tag'][0]
	elif 'gene' in feature.qualifiers:
		name = feature.qualifiers['gene'][0]
	else:
		name = "location" + str(feature.location)
	if not feature.sub_features:
		#In the simple case, there will be no sub_features (exons), just one location.
		if _location_is_fuzzy(feature.location):
			print("Fuzzy location detected in %s" % name)
		temp_seq = record.seq[feature.location.nofuzzy_start : feature.location.nofuzzy_end]
	else:
		if feature.location_operator == 'order':
			print("WARNING %s has a location " % (name))
			print("operator of 'order' rather than the more usual 'join'.")
		#If we have sub_features, that means we have exons that must be patched together.
		temp_seq = Seq('', IUPAC.ambiguous_dna) # Make an empty sequence instance.
		for exon in feature.sub_features:
			#Now deal with the sub-features one at a time
			if _location_is_fuzzy(exon.location):
				print("WARNING fuzzy location detected in %s" % name)
			exon_chunk = record.seq[exon.location.nofuzzy_start : exon.location.nofuzzy_end]
			#But wait! Is it in the opposite orientation as the feature itself??
			if feature.strand != exon.strand:
				exon_chunk = exon_chunk.reverse_complement()
			temp_seq = temp_seq + exon_chunk #Add the chunk to the sequence we are building.
	#Take the reverse complement if the feature itself is on the complement strand.
	#This must happen exactly once, after every exon has been added: doing it inside
	#the exon loop would flip the partly built sequence again for each exon.
	if feature.strand == -1:
		temp_seq = temp_seq.reverse_complement()
	return temp_seq
				
		
def _translation_table(feature):
	"""Return the first 'transl_table' qualifier of 'feature', or 11 when it has none."""
	if 'transl_table' in feature.qualifiers:
		return feature.qualifiers['transl_table'][0]
	return 11


def genbank_to_fasta(record, sequence_type, qualifier_list):
	"""Convert the features of one record into a list of SeqRecords ready for fasta output.

	record: the parsed record whose 'features' are walked.
	sequence_type: 'nt' for the feature nucleotides, 'taa' for the nucleotides
		translated on the fly, or 'aa' for the 'translation' qualifier (falling
		back to an on the fly translation when the feature has none).
	qualifier_list: passed on to build_header() to make each record id.

	Only features whose type equals the module level 'feature_type' are
	converted. Translation uses the feature's 'transl_table' qualifier, or
	table 11 when it has none.

	Returns one SeqRecord per matching feature, in feature order.
	Raises ValueError when a matching feature is found but sequence_type is
	not one of the values above.

	NOTE(review): get_nt_seq() and build_header() are called with the feature
	only, so they do not receive the 'record' argument given here — confirm the
	caller keeps any record they rely on in step with this argument.
	"""
	new_records = []
	for feature in record.features:
		#What kind of feature to extract is decided by the module level feature_type.
		if feature.type != feature_type:
			continue
		if sequence_type == 'nt':
			temp_seq = get_nt_seq(feature)
		elif sequence_type == 'taa':
			temp_seq = get_nt_seq(feature).translate(table = _translation_table(feature))
		elif sequence_type == 'aa':
			if 'translation' in feature.qualifiers:
				temp_seq = Seq(feature.qualifiers["translation"][0], IUPAC.protein)
			else:
				temp_seq = get_nt_seq(feature).translate(table = _translation_table(feature))
		else:
			#Without this branch nothing would be built for the feature, and the
			#append below would fail or repeat the record of the previous feature.
			raise ValueError("Unrecognized sequence_type '%s'. Use '-h' for help." % sequence_type)
		temp_record = SeqRecord(temp_seq, id = build_header(feature, qualifier_list),
			description = '')
		new_records.append(temp_record)
	return new_records


def _annotation_text(value):
	"""Turn one annotation value into a string.

	A list is joined with ' : ' after converting each of its items the same
	way, a string is returned unchanged, and anything else goes through str().
	"""
	if isinstance(value, list):
		return ' : '.join([_annotation_text(part) for part in value])
	#type(u'') is the unicode type where there is one and plain str otherwise.
	if isinstance(value, (str, type(u''))):
		return value
	return str(value)


def genbank_to_fasta_whole(record, annotation_list, user_header, delimiter):
	"""Wrap the whole sequence of 'record' in a single SeqRecord for fasta output.

	record: the parsed record; its 'seq' and 'annotations' are read.
	annotation_list: names of the record annotations that make up the header,
		in output order. Ignored when user_header is set.
	user_header: when set, used as the header as is.
	delimiter: the string placed between the header parts.

	An annotation that the record does not have is written as
	'missing_<item>_annotation'. Annotation values that are not strings (lists,
	numbers, other objects) are converted to text first, so asking for them
	does not stop the conversion.

	Returns a list holding the one new SeqRecord, because the result is used
	in a context requiring a list.
	"""
	if user_header:
		header = user_header
	else:
		header_parts = []
		for item in annotation_list:
			if item in record.annotations:
				header_part = _annotation_text(record.annotations[item])
				header_part = header_part.replace("\n", " ") #Catch improper newline character
				#Catch improper run of whitespace; the literal is kept exactly as found in this file.
				header_part = header_part.replace("						 ", " ")
				header_parts.append(header_part)
			else:
				header_parts.append('missing_%s_annotation' % item)
		header = delimiter.join(header_parts)
	temp_record = SeqRecord(record.seq, id = header, description = '')
	return [temp_record] #Return a list because will be used in a context requiring a list

#### Main ####
##############
#Read the in_file one record at a time, convert each record and write the result
#to the out_file. The 'with' blocks close both files on every way out, including
#a conversion error or an early exit.
#'record' is deliberately a module level name: get_nt_seq() and build_header()
#read it to reach the sequence of the record being converted.
with open (in_file, 'rU') as in_file_handle: #The 'U' option so we don't have to worry about line endings
	with open (out_file, 'w') as out_file_handle:
		record_iterator = SeqIO.parse(in_file_handle, file_format)

		for record in record_iterator:
			print("Converting '%s' to fasta ..." % record.description)
			if sequence_type in ['nt', 'aa', 'taa']:
				fasta_records = genbank_to_fasta(record, sequence_type, qualifier_list)
			elif sequence_type == 'whole': #whole records are handled specially
				fasta_records = genbank_to_fasta_whole(record, annotation_list, user_header, delimiter)
			else:
				#sys.exit() with a message writes it to stderr and exits with status 1,
				#so the failure is visible to whatever started the script.
				sys.exit("Unrecognized sequence_type. Use '-h' for help.")
			SeqIO.write(fasta_records, out_file_handle, 'fasta')