Skip to content

Commit

Permalink
Merge pull request #1 from andrewjpage/master
Browse files Browse the repository at this point in the history
initial working version
  • Loading branch information
andrewjpage committed Oct 6, 2014
2 parents 57fe555 + 88550f7 commit 34baebf
Show file tree
Hide file tree
Showing 15 changed files with 20,855 additions and 0 deletions.
Empty file added AUTHORS
Empty file.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Install GenomeTools, including its Python bindings. Make sure the GenomeTools shared library is on your LD_LIBRARY_PATH and that the Python bindings are on your PYTHONPATH.
27 changes: 27 additions & 0 deletions gff3toembl/EMBLConverter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import sys
from gt import CustomVisitor
from collections import defaultdict
from gff3toembl import convert

class EMBLConverter(CustomVisitor):
    """Visitor that collects EMBL-ready text while a GFF3 stream is walked.

    Attributes:
        seqs: dict mapping a sequence node's description to its sequence.
        feats: defaultdict mapping a seqid to the list of rendered feature
            strings, in the order the feature nodes were visited.
        regions: list of the region nodes that were visited.
        converter: the ``convert.Convert`` instance used to render features.
    """

    def __init__(self, locus_tag=None):
        """Create an empty collector.

        Args:
            locus_tag: optional value forwarded to ``convert.Convert``; when
                omitted the converter is built exactly as before.
        """
        CustomVisitor.__init__(self)
        self.seqs = {}
        self.feats = defaultdict(list)
        self.regions = []
        self.converter = convert.Convert(locus_tag)

    def visit_feature_node(self, fn):
        """Render a feature node and file it under the node's seqid."""
        feature_string = self.converter.construct_feature(
            feature_type=fn.get_type(),
            start=fn.get_start(),
            end=fn.get_end(),
            strand=fn.get_strand(),
            feature_attributes=fn.attribs)
        # The converter returns '' for feature types it ignores.
        if feature_string != '':
            self.feats[fn.get_seqid()].append(feature_string)

    def visit_region_node(self, rn):
        """Remember the region node."""
        # The original evaluated ``self.regions.append`` without calling it,
        # so nothing was ever stored.
        self.regions.append(rn)

    def visit_comment_node(self, cn):
        """Comment nodes are ignored."""
        pass  # for now

    def visit_sequence_node(self, sn):
        """Store the node's sequence under its description."""
        self.seqs[sn.get_description()] = sn.get_sequence()
65 changes: 65 additions & 0 deletions gff3toembl/EMBLWriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from collections import defaultdict
from gt import GFF3InStream

from gff3toembl import convert
from gff3toembl.EMBLConverter import EMBLConverter
from gff3toembl.VisitorStream import VisitorStream

class EMBLWriter():
    """Reads a GFF3 file and writes one EMBL record per sequence found in it."""

    def __init__(self, gff3_file, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location, output_filename, locus_tag = None):
        """Store the conversion settings.

        Args:
            gff3_file: path handed to ``GFF3InStream``.
            organism, taxonid: values for the ``source`` feature.
            project, description, authors, title, publication, genome_type,
                classification, submitter_name, submitter_title,
                submitter_location: values substituted into the record header.
            output_filename: file the records are written to.
            locus_tag: optional value passed to ``convert.Convert``, which
                uses it to rewrite ``locus_tag`` attributes.
        """
        self.locus_tag = locus_tag
        self.converter = convert.Convert(locus_tag)
        self.conv = EMBLConverter()
        # Features are rendered by the visitor's converter. Share ours so the
        # requested locus_tag is actually applied to them (the visitor's own
        # default converter is created without one).
        self.conv.converter = self.converter
        self.gff3_file = gff3_file
        self.organism = organism
        self.taxonid = taxonid
        self.project = project
        self.description = description
        self.authors = authors
        self.title = title
        self.publication = publication
        self.genome_type = genome_type
        self.classification = classification
        self.submitter_name = submitter_name
        self.submitter_title = submitter_title
        self.submitter_location = submitter_location
        self.output_filename = output_filename

    def output_seq(self, seq):
        """Return the sequence section (header line plus body) for ``seq``."""
        return self.converter.construct_sequence(seq)

    def output_source(self, sequence_length, organism, taxonid):
        """Return the ``source`` feature text for one record."""
        return self.converter.source_template(sequence_length, organism, taxonid)

    def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location):
        """Write one record per sequence id to ``self.output_filename``.

        Sequence ids are processed in sorted order and numbered from 1; the
        number is passed to the header as the contig number. Each record is
        header, source feature, collected features, sequence, then ``//``.
        """
        # ``with`` guarantees the file is closed even if a write raises.
        with open(self.output_filename, 'w') as target:
            for contig_number, seqid in enumerate(sorted(sequences), 1):
                sequence = self.conv.seqs[seqid]
                target.write(self.converter.populated_header(len(sequence), project, description, contig_number, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location))
                target.write(self.output_source(len(sequence), organism, taxonid))
                for feat in self.conv.feats[seqid]:
                    target.write(feat)
                target.write(self.output_seq(sequence))
                target.write("//\n")

    def parse_and_run(self):
        """Parse the GFF3 file, then write the EMBL output file.

        Any exception raised while parsing is printed and the process exits
        with status 1.
        """
        ins = GFF3InStream(self.gff3_file)
        vs = VisitorStream(ins, self.conv)
        try:
            # Pull every tree through the stream; the visitor collects the data.
            while vs.next_tree():
                pass
        except Exception as e:
            print(e)
            sys.exit(1)
        self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification, self.submitter_name, self.submitter_title, self.submitter_location)

15 changes: 15 additions & 0 deletions gff3toembl/VisitorStream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import sys
from gt import CustomStream

class VisitorStream(CustomStream):
    """Custom stream that runs a visitor over every tree read from a source stream."""

    def __init__(self, instream, visitor):
        """Remember the stream to read from and the visitor to apply."""
        CustomStream.__init__(self)
        self.instream = instream
        self.visitor = visitor

    def next(self):
        """Fetch the next tree, let the visitor process it, and return it.

        A falsy result from the source stream is returned untouched.
        """
        tree = self.instream.next_tree()
        if not tree:
            return tree
        tree.accept(self.visitor)
        return tree
4 changes: 4 additions & 0 deletions gff3toembl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Names exported by ``from gff3toembl import *``.
__all__ = [
'convert',
]
# NOTE(review): this star-imports the package from inside its own __init__;
# presumably it is meant to load the submodules listed in __all__ - confirm.
from gff3toembl import *
171 changes: 171 additions & 0 deletions gff3toembl/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
import os
import string

class Convert:
    """Builds the text fragments (header, source, features, sequence) of an EMBL record.

    NOTE(review): the output is column-oriented text. The whitespace inside
    the templates below is reproduced as found; confirm it matches the column
    layout expected by whatever consumes the records.
    """

    # Feature types that produce no output at all.
    features_to_ignore = {'ncRNA': 1}
    # Attribute keys that are dropped from the output.
    feature_attributes_to_ignore = {'ID': 1, 'protein_id': 1}
    # Attribute keys that are renamed before being written.
    feature_attributes_translations = {'eC_number': 'EC_number'}
    # Attribute keys whose comma-separated values each get their own qualifier.
    feature_attributes_to_split_on_multiple_lines = {'inference': 1, 'EC_number': 1}

    def __init__(self, locus_tag = None):
        """Args:
            locus_tag: optional replacement prefix for ``locus_tag`` attributes.
        """
        self.locus_tag = locus_tag

    def blank_header(self):
        """Return the record header template with %-style placeholders."""
        header = """\
ID XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
XX
AC %s
XX
PR Project:%s
XX
DE %s contig %d
XX
RN [1]
RA %s
RT "%s"
RL %s
XX
RN [2]
RA %s
RT "%s"
RL %s
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
        return header

    def populated_header(self,
                         num_bp=1,
                         project="",
                         description="",
                         contig_number=1,
                         authors="Pathogen Genomics",
                         title="Draft assembly with annotation from Prokka",
                         publication="Unpublished",
                         genome_type="circular",
                         classification="UNC",
                         submitter_name="Pathogen Informatics",
                         submitter_title="Direct submission",
                         submitter_location="Sanger"):
        """Return the header template filled in for one record.

        The accession (``AC``) value is the concatenation of ``project``,
        ``num_bp`` and ``contig_number``.
        """
        accession = project + str(num_bp) + str(contig_number)
        return self.blank_header() % (
            genome_type, classification, num_bp,
            accession,
            project,
            description, contig_number,
            authors, title, publication,
            submitter_name, submitter_title, submitter_location)

    def source_template(self, sequence_length = None, organism = None, taxon_id = None):
        """Return the ``source`` feature covering ``1..sequence_length``.

        ``sequence_length`` and ``taxon_id`` are formatted with ``%d`` and so
        must be numbers.
        """
        source_template = """\
FT source 1..%d
FT /organism="%s"
FT /mol_type="genomic DNA"
FT /db_xref="taxon:%d"
""" % (sequence_length, organism, taxon_id)
        return source_template

    def construct_sequence(self, sequence):
        """Return the ``SQ`` summary line followed by the formatted sequence."""
        return self.sequence_header(sequence) + self.sequence_body(sequence)

    def sequence_header(self, sequence):
        """Return the ``SQ`` line with total length and per-base counts.

        Counting is case-insensitive; anything other than a/c/g/t is "other".
        """
        sequence = sequence.lower()
        a = sequence.count('a')
        c = sequence.count('c')
        g = sequence.count('g')
        t = sequence.count('t')
        o = len(sequence) - a - c - g - t
        return "SQ Sequence %d BP; %d A; %d C; %d G; %d T; %d other;\n" % \
            (len(sequence), a, c, g, t, o)

    def sequence_body(self, sequence):
        """Return the lower-cased sequence as numbered lines of up to 60 bases.

        Each line is a left margin, the bases in space-separated groups of
        ten padded to a fixed 66-character field, and the index of the last
        base on the line right-aligned in 9 characters. An empty sequence
        yields a single blank line numbered 0.
        """
        sequence = sequence.lower()
        # NOTE(review): the original padding arithmetic (80 - ... - 13) looks
        # like it assumed a 5-character margin and 80-column lines; confirm
        # the margin literal.
        margin = " "
        lines = []
        # max(..., 1) keeps the historical single line for an empty sequence.
        for offset in range(0, max(len(sequence), 1), 60):
            row = sequence[offset:offset + 60]
            groups = [row[k:k + 10] for k in range(0, len(row), 10)]
            # Padding to a fixed width keeps the running count in the same
            # column on every line. The previous hand-computed padding was
            # one space short when the last line held 9, 19, 29, 39 or 49
            # bases, and used "/" which is not integer division on Python 3.
            body = ' '.join(groups).ljust(66)
            lines.append("%s%s%9d\n" % (margin, body, offset + len(row)))
        return ''.join(lines)

    def feature_header(self, feature_type = None, start = None, end = None, strand = None):
        """Return the location line of a feature.

        The feature type is padded to 16 characters; a ``'-'`` strand wraps
        the location in ``complement(...)``.
        """
        prefix = ''
        suffix = ''
        if strand == '-':
            prefix = 'complement('
            suffix = ')'
        padding = ' ' * (16 - len(feature_type))
        return "FT %s%s%s%d..%d%s\n" % (feature_type, padding, prefix, start, end, suffix)

    def construct_feature(self, feature_type = None, start = None, end = None, strand = None, feature_attributes = None):
        """Return the full text of one feature, or '' for ignored types.

        Args:
            feature_attributes: mapping of attribute key to value; ``None``
                (the default) means no attributes.
        """
        if feature_type in self.features_to_ignore:
            return ''
        # A fresh dict per call avoids sharing one mutable default object.
        if feature_attributes is None:
            feature_attributes = {}

        parts = [self.feature_header(feature_type, start, end, strand)]
        for attribute_key in feature_attributes.keys():
            parts.append(self.construct_feature_attribute(
                attribute_key = attribute_key,
                attribute_value = feature_attributes[attribute_key]))
        return ''.join(parts)

    def update_locus_tag(self, attribute_value):
        """Replace everything before the last ``_`` with the configured locus tag.

        Returns ``attribute_value`` unchanged when no locus tag was configured.
        """
        if self.locus_tag is None:
            return attribute_value
        locus_tag_parts = attribute_value.split('_')
        return self.locus_tag + '_' + str(locus_tag_parts[-1])

    def construct_feature_attribute(self, attribute_key = None, attribute_value = None):
        """Return the qualifier line(s) for one attribute, or '' if ignored.

        Keys listed in ``feature_attributes_to_split_on_multiple_lines`` emit
        one qualifier per comma-separated value; every other key emits only
        the first comma-separated value.
        """
        if attribute_key in self.feature_attributes_to_ignore:
            return ''
        if attribute_key in self.feature_attributes_translations:
            attribute_key = self.feature_attributes_translations[attribute_key]

        if attribute_key == 'locus_tag':
            attribute_value = self.update_locus_tag(attribute_value)

        split_attribute_values = attribute_value.split(',')
        if attribute_key not in self.feature_attributes_to_split_on_multiple_lines:
            return self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_values[0])
        return ''.join(
            self.create_multi_line_feature_attribute_string(attribute_key, value)
            for value in split_attribute_values)

    def create_multi_line_feature_attribute_string(self, attribute_key = None, attribute_value = None):
        """Return ``/key="value"`` wrapped over as many lines as needed.

        The first line carries up to ``55 - len(key)`` characters of the
        quoted value; continuation lines carry up to 57 characters each.
        """
        indent = ' ' * 19
        quoted = '"' + attribute_value + '"'

        # Clamp at 0 so a very long key cannot turn the slices below into
        # negative-index slices, which would duplicate and reorder text.
        first_line_size = max(55 - len(attribute_key), 0)
        lines = ["FT%s/%s=%s\n" % (indent, attribute_key, quoted[:first_line_size])]

        remainder = quoted[first_line_size:]
        while remainder:
            lines.append("FT%s%s\n" % (indent, remainder[:57]))
            remainder = remainder[57:]
        return ''.join(lines)


49 changes: 49 additions & 0 deletions gff3toembl/tests/EMBLWriter_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import unittest
import sys
import os
import filecmp
from gff3toembl.EMBLWriter import EMBLWriter
from gff3toembl import convert

modules_dir = os.path.dirname(os.path.abspath(convert.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')

class TestEMBLWriter(unittest.TestCase):
    """End-to-end checks: convert a GFF3 fixture and compare with the expected EMBL file."""

    def _assert_conversion(self, gff_name, expected_name, output_name):
        """Convert ``gff_name`` and assert the result equals ``expected_name``.

        Both fixture names are resolved inside ``data_dir``; ``output_name``
        is written to the current working directory and always removed
        afterwards, even when the comparison fails.
        """
        emblwriter = EMBLWriter(os.path.join(data_dir, gff_name),
                                'Organism',
                                1234,
                                'My project',
                                'My description',
                                'John',
                                'Some title',
                                'Some journal',
                                'circular',
                                'PROK',
                                'Jane',
                                'My institute',
                                'UK', output_name, None)
        try:
            emblwriter.parse_and_run()
            # assertTrue (unlike a bare assert) is not stripped under -O.
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, expected_name), output_name, shallow=False))
        finally:
            if os.path.exists(output_name):
                os.remove(output_name)

    def test_single_feature(self):
        '''test that the script will convert from GFF3 to EMBL'''
        self._assert_conversion('single_feature.gff', 'expected_single_feature.embl', 'single_feature.embl')

    def test_large_conversion(self):
        '''test a large gff3 file converts to EMBL'''
        self._assert_conversion('large_annotation.gff', 'expected_large_annotation.embl', 'large_annotation.embl')
Loading

0 comments on commit 34baebf

Please sign in to comment.