-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from andrewjpage/master
initial working version
- Loading branch information
Showing
15 changed files
with
20,855 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Install GenomeTools, including its Python bindings. Make sure the GenomeTools shared library is on your LD_LIBRARY_PATH and the Python bindings are on your PYTHONPATH. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import sys | ||
from gt import CustomVisitor | ||
from collections import defaultdict | ||
from gff3toembl import convert | ||
|
||
class EMBLConverter(CustomVisitor):
    """GenomeTools visitor that collects EMBL text while a GFF3 stream is read.

    State filled in as nodes are visited:

    * ``seqs``    -- sequence node description -> sequence string
    * ``feats``   -- sequence id -> list of EMBL feature strings
    * ``regions`` -- region nodes, in the order they were visited
    """

    def __init__(self):
        CustomVisitor.__init__(self)
        self.seqs = {}
        self.feats = defaultdict(list)
        self.regions = []
        self.converter = convert.Convert()

    def visit_feature_node(self, fn):
        """Render *fn* as EMBL feature text and file it under its sequence id."""
        feature_string = self.converter.construct_feature(
            feature_type=fn.get_type(),
            start=fn.get_start(),
            end=fn.get_end(),
            strand=fn.get_strand(),
            feature_attributes=fn.attribs,
        )
        # construct_feature returns '' for feature types it ignores.
        if feature_string:
            self.feats[fn.get_seqid()].append(feature_string)

    def visit_region_node(self, rn):
        """Remember the visited region node."""
        # The original statement was a bare ``self.regions.append`` (no call),
        # which silently did nothing.
        # NOTE(review): this keeps a reference to the node object itself;
        # confirm the node stays valid after the stream moves on before
        # reading ``regions`` later.
        self.regions.append(rn)

    def visit_comment_node(self, cn):
        """Comments are not carried over to the EMBL output (for now)."""
        pass

    def visit_sequence_node(self, sn):
        """Store the node's sequence, keyed by the node's description."""
        self.seqs[sn.get_description()] = sn.get_sequence()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
import sys | ||
from collections import defaultdict | ||
from gt import GFF3InStream | ||
|
||
from gff3toembl import convert | ||
from gff3toembl.EMBLConverter import EMBLConverter | ||
from gff3toembl.VisitorStream import VisitorStream | ||
|
||
class EMBLWriter():
    """Convert one GFF3 file into an EMBL flat file.

    The constructor only records its arguments; ``parse_and_run`` reads the
    GFF3 file and writes ``output_filename``.
    """

    def __init__(self, gff3_file, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location, output_filename, locus_tag = None):
        self.locus_tag = locus_tag
        self.converter = convert.Convert(locus_tag)
        self.conv = EMBLConverter()
        # EMBLConverter builds its own Convert() without a locus tag, so the
        # requested locus_tag would never be applied to feature qualifiers.
        # Share this writer's converter with the visitor instead.
        self.conv.converter = self.converter
        self.gff3_file = gff3_file
        self.organism = organism
        self.taxonid = taxonid
        self.project = project
        self.description = description
        self.authors = authors
        self.title = title
        self.publication = publication
        self.genome_type = genome_type
        self.classification = classification
        self.submitter_name = submitter_name
        self.submitter_title = submitter_title
        self.submitter_location = submitter_location
        self.output_filename = output_filename

    def output_seq(self, seq):
        """Return the EMBL sequence section (SQ header plus body) for *seq*."""
        return self.converter.construct_sequence(seq)

    def output_source(self, sequence_length, organism, taxonid):
        """Return the EMBL ``source`` feature for one sequence."""
        return self.converter.source_template(sequence_length, organism, taxonid)

    def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location):
        """Write one EMBL record per sequence id to ``self.output_filename``.

        Records are written in sorted sequence-id order and numbered from 1;
        each record is header, source feature, collected features, sequence,
        then the ``//`` terminator.
        """
        # The context manager closes the file even if a record fails to render.
        with open(self.output_filename, 'w') as target:
            for contig_number, seqid in enumerate(sorted(sequences), 1):
                sequence = self.conv.seqs[seqid]
                sequence_length = len(sequence)
                target.write(self.converter.populated_header(
                    sequence_length, project, description, contig_number,
                    authors, title, publication, genome_type, classification,
                    submitter_name, submitter_title, submitter_location))
                target.write(self.output_source(sequence_length, organism, taxonid))
                for feat in self.conv.feats[seqid]:
                    target.write(feat)
                target.write(self.output_seq(sequence))
                target.write("//\n")

    def parse_and_run(self):
        """Stream the GFF3 file through the visitor, then write the EMBL file.

        Any error raised while reading the stream is printed and the process
        exits with status 1.
        """
        ins = GFF3InStream(self.gff3_file)
        vs = VisitorStream(ins, self.conv)
        try:
            # Pulling every tree drives the visitor; the trees themselves are
            # not needed here.
            while vs.next_tree():
                pass
        except Exception as e:
            # "except X as e" and print(e) are valid on Python 2.6+ and 3.
            print(e)
            sys.exit(1)
        self.create_output_file(
            self.conv.seqs.keys(), self.organism, self.taxonid, self.project,
            self.description, self.authors, self.title, self.publication,
            self.genome_type, self.classification, self.submitter_name,
            self.submitter_title, self.submitter_location)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import sys | ||
from gt import CustomStream | ||
|
||
class VisitorStream(CustomStream):
    """Custom stream that shows every tree from *instream* to *visitor*."""

    def __init__(self, instream, visitor):
        CustomStream.__init__(self)
        self.visitor = visitor
        self.instream = instream

    def next(self):
        """Fetch the next tree, let the visitor see it, and hand it on.

        A falsy result from the wrapped stream is returned unchanged and is
        never passed to the visitor.
        """
        tree = self.instream.next_tree()
        if not tree:
            return tree
        tree.accept(self.visitor)
        return tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Names exported by "from gff3toembl import *".
__all__ = ['convert']
from gff3toembl import *
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
import os | ||
import string | ||
|
||
class Convert:
    """Render annotation data as EMBL flat-file text.

    Every method returns a string; nothing is written to disk here. EMBL is a
    fixed-column format: a two-letter line code followed by three spaces,
    feature keys padded to 16 characters so locations start in column 22,
    qualifiers indented to column 22, and 80-column sequence lines.
    """

    # Feature types that produce no output at all.
    features_to_ignore = {'ncRNA': 1}
    # Attributes that are never written as qualifiers.
    feature_attributes_to_ignore = {'ID': 1, 'protein_id': 1}
    # Attribute names that are renamed on output.
    feature_attributes_translations = {'eC_number': 'EC_number'}
    # Attributes whose comma-separated values each get their own qualifier;
    # for every other attribute only the first value is written.
    feature_attributes_to_split_on_multiple_lines = {'inference': 1, 'EC_number': 1}

    def __init__(self, locus_tag = None):
        # When set, replaces the prefix of every locus_tag qualifier value.
        self.locus_tag = locus_tag

    def blank_header(self):
        """Return the record header template with %-placeholders unfilled."""
        return "\n".join((
            "ID   XXX; XXX; %s; genomic DNA; STD; %s; %d BP.",
            "XX",
            "AC   %s",
            "XX",
            "PR   Project:%s",
            "XX",
            "DE   %s contig %d",
            "XX",
            "RN   [1]",
            "RA   %s",
            'RT   "%s"',
            "RL   %s",
            "XX",
            "RN   [2]",
            "RA   %s",
            'RT   "%s"',
            "RL   %s",
            "XX",
            "CC   Data release policy http://www.sanger.ac.uk/legal/#t_2",
            "XX",
            "FH   Key             Location/Qualifiers",
            "FH",
            "",  # the template ends with a newline
        ))

    def populated_header(self,
                         num_bp=1,
                         project="",
                         description="",
                         contig_number=1,
                         authors="Pathogen Genomics",
                         title="Draft assembly with annotation from Prokka",
                         publication="Unpublished",
                         genome_type="circular",
                         classification="UNC",
                         submitter_name="Pathogen Informatics",
                         submitter_title="Direct submission",
                         submitter_location="Sanger"):
        """Return the record header filled in for one contig.

        The AC value is the project name followed by the base-pair count and
        the contig number.
        """
        accession = project + str(num_bp) + str(contig_number)
        return self.blank_header() % (
            genome_type, classification, num_bp,
            accession,
            project,
            description, contig_number,
            authors, title, publication,
            submitter_name, submitter_title, submitter_location,
        )

    def source_template(self, sequence_length = None, organism = None, taxon_id = None):
        """Return the mandatory ``source`` feature spanning the whole sequence.

        *sequence_length* and *taxon_id* are formatted with ``%d`` and so must
        be numeric.
        """
        qualifier_indent = "FT" + " " * 19
        return "".join((
            "FT   source          1..%d\n" % sequence_length,
            '%s/organism="%s"\n' % (qualifier_indent, organism),
            '%s/mol_type="genomic DNA"\n' % qualifier_indent,
            '%s/db_xref="taxon:%d"\n' % (qualifier_indent, taxon_id),
        ))

    def construct_sequence(self,sequence):
        """Return the SQ header line followed by the formatted sequence body."""
        return self.sequence_header(sequence) + self.sequence_body(sequence)

    def sequence_header(self, sequence):
        """Return the ``SQ`` line with per-base counts (case-insensitive)."""
        sequence = sequence.lower()
        a = sequence.count('a')
        c = sequence.count('c')
        g = sequence.count('g')
        t = sequence.count('t')
        other = len(sequence) - a - c - g - t
        return "SQ   Sequence %d BP; %d A; %d C; %d G; %d T; %d other;\n" % \
            (len(sequence), a, c, g, t, other)

    def sequence_body(self, sequence):
        """Return *sequence* laid out as 80-column EMBL sequence lines.

        Each line is five spaces, up to six space-separated groups of ten
        lower-case bases, and the running base count right-aligned so that it
        ends in column 80. An empty sequence yields a single line counting 0.
        """
        sequence = sequence.lower()
        # Columns 1-70 hold indent + bases, columns 71-80 hold the count.
        # Padding with format widths avoids the hand-computed space arithmetic,
        # which used "/" (a float on Python 3) and came out one column short
        # when the last line held 9, 19, 29, 39 or 49 bases.
        line_format = "%-70s%10d\n"
        if not sequence:
            return line_format % ("", 0)
        lines = []
        for offset in range(0, len(sequence), 60):
            row = sequence[offset:offset + 60]
            groups = [row[k:k + 10] for k in range(0, len(row), 10)]
            lines.append(line_format % ("     " + " ".join(groups), offset + len(row)))
        return "".join(lines)

    def feature_header(self, feature_type = None, start = None, end = None, strand = None):
        """Return the ``FT`` key/location line for one feature.

        Minus-strand features are wrapped in ``complement(...)``.
        """
        location = "%d..%d" % (start, end)
        if strand == '-':
            location = "complement(%s)" % location
        # Keys are left-justified in 16 columns; longer keys are not truncated.
        return "FT   %-16s%s\n" % (feature_type, location)

    def construct_feature(self, feature_type = None, start = None, end = None, strand = None, feature_attributes = None):
        """Return the full EMBL text for one feature, or '' if it is ignored.

        *feature_attributes* maps attribute names to comma-separated value
        strings; omitting it yields a feature with no qualifiers.
        """
        if feature_type in self.features_to_ignore:
            return ''
        # None instead of a shared mutable {} default.
        if feature_attributes is None:
            feature_attributes = {}
        parts = [self.feature_header(feature_type, start, end, strand)]
        for attribute_key, attribute_value in feature_attributes.items():
            parts.append(self.construct_feature_attribute(
                attribute_key=attribute_key, attribute_value=attribute_value))
        return "".join(parts)

    def update_locus_tag(self,attribute_value):
        """Swap the prefix of a locus tag for ``self.locus_tag``, if one is set.

        Only the part after the last underscore of the original value is kept.
        """
        if self.locus_tag is None:
            return attribute_value
        suffix = attribute_value.split('_')[-1]
        return self.locus_tag + '_' + str(suffix)

    def construct_feature_attribute(self,attribute_key = None, attribute_value = None):
        """Return the qualifier line(s) for one attribute, or '' if ignored."""
        if attribute_key in self.feature_attributes_to_ignore:
            return ''
        # The ignore check uses the raw name; everything after uses the
        # translated one.
        attribute_key = self.feature_attributes_translations.get(attribute_key, attribute_key)

        if attribute_key == 'locus_tag':
            attribute_value = self.update_locus_tag(attribute_value)

        values = attribute_value.split(',')
        if attribute_key not in self.feature_attributes_to_split_on_multiple_lines:
            # Single-valued qualifier: anything after the first comma is dropped.
            values = values[:1]
        return "".join(
            self.create_multi_line_feature_attribute_string(attribute_key, value)
            for value in values)

    def create_multi_line_feature_attribute_string(self,attribute_key = None, attribute_value = None):
        """Return one quoted qualifier, wrapped over continuation lines.

        The first line carries ``/key=`` plus as much of the quoted value as
        fits; each continuation line carries up to 57 further characters.
        """
        indent = "FT" + " " * 19
        quoted = '"' + attribute_value + '"'

        # The key shares the first line with the value, so longer keys leave
        # less room for it.
        first_line_size = 55 - len(attribute_key)
        lines = ["%s/%s=%s\n" % (indent, attribute_key, quoted[:first_line_size])]

        remainder = quoted[first_line_size:]
        for offset in range(0, len(remainder), 57):
            lines.append("%s%s\n" % (indent, remainder[offset:offset + 57]))
        return "".join(lines)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import unittest | ||
import sys | ||
import os | ||
import filecmp | ||
from gff3toembl.EMBLWriter import EMBLWriter | ||
from gff3toembl import convert | ||
|
||
modules_dir = os.path.dirname(os.path.abspath(convert.__file__)) | ||
data_dir = os.path.join(modules_dir, 'tests', 'data') | ||
|
||
class TestEMBLWriter(unittest.TestCase):
    """End-to-end checks: convert a GFF3 fixture and compare with expected EMBL."""

    def _assert_conversion_matches(self, stem):
        """Convert ``<stem>.gff`` from data_dir and compare with ``expected_<stem>.embl``.

        The generated ``<stem>.embl`` is written to the current directory and
        removed afterwards, whether or not the comparison succeeds.
        """
        output_filename = stem + '.embl'
        emblwriter = EMBLWriter(os.path.join(data_dir, stem + '.gff'),
                                'Organism',
                                1234,
                                'My project',
                                'My description',
                                'John',
                                'Some title',
                                'Some journal',
                                'circular',
                                'PROK',
                                'Jane',
                                'My institute',
                                'UK', output_filename, None)
        try:
            emblwriter.parse_and_run()
            # assertTrue rather than a bare assert, which is stripped under -O.
            self.assertTrue(filecmp.cmp(os.path.join(data_dir, 'expected_' + stem + '.embl'),
                                        output_filename, shallow=False))
        finally:
            # Do not leave the output behind when the comparison fails; the
            # file may not exist if the conversion itself failed.
            if os.path.exists(output_filename):
                os.remove(output_filename)

    def test_single_feature(self):
        '''test that the script will convert from GFF3 to EMBL'''
        self._assert_conversion_matches('single_feature')

    def test_large_conversion(self):
        '''test a large gff3 file converts to EMBL'''
        self._assert_conversion_matches('large_annotation')
Oops, something went wrong.