Merge pull request #1 from andrewjpage/master

initial working version
sanger-pathogens · Oct 6, 2014 · 34baebf · 34baebf
2 parents 57fe555 + 88550f7
commit 34baebf
Show file tree

Hide file tree

Showing 15 changed files with 20,855 additions and 0 deletions.
diff --git a/AUTHORS b/AUTHORS
diff --git a/README.md b/README.md
@@ -0,0 +1 @@
+Install GenomeTools, including its Python bindings. Make sure the GenomeTools shared library is on your LD_LIBRARY_PATH and the Python bindings (the `gt` module) are on your PYTHONPATH.
diff --git a/gff3toembl/EMBLConverter.py b/gff3toembl/EMBLConverter.py
@@ -0,0 +1,27 @@
+import sys
+from gt import CustomVisitor
+from collections import defaultdict
+from gff3toembl import convert
+
+class EMBLConverter(CustomVisitor):
+
+    def __init__(self):
+        CustomVisitor.__init__(self)
+        self.seqs = {}
+        self.feats = defaultdict(lambda: [], {})
+        self.regions = []
+        self.converter =  convert.Convert()
+
+    def visit_feature_node(self, fn):
+        feature_string = self.converter.construct_feature(feature_type = fn.get_type(), start = fn.get_start(), end = fn.get_end(), strand = fn.get_strand(), feature_attributes = fn.attribs)
+        if feature_string != '':
+          self.feats[fn.get_seqid()].append(feature_string)
+
+    def visit_region_node(self, rn):
+        self.regions.append
+
+    def visit_comment_node(self, cn):
+        pass  # for now
+
+    def visit_sequence_node(self, sn):
+        self.seqs[sn.get_description()] = sn.get_sequence()
diff --git a/gff3toembl/EMBLWriter.py b/gff3toembl/EMBLWriter.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from collections import defaultdict
+from gt import GFF3InStream
+
+from gff3toembl import convert
+from gff3toembl.EMBLConverter import EMBLConverter
+from gff3toembl.VisitorStream import VisitorStream
+
+class EMBLWriter():
+
+    def __init__(self, gff3_file, organism, taxonid, project, description, authors, title,  publication, genome_type, classification, submitter_name, submitter_title,  submitter_location, output_filename, locus_tag = None):
+        self.locus_tag          = locus_tag
+        self.converter          = convert.Convert(locus_tag)
+        self.conv               = EMBLConverter()
+        self.gff3_file          = gff3_file
+        self.organism           = organism          
+        self.taxonid            = taxonid           
+        self.project            = project           
+        self.description        = description       
+        self.authors            = authors           
+        self.title              = title             
+        self.publication        = publication       
+        self.genome_type        = genome_type       
+        self.classification     = classification    
+        self.submitter_name     = submitter_name    
+        self.submitter_title    = submitter_title   
+        self.submitter_location = submitter_location
+        self.output_filename    = output_filename
+
+
+    def output_seq(self, seq):
+        sequence_string = self.converter.construct_sequence(seq)
+        return sequence_string
+
+    def output_source(self, sequence_length, organism, taxonid):
+        source_string = self.converter.source_template(sequence_length,organism, taxonid)
+        return source_string
+
+    def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location):
+        i = 1
+        target = open(self.output_filename, 'w')
+        for seqid in sorted(sequences):
+            target.write(self.converter.populated_header(len(self.conv.seqs[seqid]),  project, description, i, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location ) )
+            target.write(self.output_source(len(self.conv.seqs[seqid]), organism, taxonid))
+            for feat in self.conv.feats[seqid]:
+                target.write(feat)
+            target.write(self.output_seq(self.conv.seqs[seqid]))
+            target.write("//\n")
+            i +=1
+        target.close()
+
+    def parse_and_run(self):
+        ins = GFF3InStream(self.gff3_file)  
+        vs = VisitorStream(ins, self.conv)
+        try:
+            while (vs.next_tree()):
+                pass
+        except Exception, e:
+            print e
+            exit(1)
+        self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification, self.submitter_name, self.submitter_title, self.submitter_location)
+
diff --git a/gff3toembl/VisitorStream.py b/gff3toembl/VisitorStream.py
@@ -0,0 +1,15 @@
+import sys
+from gt import CustomStream
+
+class VisitorStream(CustomStream):
+
+    def __init__(self, instream, visitor):
+        CustomStream.__init__(self)
+        self.instream = instream
+        self.visitor = visitor
+
+    def next(self):
+        node = self.instream.next_tree()
+        if node:
+            node.accept(self.visitor)
+        return node
diff --git a/gff3toembl/__init__.py b/gff3toembl/__init__.py
@@ -0,0 +1,4 @@
+__all__ = [
+    'convert',
+]
+from gff3toembl import *
diff --git a/gff3toembl/convert.py b/gff3toembl/convert.py
@@ -0,0 +1,171 @@
+import os
+import string
+
+class Convert:
+    features_to_ignore = {'ncRNA': 1}
+    feature_attributes_to_ignore = {'ID': 1, 'protein_id': 1}
+    feature_attributes_translations = {'eC_number': 'EC_number'}
+    feature_attributes_to_split_on_multiple_lines = {'inference': 1, 'EC_number': 1}
+
+    def __init__(self, locus_tag = None):
+        self.locus_tag = locus_tag
+
+    def blank_header(self):
+      header = """\
+ID   XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
+XX
+AC   %s
+XX
+PR   Project:%s
+XX
+DE   %s contig %d
+XX
+RN   [1]
+RA   %s
+RT   "%s"
+RL   %s
+XX
+RN   [2]
+RA   %s
+RT   "%s"
+RL   %s
+XX
+CC   Data release policy http://www.sanger.ac.uk/legal/#t_2
+XX
+FH   Key             Location/Qualifiers
+FH
+"""
+      return header
+
+    def populated_header(self,
+        num_bp=1,
+        project="", 
+        description="",
+        contig_number=1, 
+        authors="Pathogen Genomics", 
+        title="Draft assembly with annotation from Prokka",
+        publication="Unpublished",
+        genome_type="circular",
+        classification="UNC",
+        submitter_name="Pathogen Informatics",
+        submitter_title="Direct submission",
+        submitter_location="Sanger"):
+
+        header = self.blank_header()
+        header_with_values = header % (genome_type, classification, num_bp,project+str(num_bp)+str(contig_number), project, description, contig_number,authors,title,publication,submitter_name,submitter_title,submitter_location )
+        return header_with_values
+
+    def source_template(self, sequence_length = None, organism = None, taxon_id = None):
+        source_template = """\
+FT   source          1..%d
+FT                   /organism="%s"
+FT                   /mol_type="genomic DNA"
+FT                   /db_xref="taxon:%d"
+"""   % (sequence_length, organism,taxon_id)
+        return source_template
+
+    def construct_sequence(self,sequence):
+      sequence_string = ''
+      sequence_string += self.sequence_header(sequence)
+      sequence_string += self.sequence_body(sequence)
+      return sequence_string
+
+    def sequence_header(self, sequence):
+      sequence = sequence.lower()
+      a = sequence.count('a')
+      c = sequence.count('c')
+      g = sequence.count('g')
+      t = sequence.count('t')
+      o = len(sequence) - a - c - g - t;
+      return "SQ   Sequence %d BP; %d A; %d C; %d G; %d T; %d other;\n" % \
+        (len(sequence), a, c, g, t, o)
+
+    def sequence_body(self, sequence):
+      sequence = sequence.lower()
+      output = "     "
+      i = 1
+      for j in range(len(sequence)):
+          output +=sequence[j]
+          if (i) % 10 == 0:
+              output += " "
+          if (i) % 60 == 0 and i < len(sequence) :
+              output += "%9s\n     " % (i)
+          elif (i) % 60 == 0  and i == len(sequence):
+             output += "%9s\n" % (i)
+             return output
+          i += 1
+
+      if((i)%60 ==0):
+        output += ' '*(66 -(((i-1)%60)/10) -((i-1)%60))  + "%9d\n" % (i - 1)
+        return output
+      else:
+        output +=' '*(80-i%60-(i%60)/10-13) + "%9d\n" % (i - 1)
+        return output
+
+    def feature_header(self, feature_type = None, start = None, end = None, strand = None):
+      string = ""
+      cmp1 = ''
+      cmp2 = ''
+      if strand == '-':
+          cmp1 = 'complement('
+          cmp2 = ')'
+      string += "FT   %s%s%s%d..%d%s\n" % (feature_type, ' ' * (16-len(feature_type)), cmp1, start, end, cmp2)
+      return string
+
+    def construct_feature(self, feature_type = None, start = None, end = None, strand = None, feature_attributes = {}):
+      feature = ''
+      if feature_type in self.features_to_ignore:
+        return feature
+
+      feature += self.feature_header( feature_type ,start, end, strand )
+      for attribute_key in feature_attributes.keys():
+        feature += self.construct_feature_attribute( attribute_key = attribute_key, attribute_value = feature_attributes[attribute_key])
+
+      return feature
+
+    def update_locus_tag(self,attribute_value):
+      if self.locus_tag == None:
+        return attribute_value
+      locus_tag_parts = attribute_value.split('_')
+      new_attribute = self.locus_tag + '_' +str(locus_tag_parts[-1])
+      return new_attribute
+
+    def construct_feature_attribute(self,attribute_key = None, attribute_value = None):
+      feature_string = ''
+      if attribute_key in self.feature_attributes_to_ignore:      
+        return feature_string
+      if attribute_key in self.feature_attributes_translations:
+        attribute_key = self.feature_attributes_translations[attribute_key]
+
+      if attribute_key == 'locus_tag':
+        attribute_value = self.update_locus_tag(attribute_value)
+
+      split_attribute_values = attribute_value.split( ',')
+      if attribute_key not in self.feature_attributes_to_split_on_multiple_lines:
+        feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_values[0])
+      else:
+        for split_attribute_value in split_attribute_values:
+          feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_value)
+      return feature_string
+
+
+    def create_multi_line_feature_attribute_string(self,attribute_key = None, attribute_value = None):
+      feature_string = ''
+      attribute_value = '"' + attribute_value + '"'
+
+      # First line < first_line_size
+      first_line_size = 55 - ( len(attribute_key))
+      feature_string += "FT%s/%s=%s\n" % (' ' * 19, attribute_key, attribute_value[:first_line_size])
+      if attribute_value[first_line_size:] == None:
+        return feature_string
+      attribute_value = attribute_value[first_line_size:]
+
+      while(len(attribute_value) > 0):
+        feature_string += "FT%s%s\n" % (' ' * 19, attribute_value[:57])
+        if attribute_value[57:] == None:
+          return feature_string
+        attribute_value = attribute_value[57:]
+
+      return feature_string    
+
+
diff --git a/gff3toembl/tests/EMBLWriter_test.py b/gff3toembl/tests/EMBLWriter_test.py
@@ -0,0 +1,49 @@
+import unittest
+import sys
+import os
+import filecmp
+from gff3toembl.EMBLWriter import EMBLWriter
+from gff3toembl import convert
+
+# Locate the fixtures relative to the directory that contains convert.py,
+# so the tests do not depend on the current working directory.
+modules_dir = os.path.dirname(os.path.abspath(convert.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+class TestEMBLWriter(unittest.TestCase):
+
+    def test_single_feature(self):
+        '''test that the script will convert from GFF3 to EMBL'''
+        emblwriter = EMBLWriter(os.path.join(data_dir,'single_feature.gff'), 
+           'Organism', 
+           1234, 
+           'My project', 
+           'My description', 
+           'John', 
+           'Some title',  
+           'Some journal', 
+           'circular', 
+           'PROK', 
+           'Jane',
+           'My institute',  
+           'UK', 'single_feature.embl', None )
+        emblwriter.parse_and_run()
+        assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature.embl'), 'single_feature.embl', shallow=False)
+        os.remove('single_feature.embl')
+
+    def test_large_conversion(self):
+        '''test a large gff3 file converts to EMBL'''
+        emblwriter = EMBLWriter(os.path.join(data_dir,'large_annotation.gff'), 
+           'Organism', 
+           1234, 
+           'My project', 
+           'My description', 
+           'John', 
+           'Some title',  
+           'Some journal', 
+           'circular', 
+           'PROK', 
+           'Jane',
+           'My institute',  
+           'UK', 'large_annotation.embl', None )
+        emblwriter.parse_and_run()
+        assert filecmp.cmp(os.path.join(data_dir, 'expected_large_annotation.embl'), 'large_annotation.embl', shallow=False)
+        os.remove('large_annotation.embl')
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Install GenomeTools, including its Python bindings. Make sure the GenomeTools shared library is on your LD_LIBRARY_PATH and the Python bindings (the `gt` module) are on your PYTHONPATH.