Merge pull request #12 from andrewjpage/update_format

Update format
sanger-pathogens · Nov 19, 2014 · 12aa080
2 parents 5ecd2ff + 841ffc5
commit 12aa080
Show file tree

Hide file tree

Showing 11 changed files with 192 additions and 342 deletions.
diff --git a/gff3toembl/EMBLWriter.py b/gff3toembl/EMBLWriter.py
@@ -10,7 +10,7 @@
 
 class EMBLWriter(object):
 
-    def __init__(self, gff3_file, organism, taxonid, project, description, authors, title,  publication, genome_type, classification, submitter_name, submitter_title,  submitter_location, output_filename, locus_tag = None, translation_table = 11, chromosome_list = None):
+    def __init__(self, gff3_file, organism, taxonid, project, description, authors, title,  publication, genome_type, classification,  output_filename, locus_tag = None, translation_table = 11, chromosome_list = None):
         self.locus_tag          = locus_tag
         self.translation_table  = translation_table
         self.converter          = convert.Convert(locus_tag,translation_table)
@@ -25,13 +25,8 @@ def __init__(self, gff3_file, organism, taxonid, project, description, authors,
         self.publication        = publication       
         self.genome_type        = genome_type       
         self.classification     = classification    
-        self.submitter_name     = submitter_name    
-        self.submitter_title    = submitter_title   
-        self.submitter_location = submitter_location
         self.output_filename    = output_filename
         self.chromosome_list    = chromosome_list
-
-
 
     def output_seq(self, seq):
         sequence_string = self.converter.construct_sequence(seq)
@@ -41,11 +36,11 @@ def output_source(self, sequence_length, organism, taxonid,sequence_name):
         source_string = self.converter.source_template(sequence_length,organism, taxonid,sequence_name)
         return source_string
 
-    def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location):
+    def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification):
         i = 1
         target = open(self.output_filename, 'w')
         for seqid in sorted(sequences):
-            target.write(self.converter.populated_header(len(self.conv.seqs[seqid]),  project, description, i, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location ) )
+            target.write(self.converter.populated_header(len(self.conv.seqs[seqid]),  project, description, i, authors, title, publication, genome_type, classification, seqid ) )
             target.write(self.output_source(len(self.conv.seqs[seqid]), organism, taxonid,seqid ))
             for feat in self.conv.feats[seqid]:
                 target.write(feat)
@@ -65,7 +60,7 @@ def create_chromosome_list(self, chromosome_list_filename, embl_filename):
         object_accessions = []
 
         for embl_line in embl_file.readlines():
-          m = re.match("AC   \* _(\w+)", embl_line)
+          m = re.match("AC \* _(\w+)", embl_line)
           if m != None and m.group(1):
             object_accessions.append(m.group(1))
 
@@ -86,6 +81,6 @@ def parse_and_run(self):
         except Exception, e:
             print e
             exit(1)
-        self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification, self.submitter_name, self.submitter_title, self.submitter_location)
+        self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification)
         self.create_chromosome_list(self.chromosome_list, self.output_filename)
 
diff --git a/gff3toembl/convert.py b/gff3toembl/convert.py
@@ -20,29 +20,19 @@ def blank_header(self):
       header = """\
 ID   XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
 XX
-AC   * _%s
+AC   XXX;
 XX
-PR   Project:%s
+AC * _%s
 XX
-DE   %s contig %d
+PR   Project:%s;
 XX
-RN   [1]
-RA   %s
-RT   "%s"
-RL   %s.
+DE   XXX;
 XX
-RN   [2]
-RA   %s
-RT   "%s"
+RN   [1]
+RA   %s;
+RT   "%s";
 RL   %s.
 XX
-RN   [3]
-RA   Torsten Seemann;
-RT   "Prokka: rapid prokaryotic genome annotation"
-RL    Bioinformatics. 2014 Jul 15;30(14):2068-9.;
-XX
-CC   Data release policy http://www.sanger.ac.uk/legal/#t_2
-XX
 FH   Key             Location/Qualifiers
 FH
 """
@@ -58,12 +48,12 @@ def populated_header(self,
         publication="Unpublished",
         genome_type="circular",
         classification="UNC",
-        submitter_name="Pathogen Informatics",
-        submitter_title="Direct submission",
-        submitter_location="Wellcome Trust Sanger Institute"):
+        sequence_identifier=""
+        ):
 
         header = self.blank_header()
-        header_with_values = header % (genome_type, classification, num_bp,project+str(num_bp)+str(contig_number), project, description, contig_number,authors,title,publication,submitter_name,submitter_title,submitter_location )
+        sequence_identifier_filtered  = re.sub(r'\W+', '', sequence_identifier)
+        header_with_values = header % (genome_type, classification, num_bp,sequence_identifier_filtered, project,authors,title,publication)
         return header_with_values
 
     def source_template(self, sequence_length = None, organism = None, taxon_id = None, sequence_name = None):
@@ -142,6 +132,15 @@ def update_locus_tag(self,attribute_value):
       locus_tag_parts = attribute_value.split('_')
       new_attribute = self.locus_tag + '_' +str(locus_tag_parts[-1])
       return new_attribute
+
+    def search_hypo_protein(self, attribute_value):
+      split_attribute_values = attribute_value.split( ',')
+
+      for split_attribute_value in split_attribute_values:
+        if split_attribute_value != 'hypothetical protein':
+          return split_attribute_value
+
+      return 'Uncharacterised protein'
 
     def construct_feature_attribute(self,attribute_key = None, attribute_value = None):
       feature_string = ''
@@ -150,6 +149,9 @@ def construct_feature_attribute(self,attribute_key = None, attribute_value = Non
       if attribute_key in self.feature_attributes_translations:
         attribute_key = self.feature_attributes_translations[attribute_key]
 
+      if attribute_key == 'product':
+          attribute_value = self.search_hypo_protein(attribute_value)
+
       if attribute_key == 'locus_tag':
         attribute_value = self.update_locus_tag(attribute_value)
 
@@ -160,6 +162,7 @@ def construct_feature_attribute(self,attribute_key = None, attribute_value = Non
           split_attribute_values = attribute_value.split( ',')
       if attribute_key not in self.feature_attributes_to_split_on_multiple_lines:
         feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_values[0])
+
       else:
         for split_attribute_value in split_attribute_values:
           feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_value)

diff --git a/gff3toembl/tests/EMBLWriter_test.py b/gff3toembl/tests/EMBLWriter_test.py
@@ -22,9 +22,7 @@ def test_single_feature(self):
            'Some journal', 
            'circular', 
            'PROK', 
-           'Jane',
-           'My institute',  
-           'UK', 'single_feature.embl', None,11,  None )
+           'single_feature.embl', None,11,  None )
         emblwriter.parse_and_run()
         assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature.embl'), 'single_feature.embl', shallow=False)
         os.remove('single_feature.embl')
@@ -41,9 +39,7 @@ def test_single_feature_new_locus_tag(self):
            'Some journal', 
            'circular', 
            'PROK', 
-           'Jane',
-           'My institute',  
-           'UK', 'single_feature.embl', 'new_locus_tag', 11, None )
+           'single_feature.embl', 'new_locus_tag', 11, None )
         emblwriter.parse_and_run()
         assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature_new_locus_tag.embl'), 'single_feature.embl', shallow=False)
         os.remove('single_feature.embl')
@@ -60,9 +56,7 @@ def test_single_feature_translation_table(self):
            'Some journal', 
            'circular', 
            'PROK', 
-           'Jane',
-           'My institute',
-           'UK', 'single_feature.embl', None, 1, None )
+           'single_feature.embl', None, 1, None )
         emblwriter.parse_and_run()
         assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature_translation_table.embl'), 'single_feature.embl', shallow=False)
         os.remove('single_feature.embl')
@@ -80,9 +74,7 @@ def test_large_conversion(self):
            'Some journal', 
            'circular', 
            'PROK', 
-           'Jane',
-           'My institute',  
-           'UK', 'large_annotation.embl', None, 11, None )
+           'large_annotation.embl', None, 11, None )
         emblwriter.parse_and_run()
         assert filecmp.cmp(os.path.join(data_dir, 'expected_large_annotation.embl'), 'large_annotation.embl', shallow=False)
         os.remove('large_annotation.embl')
@@ -100,9 +92,7 @@ def test_chromosome_list_conversion(self):
           'Some journal', 
           'circular', 
           'PROK', 
-          'Jane',
-          'My institute',  
-          'UK', 'chromosome_list.embl', None, 11, 'chromosome_list.txt' )
+          'chromosome_list.embl', None, 11, 'chromosome_list.txt' )
        emblwriter.parse_and_run()
        assert filecmp.cmp(os.path.join(data_dir, 'expected_chromosome_list.txt'), 'chromosome_list.txt', shallow=False)
        os.remove('chromosome_list.embl')

diff --git a/gff3toembl/tests/convert_test.py b/gff3toembl/tests/convert_test.py
@@ -15,29 +15,19 @@ def test_blank_header(self):
         expected_header = """\
 ID   XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
 XX
-AC   * _%s
+AC   XXX;
 XX
-PR   Project:%s
+AC * _%s
 XX
-DE   %s contig %d
+PR   Project:%s;
 XX
-RN   [1]
-RA   %s
-RT   "%s"
-RL   %s.
+DE   XXX;
 XX
-RN   [2]
-RA   %s
-RT   "%s"
+RN   [1]
+RA   %s;
+RT   "%s";
 RL   %s.
 XX
-RN   [3]
-RA   Torsten Seemann;
-RT   "Prokka: rapid prokaryotic genome annotation"
-RL    Bioinformatics. 2014 Jul 15;30(14):2068-9.;
-XX
-CC   Data release policy http://www.sanger.ac.uk/legal/#t_2
-XX
 FH   Key             Location/Qualifiers
 FH
 """
@@ -58,36 +48,24 @@ def test_populate_header(self):
           publication="Unpublished",
           genome_type="circular",
           classification="UNC",
-          submitter_name="Jane Doe",
-          submitter_title="Direct submission",
-          submitter_location="Sanger"
+          sequence_identifier="contig123"
           )
         expected_populated_header = """\
 ID   XXX; XXX; circular; genomic DNA; STD; UNC; 1234 BP.
 XX
-AC   * _PRJ123412341
+AC   XXX;
 XX
-PR   Project:PRJ1234
+AC * _contig123
 XX
-DE   One line description contig 1
+PR   Project:PRJ1234;
+XX
+DE   XXX;
 XX
 RN   [1]
-RA   John Doe
-RT   "My title"
+RA   John Doe;
+RT   "My title";
 RL   Unpublished.
 XX
-RN   [2]
-RA   Jane Doe
-RT   "Direct submission"
-RL   Sanger.
-XX
-RN   [3]
-RA   Torsten Seemann;
-RT   "Prokka: rapid prokaryotic genome annotation"
-RL    Bioinformatics. 2014 Jul 15;30(14):2068-9.;
-XX
-CC   Data release policy http://www.sanger.ac.uk/legal/#t_2
-XX
 FH   Key             Location/Qualifiers
 FH
 """
@@ -163,6 +141,23 @@ def test_construct_feature_locus_tag_update(self):
 FT   tRNA            174883..174959
 FT                   /locus_tag="new_locus_tag_123"
 FT                   /transl_table=11
+"""
+
+    def test_hypo_search(self):
+      converter = convert.Convert()
+      assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'abc,efg,hij') == """\
+FT                   /product="abc"
+"""
+      assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein,efg,hij') == """\
+FT                   /product="efg"
+"""
+
+      assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein,hypothetical protein,hij') == """\
+FT                   /product="hij"
+"""
+
+      assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein') == """\
+FT                   /product="Uncharacterised protein"
 """
 
     def test_create_db_xref_from_inference(self):

diff --git a/gff3toembl/tests/data/expected_chromosome_list.txt b/gff3toembl/tests/data/expected_chromosome_list.txt
@@ -1,4 +1,4 @@
-ABC3001	1	Chromosome
-ABC3002	2	Plasmid
-ABC3003	3	Plasmid
-ABC3004	4	Plasmid
+ERS1234SCcontig000003	1	Chromosome
+ERS1234SCcontig000004	2	Plasmid
+ERS1234SCcontig000005	3	Plasmid
+ERS1234SCcontig000006	4	Plasmid