Skip to content

Commit

Permalink
Merge pull request #12 from andrewjpage/update_format
Browse files Browse the repository at this point in the history
Update format
  • Loading branch information
andrewjpage committed Nov 19, 2014
2 parents 5ecd2ff + 841ffc5 commit 12aa080
Show file tree
Hide file tree
Showing 11 changed files with 192 additions and 342 deletions.
15 changes: 5 additions & 10 deletions gff3toembl/EMBLWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

class EMBLWriter(object):

def __init__(self, gff3_file, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location, output_filename, locus_tag = None, translation_table = 11, chromosome_list = None):
def __init__(self, gff3_file, organism, taxonid, project, description, authors, title, publication, genome_type, classification, output_filename, locus_tag = None, translation_table = 11, chromosome_list = None):
self.locus_tag = locus_tag
self.translation_table = translation_table
self.converter = convert.Convert(locus_tag,translation_table)
Expand All @@ -25,13 +25,8 @@ def __init__(self, gff3_file, organism, taxonid, project, description, authors,
self.publication = publication
self.genome_type = genome_type
self.classification = classification
self.submitter_name = submitter_name
self.submitter_title = submitter_title
self.submitter_location = submitter_location
self.output_filename = output_filename
self.chromosome_list = chromosome_list



def output_seq(self, seq):
sequence_string = self.converter.construct_sequence(seq)
Expand All @@ -41,11 +36,11 @@ def output_source(self, sequence_length, organism, taxonid,sequence_name):
source_string = self.converter.source_template(sequence_length,organism, taxonid,sequence_name)
return source_string

def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location):
def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification):
i = 1
target = open(self.output_filename, 'w')
for seqid in sorted(sequences):
target.write(self.converter.populated_header(len(self.conv.seqs[seqid]), project, description, i, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location ) )
target.write(self.converter.populated_header(len(self.conv.seqs[seqid]), project, description, i, authors, title, publication, genome_type, classification, seqid ) )
target.write(self.output_source(len(self.conv.seqs[seqid]), organism, taxonid,seqid ))
for feat in self.conv.feats[seqid]:
target.write(feat)
Expand All @@ -65,7 +60,7 @@ def create_chromosome_list(self, chromosome_list_filename, embl_filename):
object_accessions = []

for embl_line in embl_file.readlines():
m = re.match("AC \* _(\w+)", embl_line)
m = re.match("AC \* _(\w+)", embl_line)
if m != None and m.group(1):
object_accessions.append(m.group(1))

Expand All @@ -86,6 +81,6 @@ def parse_and_run(self):
except Exception, e:
print e
exit(1)
self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification, self.submitter_name, self.submitter_title, self.submitter_location)
self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification)
self.create_chromosome_list(self.chromosome_list, self.output_filename)

45 changes: 24 additions & 21 deletions gff3toembl/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,19 @@ def blank_header(self):
header = """\
ID XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
XX
AC * _%s
AC XXX;
XX
PR Project:%s
AC * _%s
XX
DE %s contig %d
PR Project:%s;
XX
RN [1]
RA %s
RT "%s"
RL %s.
DE XXX;
XX
RN [2]
RA %s
RT "%s"
RN [1]
RA %s;
RT "%s";
RL %s.
XX
RN [3]
RA Torsten Seemann;
RT "Prokka: rapid prokaryotic genome annotation"
RL Bioinformatics. 2014 Jul 15;30(14):2068-9.;
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
Expand All @@ -58,12 +48,12 @@ def populated_header(self,
publication="Unpublished",
genome_type="circular",
classification="UNC",
submitter_name="Pathogen Informatics",
submitter_title="Direct submission",
submitter_location="Wellcome Trust Sanger Institute"):
sequence_identifier=""
):

header = self.blank_header()
header_with_values = header % (genome_type, classification, num_bp,project+str(num_bp)+str(contig_number), project, description, contig_number,authors,title,publication,submitter_name,submitter_title,submitter_location )
sequence_identifier_filtered = re.sub(r'\W+', '', sequence_identifier)
header_with_values = header % (genome_type, classification, num_bp,sequence_identifier_filtered, project,authors,title,publication)
return header_with_values

def source_template(self, sequence_length = None, organism = None, taxon_id = None, sequence_name = None):
Expand Down Expand Up @@ -142,6 +132,15 @@ def update_locus_tag(self,attribute_value):
locus_tag_parts = attribute_value.split('_')
new_attribute = self.locus_tag + '_' +str(locus_tag_parts[-1])
return new_attribute

def search_hypo_protein(self, attribute_value):
    """Pick the product name to report from a comma-separated list.

    Splits ``attribute_value`` on ',' and returns the first piece that is
    not exactly 'hypothetical protein'. If every piece is
    'hypothetical protein', returns 'Uncharacterised protein' instead.
    """
    placeholder = 'hypothetical protein'
    informative_names = (
        name for name in attribute_value.split(',') if name != placeholder
    )
    # next() with a default supplies the fallback when no piece passes the filter.
    return next(informative_names, 'Uncharacterised protein')

def construct_feature_attribute(self,attribute_key = None, attribute_value = None):
feature_string = ''
Expand All @@ -150,6 +149,9 @@ def construct_feature_attribute(self,attribute_key = None, attribute_value = Non
if attribute_key in self.feature_attributes_translations:
attribute_key = self.feature_attributes_translations[attribute_key]

if attribute_key == 'product':
attribute_value = self.search_hypo_protein(attribute_value)

if attribute_key == 'locus_tag':
attribute_value = self.update_locus_tag(attribute_value)

Expand All @@ -160,6 +162,7 @@ def construct_feature_attribute(self,attribute_key = None, attribute_value = Non
split_attribute_values = attribute_value.split( ',')
if attribute_key not in self.feature_attributes_to_split_on_multiple_lines:
feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_values[0])

else:
for split_attribute_value in split_attribute_values:
feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_value)
Expand Down
20 changes: 5 additions & 15 deletions gff3toembl/tests/EMBLWriter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ def test_single_feature(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'single_feature.embl', None,11, None )
'single_feature.embl', None,11, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature.embl'), 'single_feature.embl', shallow=False)
os.remove('single_feature.embl')
Expand All @@ -41,9 +39,7 @@ def test_single_feature_new_locus_tag(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'single_feature.embl', 'new_locus_tag', 11, None )
'single_feature.embl', 'new_locus_tag', 11, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature_new_locus_tag.embl'), 'single_feature.embl', shallow=False)
os.remove('single_feature.embl')
Expand All @@ -60,9 +56,7 @@ def test_single_feature_translation_table(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'single_feature.embl', None, 1, None )
'single_feature.embl', None, 1, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature_translation_table.embl'), 'single_feature.embl', shallow=False)
os.remove('single_feature.embl')
Expand All @@ -80,9 +74,7 @@ def test_large_conversion(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'large_annotation.embl', None, 11, None )
'large_annotation.embl', None, 11, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_large_annotation.embl'), 'large_annotation.embl', shallow=False)
os.remove('large_annotation.embl')
Expand All @@ -100,9 +92,7 @@ def test_chromosome_list_conversion(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'chromosome_list.embl', None, 11, 'chromosome_list.txt' )
'chromosome_list.embl', None, 11, 'chromosome_list.txt' )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_chromosome_list.txt'), 'chromosome_list.txt', shallow=False)
os.remove('chromosome_list.embl')
Expand Down
69 changes: 32 additions & 37 deletions gff3toembl/tests/convert_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,19 @@ def test_blank_header(self):
expected_header = """\
ID XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
XX
AC * _%s
AC XXX;
XX
PR Project:%s
AC * _%s
XX
DE %s contig %d
PR Project:%s;
XX
RN [1]
RA %s
RT "%s"
RL %s.
DE XXX;
XX
RN [2]
RA %s
RT "%s"
RN [1]
RA %s;
RT "%s";
RL %s.
XX
RN [3]
RA Torsten Seemann;
RT "Prokka: rapid prokaryotic genome annotation"
RL Bioinformatics. 2014 Jul 15;30(14):2068-9.;
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
Expand All @@ -58,36 +48,24 @@ def test_populate_header(self):
publication="Unpublished",
genome_type="circular",
classification="UNC",
submitter_name="Jane Doe",
submitter_title="Direct submission",
submitter_location="Sanger"
sequence_identifier="contig123"
)
expected_populated_header = """\
ID XXX; XXX; circular; genomic DNA; STD; UNC; 1234 BP.
XX
AC * _PRJ123412341
AC XXX;
XX
PR Project:PRJ1234
AC * _contig123
XX
DE One line description contig 1
PR Project:PRJ1234;
XX
DE XXX;
XX
RN [1]
RA John Doe
RT "My title"
RA John Doe;
RT "My title";
RL Unpublished.
XX
RN [2]
RA Jane Doe
RT "Direct submission"
RL Sanger.
XX
RN [3]
RA Torsten Seemann;
RT "Prokka: rapid prokaryotic genome annotation"
RL Bioinformatics. 2014 Jul 15;30(14):2068-9.;
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
Expand Down Expand Up @@ -163,6 +141,23 @@ def test_construct_feature_locus_tag_update(self):
FT tRNA 174883..174959
FT /locus_tag="new_locus_tag_123"
FT /transl_table=11
"""

def test_hypo_search(self):
# Checks how construct_feature_attribute renders a 'product' attribute whose
# value is a comma-separated list that may contain 'hypothetical protein'.
# NOTE(review): the spacing inside the expected "FT ..." strings below may have
# been collapsed by page extraction -- confirm against the repository copy.
converter = convert.Convert()
# No 'hypothetical protein' entries: the first value in the list is emitted.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'abc,efg,hij') == """\
FT /product="abc"
"""
# A leading 'hypothetical protein' is skipped in favour of the next value.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein,efg,hij') == """\
FT /product="efg"
"""

# Several leading 'hypothetical protein' entries are all skipped.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein,hypothetical protein,hij') == """\
FT /product="hij"
"""

# Only 'hypothetical protein' available: the fallback product name is emitted.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein') == """\
FT /product="Uncharacterised protein"
"""

def test_create_db_xref_from_inference(self):
Expand Down
8 changes: 4 additions & 4 deletions gff3toembl/tests/data/expected_chromosome_list.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ABC3001 1 Chromosome
ABC3002 2 Plasmid
ABC3003 3 Plasmid
ABC3004 4 Plasmid
ERS1234SCcontig000003 1 Chromosome
ERS1234SCcontig000004 2 Plasmid
ERS1234SCcontig000005 3 Plasmid
ERS1234SCcontig000006 4 Plasmid
Loading

0 comments on commit 12aa080

Please sign in to comment.