Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update format #12

Merged
merged 13 commits into from
Nov 19, 2014
15 changes: 5 additions & 10 deletions gff3toembl/EMBLWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

class EMBLWriter(object):

def __init__(self, gff3_file, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location, output_filename, locus_tag = None, translation_table = 11, chromosome_list = None):
def __init__(self, gff3_file, organism, taxonid, project, description, authors, title, publication, genome_type, classification, output_filename, locus_tag = None, translation_table = 11, chromosome_list = None):
self.locus_tag = locus_tag
self.translation_table = translation_table
self.converter = convert.Convert(locus_tag,translation_table)
Expand All @@ -25,13 +25,8 @@ def __init__(self, gff3_file, organism, taxonid, project, description, authors,
self.publication = publication
self.genome_type = genome_type
self.classification = classification
self.submitter_name = submitter_name
self.submitter_title = submitter_title
self.submitter_location = submitter_location
self.output_filename = output_filename
self.chromosome_list = chromosome_list



def output_seq(self, seq):
sequence_string = self.converter.construct_sequence(seq)
Expand All @@ -41,11 +36,11 @@ def output_source(self, sequence_length, organism, taxonid,sequence_name):
source_string = self.converter.source_template(sequence_length,organism, taxonid,sequence_name)
return source_string

def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location):
def create_output_file(self, sequences, organism, taxonid, project, description, authors, title, publication, genome_type, classification):
i = 1
target = open(self.output_filename, 'w')
for seqid in sorted(sequences):
target.write(self.converter.populated_header(len(self.conv.seqs[seqid]), project, description, i, authors, title, publication, genome_type, classification, submitter_name, submitter_title, submitter_location ) )
target.write(self.converter.populated_header(len(self.conv.seqs[seqid]), project, description, i, authors, title, publication, genome_type, classification, seqid ) )
target.write(self.output_source(len(self.conv.seqs[seqid]), organism, taxonid,seqid ))
for feat in self.conv.feats[seqid]:
target.write(feat)
Expand All @@ -65,7 +60,7 @@ def create_chromosome_list(self, chromosome_list_filename, embl_filename):
object_accessions = []

for embl_line in embl_file.readlines():
m = re.match("AC \* _(\w+)", embl_line)
m = re.match("AC \* _(\w+)", embl_line)
if m != None and m.group(1):
object_accessions.append(m.group(1))

Expand All @@ -86,6 +81,6 @@ def parse_and_run(self):
except Exception, e:
print e
exit(1)
self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification, self.submitter_name, self.submitter_title, self.submitter_location)
self.create_output_file(self.conv.seqs.keys(), self.organism, self.taxonid, self.project, self.description, self.authors, self.title, self.publication, self.genome_type, self.classification)
self.create_chromosome_list(self.chromosome_list, self.output_filename)

45 changes: 24 additions & 21 deletions gff3toembl/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,19 @@ def blank_header(self):
header = """\
ID XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
XX
AC * _%s
AC XXX;
XX
PR Project:%s
AC * _%s
XX
DE %s contig %d
PR Project:%s;
XX
RN [1]
RA %s
RT "%s"
RL %s.
DE XXX;
XX
RN [2]
RA %s
RT "%s"
RN [1]
RA %s;
RT "%s";
RL %s.
XX
RN [3]
RA Torsten Seemann;
RT "Prokka: rapid prokaryotic genome annotation"
RL Bioinformatics. 2014 Jul 15;30(14):2068-9.;
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
Expand All @@ -58,12 +48,12 @@ def populated_header(self,
publication="Unpublished",
genome_type="circular",
classification="UNC",
submitter_name="Pathogen Informatics",
submitter_title="Direct submission",
submitter_location="Wellcome Trust Sanger Institute"):
sequence_identifier=""
):

header = self.blank_header()
header_with_values = header % (genome_type, classification, num_bp,project+str(num_bp)+str(contig_number), project, description, contig_number,authors,title,publication,submitter_name,submitter_title,submitter_location )
sequence_identifier_filtered = re.sub(r'\W+', '', sequence_identifier)
header_with_values = header % (genome_type, classification, num_bp,sequence_identifier_filtered, project,authors,title,publication)
return header_with_values

def source_template(self, sequence_length = None, organism = None, taxon_id = None, sequence_name = None):
Expand Down Expand Up @@ -142,6 +132,15 @@ def update_locus_tag(self,attribute_value):
locus_tag_parts = attribute_value.split('_')
new_attribute = self.locus_tag + '_' +str(locus_tag_parts[-1])
return new_attribute

def search_hypo_protein(self, attribute_value):
    """Choose the product name to report from a comma-separated value.

    The value is split on ',' and the first entry that carries information
    is returned, i.e. the first entry that is neither blank nor exactly
    'hypothetical protein'. Surrounding whitespace is ignored for the
    comparison and removed from the returned name, so a value such as
    'hypothetical protein, efg' yields 'efg'.

    Args:
        attribute_value: string holding one or more product names
            separated by commas.

    Returns:
        The first informative product name, or 'Uncharacterised protein'
        when every entry is blank or 'hypothetical protein'.
    """
    stripped_values = (value.strip() for value in attribute_value.split(','))
    # Blank entries (empty input, trailing comma) must not be reported as a
    # product name; they fall through to the generic placeholder instead.
    informative_values = (
        value for value in stripped_values
        if value and value != 'hypothetical protein'
    )
    return next(informative_values, 'Uncharacterised protein')

def construct_feature_attribute(self,attribute_key = None, attribute_value = None):
feature_string = ''
Expand All @@ -150,6 +149,9 @@ def construct_feature_attribute(self,attribute_key = None, attribute_value = Non
if attribute_key in self.feature_attributes_translations:
attribute_key = self.feature_attributes_translations[attribute_key]

if attribute_key == 'product':
attribute_value = self.search_hypo_protein(attribute_value)

if attribute_key == 'locus_tag':
attribute_value = self.update_locus_tag(attribute_value)

Expand All @@ -160,6 +162,7 @@ def construct_feature_attribute(self,attribute_key = None, attribute_value = Non
split_attribute_values = attribute_value.split( ',')
if attribute_key not in self.feature_attributes_to_split_on_multiple_lines:
feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_values[0])

else:
for split_attribute_value in split_attribute_values:
feature_string += self.create_multi_line_feature_attribute_string(attribute_key, split_attribute_value)
Expand Down
20 changes: 5 additions & 15 deletions gff3toembl/tests/EMBLWriter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ def test_single_feature(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'single_feature.embl', None,11, None )
'single_feature.embl', None,11, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature.embl'), 'single_feature.embl', shallow=False)
os.remove('single_feature.embl')
Expand All @@ -41,9 +39,7 @@ def test_single_feature_new_locus_tag(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'single_feature.embl', 'new_locus_tag', 11, None )
'single_feature.embl', 'new_locus_tag', 11, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature_new_locus_tag.embl'), 'single_feature.embl', shallow=False)
os.remove('single_feature.embl')
Expand All @@ -60,9 +56,7 @@ def test_single_feature_translation_table(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'single_feature.embl', None, 1, None )
'single_feature.embl', None, 1, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_single_feature_translation_table.embl'), 'single_feature.embl', shallow=False)
os.remove('single_feature.embl')
Expand All @@ -80,9 +74,7 @@ def test_large_conversion(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'large_annotation.embl', None, 11, None )
'large_annotation.embl', None, 11, None )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_large_annotation.embl'), 'large_annotation.embl', shallow=False)
os.remove('large_annotation.embl')
Expand All @@ -100,9 +92,7 @@ def test_chromosome_list_conversion(self):
'Some journal',
'circular',
'PROK',
'Jane',
'My institute',
'UK', 'chromosome_list.embl', None, 11, 'chromosome_list.txt' )
'chromosome_list.embl', None, 11, 'chromosome_list.txt' )
emblwriter.parse_and_run()
assert filecmp.cmp(os.path.join(data_dir, 'expected_chromosome_list.txt'), 'chromosome_list.txt', shallow=False)
os.remove('chromosome_list.embl')
Expand Down
69 changes: 32 additions & 37 deletions gff3toembl/tests/convert_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,29 +15,19 @@ def test_blank_header(self):
expected_header = """\
ID XXX; XXX; %s; genomic DNA; STD; %s; %d BP.
XX
AC * _%s
AC XXX;
XX
PR Project:%s
AC * _%s
XX
DE %s contig %d
PR Project:%s;
XX
RN [1]
RA %s
RT "%s"
RL %s.
DE XXX;
XX
RN [2]
RA %s
RT "%s"
RN [1]
RA %s;
RT "%s";
RL %s.
XX
RN [3]
RA Torsten Seemann;
RT "Prokka: rapid prokaryotic genome annotation"
RL Bioinformatics. 2014 Jul 15;30(14):2068-9.;
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
Expand All @@ -58,36 +48,24 @@ def test_populate_header(self):
publication="Unpublished",
genome_type="circular",
classification="UNC",
submitter_name="Jane Doe",
submitter_title="Direct submission",
submitter_location="Sanger"
sequence_identifier="contig123"
)
expected_populated_header = """\
ID XXX; XXX; circular; genomic DNA; STD; UNC; 1234 BP.
XX
AC * _PRJ123412341
AC XXX;
XX
PR Project:PRJ1234
AC * _contig123
XX
DE One line description contig 1
PR Project:PRJ1234;
XX
DE XXX;
XX
RN [1]
RA John Doe
RT "My title"
RA John Doe;
RT "My title";
RL Unpublished.
XX
RN [2]
RA Jane Doe
RT "Direct submission"
RL Sanger.
XX
RN [3]
RA Torsten Seemann;
RT "Prokka: rapid prokaryotic genome annotation"
RL Bioinformatics. 2014 Jul 15;30(14):2068-9.;
XX
CC Data release policy http://www.sanger.ac.uk/legal/#t_2
XX
FH Key Location/Qualifiers
FH
"""
Expand Down Expand Up @@ -163,6 +141,23 @@ def test_construct_feature_locus_tag_update(self):
FT tRNA 174883..174959
FT /locus_tag="new_locus_tag_123"
FT /transl_table=11
"""

def test_hypo_search(self):
# Checks which entry of a comma-separated 'product' value ends up in the
# /product qualifier emitted by construct_feature_attribute.
converter = convert.Convert()
# No 'hypothetical protein' entry: the first entry is emitted.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'abc,efg,hij') == """\
FT /product="abc"
"""
# A leading 'hypothetical protein' is skipped in favour of the next entry.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein,efg,hij') == """\
FT /product="efg"
"""

# Several leading 'hypothetical protein' entries are all skipped.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein,hypothetical protein,hij') == """\
FT /product="hij"
"""

# Only 'hypothetical protein' present: the placeholder name is emitted.
assert converter.construct_feature_attribute(attribute_key = 'product', attribute_value = 'hypothetical protein') == """\
FT /product="Uncharacterised protein"
"""

def test_create_db_xref_from_inference(self):
Expand Down
8 changes: 4 additions & 4 deletions gff3toembl/tests/data/expected_chromosome_list.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ABC3001 1 Chromosome
ABC3002 2 Plasmid
ABC3003 3 Plasmid
ABC3004 4 Plasmid
ERS1234SCcontig000003 1 Chromosome
ERS1234SCcontig000004 2 Plasmid
ERS1234SCcontig000005 3 Plasmid
ERS1234SCcontig000006 4 Plasmid
Loading