Skip to content

Commit

Permalink
Merge pull request #47 from andrewjpage/wrap_header_lines
Browse files Browse the repository at this point in the history
Wrap publication lines
  • Loading branch information
satta committed Dec 10, 2015
2 parents 5f41452 + d22f31f commit f861acd
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 9 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# gff3toembl
Converts gff3 files to emble files for uploading to EBI.
Converts PROKKA GFF3 files to EMBL files for uploading annotated assemblies to EBI.

[![Build Status](https://travis-ci.org/sanger-pathogens/gff3toembl.svg?branch=master)](https://travis-ci.org/sanger-pathogens/gff3toembl)

NB this implements some EBI specific conventions and is not a generic conversion tool.
NB this implements some EBI-specific conventions and is not a generic conversion tool. It's also not a validator, so you need to pass in parameters which are acceptable to EMBL.

## Installation
gff3toembl only works with Python 2.7 and there are known issues on other versions of Python.
Expand Down
29 changes: 23 additions & 6 deletions gff3toembl/EMBLContig.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,32 +277,49 @@ def __init__(self,
source_attributes = self.build_source_attributes(organism, taxon_id, sequence_name)
self.source_feature = EMBLFeature(feature_type='source', start=1, end=sequence_length,
strand='+', feature_attributes=source_attributes)

self.header_template = """\
ID XXX; XXX; {genome_type}; genomic DNA; STD; {classification}; {sequence_length} BP.
XX
AC XXX;
XX
AC * _{sequence_identifier}
XX
PR Project:{project};
{project_line}
XX
DE XXX;
XX
RN [1]
RA {authors};
RT "{title}";
RL {publication}.
{publication_authors}
{publication_title}
{publication_name}
XX
FH Key Location/Qualifiers
FH
"""

def header_attribute_formatter(self, key, header_text, quote_character, final_character):
    """Render one header attribute as wrapped, key-prefixed lines.

    The attribute text is built as ``quote_character + header_text +
    quote_character + final_character`` and then wrapped to a width of 79
    columns. Every resulting line, including continuation lines, starts
    with ``key`` followed by a space.

    Args:
        key: line-type code placed at the start of every output line.
        header_text: the attribute value to emit.
        quote_character: string placed immediately before and after the
            value (pass ``''`` for no quoting).
        final_character: terminator appended after the closing quote.

    Returns:
        The wrapped lines joined with newlines, as a single string.
    """
    line_prefix = key + ' '
    # The same prefix is used for the first and all continuation lines so
    # that a wrapped value still carries its line-type code on every line.
    line_wrapper = TextWrapper(width=79,
                               initial_indent=line_prefix,
                               subsequent_indent=line_prefix)
    substitutions = {
        'attribute_header_text': header_text,
        'attribute_quote_character': quote_character,
        'attribute_final_character': final_character,
    }
    decorated_text = '{attribute_quote_character}{attribute_header_text}{attribute_quote_character}{attribute_final_character}'.format(**substitutions)
    return line_wrapper.fill(decorated_text)

def remove_non_word_characters(self, sequence_identifier):
    """Return ``sequence_identifier`` with every run of non-word characters removed.

    "Non-word" is whatever the regular expression class ``\\W`` matches;
    matched runs are deleted outright, not replaced with a separator.
    """
    non_word_run = re.compile(r'\W+')
    return non_word_run.sub('', sequence_identifier)

def format(self):
    """Render the full header text followed by the formatted source feature.

    The project, author, title and publication values are first turned
    into wrapped, key-prefixed lines by ``header_attribute_formatter`` and
    substituted into ``self.header_template`` together with the instance's
    own attributes. The result of ``self.source_feature.format()`` is
    appended to the rendered header.

    Returns:
        The header and source feature as one string.
    """
    # Start from the instance attributes, then overlay the pre-wrapped
    # lines. Building a single dict (rather than mixing explicit keyword
    # arguments with ``**self.__dict__``) avoids a "multiple values for
    # keyword argument" TypeError should an attribute ever share a
    # placeholder name.
    template_values = dict(self.__dict__)
    template_values.update(
        project_line=self.header_attribute_formatter("PR", "Project:" + self.project, '', ';'),
        publication_authors=self.header_attribute_formatter("RA", self.authors, '', ';'),
        publication_title=self.header_attribute_formatter("RT", self.title, '"', ';'),
        publication_name=self.header_attribute_formatter("RL", self.publication, '', '.'),
    )
    return self.header_template.format(**template_values) + self.source_feature.format()

def build_source_attributes(self, organism, taxon_id, sequence_name):
def empty_string_if_none(value):
Expand Down
56 changes: 56 additions & 0 deletions gff3toembl/tests/EMBLContig_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,9 @@ def test_format(self):
for calculated_line,expected_line in zip(calculated_header.split('\n'), expected_header.split('\n')):
self.assertEqual(calculated_line, expected_line)
self.assertEqual(len(calculated_header), len(expected_header))




def test_format_long_organism_name(self):
header = EMBLHeader()
Expand Down Expand Up @@ -336,6 +339,59 @@ def test_format_long_organism_name(self):
FT /mol_type="genomic DNA"
FT /db_xref="taxon:5678"
FT /note="chromX"
"""

calculated_header = header.format()
for calculated_line,expected_line in zip(calculated_header.split('\n'), expected_header.split('\n')):
self.assertEqual(calculated_line, expected_line)
self.assertEqual(len(calculated_header), len(expected_header))


def test_format_long_publication(self):
header = EMBLHeader()

header.authors="Jagger M., Richards K., Watts C., Wood R., Jones B., Stewart I., Wyman B., Taylor M."
header.classification="UNC"
header.genome_type="circular"
header.organism="organism"
header.project="PRJ1234"
header.publication="The Rolling Stones, 12 X 5, The Rolling Stones No. 2, Out of Our Heads, Aftermath, Between the Buttons, Their Satanic Majesties Request, Beggars Banquet"
header.sequence_identifier="contig123"
header.sequence_length=1234
header.sequence_name="chromX"
header.taxon_id=5678
header.title="Let It Bleed, Sticky Fingers, Exile on Main St., Goats Head Soup, It's Only Rock 'n Roll, Black and Blue, Some Girls, Emotional Rescue, Tattoo You, Undercover"
source_attributes = {"organism": header.organism, "db_xref": "taxon:5678", "note": "chromX"}
header.source_feature = EMBLFeature('source', 1, 1234, '+', source_attributes)

expected_header = """\
ID XXX; XXX; circular; genomic DNA; STD; UNC; 1234 BP.
XX
AC XXX;
XX
AC * _contig123
XX
PR Project:PRJ1234;
XX
DE XXX;
XX
RN [1]
RA Jagger M., Richards K., Watts C., Wood R., Jones B., Stewart I., Wyman B.,
RA Taylor M.;
RT "Let It Bleed, Sticky Fingers, Exile on Main St., Goats Head Soup, It's
RT Only Rock 'n Roll, Black and Blue, Some Girls, Emotional Rescue, Tattoo
RT You, Undercover";
RL The Rolling Stones, 12 X 5, The Rolling Stones No. 2, Out of Our Heads,
RL Aftermath, Between the Buttons, Their Satanic Majesties Request, Beggars
RL Banquet.
XX
FH Key Location/Qualifiers
FH
FT source 1..1234
FT /organism="organism"
FT /mol_type="genomic DNA"
FT /db_xref="taxon:5678"
FT /note="chromX"
"""

calculated_header = header.format()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def read(fname):

setup(
name='gff3toembl',
version='1.0.1',
version='1.0.2',
description='Convert a GFF3 file to EMBL format for submission',
long_description=read('README.md'),
packages = find_packages(),
Expand Down

0 comments on commit f861acd

Please sign in to comment.