Release v0.2.3 (#23)

* Added Zenodo DOI badge
* Fix for comma terminator in fields 11 and 12 (#21)
* Removed Python 3.2 and 3.3 support
* Added gtf2bed field support (#22)
* gtf2bed can now also convert GFF2 files produced by Pinfish

Co-authored-by: Camilla Ugolini <57759922+camillaugolini-iit@users.noreply.github.com>
Co-authored-by: Tommaso Leonardi <tommaso.leonardi@iit.it>
tleonardi · Jan 23, 2020 · b283370 · b283370
1 parent 5640b8a
commit b283370
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 30 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,8 +1,6 @@
 language: python
 
 python:
-          - 3.2
-          - 3.3
           - 3.4
           - 3.5
           - 3.6

diff --git a/Changelog.md b/Changelog.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [0.2.3] - 2020/01/20 
+
+### Added
+- Added support for BED files without comma terminator in fields 11 and 12 (@camillaugolini-iit)
+- Added --transcript_feature_name argument for gtf2bed (@camillaugolini-iit)
+
 ## [0.2.2] - 2019/02/27
 
 ### Added

diff --git a/README.md b/README.md
@@ -3,7 +3,6 @@
 [![JOSS Status](http://joss.theoj.org/papers/22763a3b37fde13e548e884edd3221fa/status.svg)](http://joss.theoj.org/papers/22763a3b37fde13e548e884edd3221fa)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2578820.svg)](https://doi.org/10.5281/zenodo.2578820)
 [![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)
-
 # Bedparse
 
 ![](docs/bedparse.svg)
@@ -44,3 +43,10 @@ Our documentation is hosted on [Read the Docs](https://bedparse.readthedocs.io/e
 
 We also have a short [tutorial](https://bedparse.readthedocs.io/en/latest/Tutorial.html) to guide you through the basic functions.
 
+
+## Publications
+
+If you use bedparse, please cite the following [paper](https://joss.theoj.org/papers/10.21105/joss.01228):
+
+_Leonardi, T. (2019). Bedparse: feature extraction from BED files. Journal of Open Source Software, 4(34), 1228, https://doi.org/10.21105/joss.01228_
+
diff --git a/bedparse/bedline.py b/bedparse/bedline.py
@@ -68,19 +68,21 @@ def __init__(self, line=None):
                 raise BEDexception("CDSstart is greater than CDSend for transcript "+self.name)
             if(self.cdsStart<self.start or self.cdsEnd>self.end):
                 raise BEDexception("The CDS range is bigger than the transcript for transcript "+self.name)
-            if(re.search(',$', self.exLengths) == None or re.search(',$', self.exStarts) == None ):
-                raise BEDexception("Exon lengths or starts do not end with ',' for transcript "+self.name)
-            # Check that number of blocks corresponds to the conent of fields 11 and 12
-            if(len(self.exLengths.split(","))-1!=self.nEx):
+            # Check that number of blocks corresponds to the content of fields 11 and 12
+            if(re.search(',$', self.exLengths) != None):
+                self.exLengths = re.sub(',$', '', self.exLengths)
+            if(re.search(',$', self.exStarts) != None):
+                self.exStarts = re.sub(',$', '', self.exStarts)
+            if(len(self.exLengths.split(","))!=self.nEx):
                 raise BEDexception("Exon lengths and number of exons mismatch for transcript "+self.name)
-            if(len(self.exStarts.split(","))-1!=self.nEx):
+            if(len(self.exStarts.split(","))!=self.nEx):
                 raise BEDexception("Exon starts and number of exons mismatch for transcript "+self.name)
             # Check that every element of exLengths and exStarts can be coerced to int
-            for ex in self.exStarts.split(",")[1:-1]:
+            for ex in self.exStarts.split(","):
                 try: int(ex)
                 except ValueError:
                     raise BEDexception("Exon starts are not int for transcript "+self.name)
-            for ex in self.exLengths.split(",")[1:-1]:
+            for ex in self.exLengths.split(","):
                 try: int(ex)
                 except ValueError:
                     raise BEDexception("Exon lengths are not int for transcript "+self.name)
@@ -104,6 +106,8 @@ def print(self, end='\n'):
         """
         out=[]
         for key in self.__fields[:self.bedType]:
+            if key=="exLengths": self.exLengths+=","
+            if key=="exStarts": self.exStarts+=","
             out.append(self.__dict__[key])
         return print(*out, sep="\t", end=end)
 
@@ -349,7 +353,18 @@ def introns(self):
 
         intronStarts.append("")
         intronLens.append("")
-        result = [self.chr, self.start+int(exLens[0]), self.end-int(exLens[-2]), self.name, self.score, self.strand, self.start+int(exLens[0]), self.start+int(exLens[0]), self.color, len(intronStarts)-1, ','.join(str(x) for x in intronLens), ','.join(str(x) for x in intronStarts)]
+        result = [self.chr, 
+                  self.start+int(exLens[0]), 
+                  self.end-int(exLens[self.nEx-1]), 
+                  self.name, 
+                  self.score, 
+                  self.strand, 
+                  self.start+int(exLens[0]), 
+                  self.start+int(exLens[0]), 
+                  self.color, 
+                  len(intronStarts)-1, 
+                  ','.join(str(x) for x in intronLens), 
+                  ','.join(str(x) for x in intronStarts)]
         return(bedline(result))
 
 
@@ -435,8 +450,8 @@ def bed12tobed6(self, appendExN=False, whichExon="all"):
             raise BEDexception("whichExon is only allowed if the transcripts are stranded. %s is not"%self.name)
 
         exons=list()
-        lengths=[int(x) for x in self.exLengths.split(",")[:-1]]
-        starts=[int(x) for x in self.exStarts.split(",")[:-1]]
+        lengths=[int(x) for x in self.exLengths.split(",")]
+        starts=[int(x) for x in self.exStarts.split(",")]
         for n in range(0,self.nEx):
             name=self.name
             if(appendExN == True): name+="_Exon"+'%03d'%(n+1)

diff --git a/bedparse/bedparse.py b/bedparse/bedparse.py
@@ -149,8 +149,9 @@ def main(args=None):
                    printed to stdout. For efficiency reasons this command doesn't perform BED validation."""
     desc_join="""Adds the content of an annotation file to a BED file as extra columns. The two files are joined by matching the BED Name field (column 4) with
                  a user-specified field of the annotation file."""
-    desc_gtf2bed="""Converts a GTF file to BED12 format. This tool supports the Ensembl GTF format. The GTF file must contain 'transcript' and 'exon' 
-                    features in field 3. If the GTF file also annotates 'CDS' 'start_codon' or 'stop_codon' these are used to annotate the thickStart and thickEnd in the BED file."""
+    desc_gtf2bed="""Converts a GTF file to BED12 format. This tool supports the Ensembl GTF format, which uses features of type 'transcript' (field 3) to define transcripts.
+                    In case the GTF file defines transcripts with a different feature type, it is possible to provide the feature name from the command line.
+                    If the GTF file also annotates 'CDS' 'start_codon' or 'stop_codon' these are used to annotate the thickStart and thickEnd in the BED file."""
     desc_bed12tobed6="Convert the BED12 format into BED6 by reporting a separate line for each block of the original record."
     desc_convertChr="""Convert chromosome names between UCSC and Ensembl formats.
                        The conversion supports the hg38 assembly up to patch 11 and the mm10 assembly up to patch 4. By default patches
@@ -220,7 +221,8 @@ def main(args=None):
     parser_gtf2bed.add_argument("--extraFields",type=str, default='', help="Comma separated list of extra GTF fields to be added after col 12 (e.g. gene_id,gene_name).")
     parser_gtf2bed.add_argument("--filterKey", type=str, default='transcript_biotype', help="GTF extra field on which to apply the filtering")
     parser_gtf2bed.add_argument("--filterType",type=str, default='', help="Comma separated list of filterKey field values to retain.")
-    parser_gtf2bed.set_defaults(func=lambda args: gtf2bed(args.gtf, extra=args.extraFields.split(','), filterKey=args.filterKey, filterType=args.filterType.split(',')))
+    parser_gtf2bed.add_argument("--transcript_feature_name",type=str, default='transcript', help="Transcript feature name. Features with this string in field 3 of the GTF file will be considered transcripts. (default 'transcript')")
+    parser_gtf2bed.set_defaults(func=lambda args: gtf2bed(args.gtf, extra=args.extraFields.split(','), filterKey=args.filterKey, filterType=args.filterType.split(','), transcript_feature_name=args.transcript_feature_name))
 
     parser_bed12tobed6 = subparsers.add_parser('bed12tobed6', 
             help="Converts a BED12 file to BED6 format", description=desc_bed12tobed6)

diff --git a/bedparse/converters.py b/bedparse/converters.py
@@ -3,7 +3,7 @@
 import re
 from bedparse import bedline
 
-def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
+def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=[''], transcript_feature_name= "transcript"):
     gtfRecords={'exon':list(), 'transcript': list(), 'cds':list()}
     transcripts=dict()
     exons=dict()
@@ -12,8 +12,8 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
     gtfReader = csv.reader((row for row in gtf if not row.startswith('#')), delimiter="\t")
     for line in gtfReader:
         # Store all transcript lines
-        if(line[2]=='transcript'):
-            txName=re.sub('.+transcript_id "([^"]+)";.+', "\\1", line[8])
+        if(line[2]== transcript_feature_name):
+            txName=re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
             if(line[6]!="+" and line[6]!="-"):
                 raise BEDexception("Transcript with unrecognized strand: "+txName)
             # Start-1 converts from 1-based to 0-based
@@ -30,7 +30,7 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
                 if(extrainfo[txName][filterKey] == line[8]): extrainfo[txName][field] = "."
         # Parse exon lines
         if(line[2]=='exon'):
-            txName=re.sub('.+transcript_id "([^"]+)";.+', "\\1", line[8])
+            txName=re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
             if(line[6]!=transcripts[txName][5]):
                 raise BEDexception("Exon has different strand from parent transcript: "+txName)
             start=int(line[3])-1
@@ -40,7 +40,7 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
         # Start CDS, start and stop codons
         # Any of these features extends the CDS
         if(line[2] in ['CDS', 'start_codon', 'stop_codon']):
-            txName=re.sub('.+transcript_id "([^"]+)";.+', "\\1", line[8])
+            txName=re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
             if(line[6]!=transcripts[txName][5]):
                 raise BEDexception(("%s has different strand from parent transcript: %s" % line[2], txName))
             start=int(line[3])
@@ -83,6 +83,8 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
         out=list()
         bed = bedline(transcripts[transcript])
         for key in bed._bedline__fields[:bed.bedType]:
+            if key=="exLengths": bed.exLengths+=","
+            if key=="exStarts": bed.exStarts+=","
             out.append(bed.__dict__[key])
         if(extra!=['']):
             for field in extra:

diff --git a/setup.py b/setup.py
@@ -7,23 +7,21 @@
       description='A simple library and CLI tool to manipulate BED files',
       long_description=long_description,
       long_description_content_type="text/markdown",
-      version="0.2.2",
+      version="0.2.3",
       url='https://github.com/tleonardi/bedparse',
       author='Tommaso Leonardi',
       author_email='tom@tleo.io',
       license='MIT',
       classifiers=[
           'Development Status :: 5 - Production/Stable',
           'License :: OSI Approved :: MIT License',
-          'Programming Language :: Python :: 3.2',
-          'Programming Language :: Python :: 3.3',
           'Programming Language :: Python :: 3.4',
           'Programming Language :: Python :: 3.5',
           'Programming Language :: Python :: 3.6'
       ],
       packages=['bedparse'],
       install_requires=['argparse', 'setuptools'],
-      python_requires='>=3',
+      python_requires='>=3.4',
       entry_points={
           'console_scripts': [
               'bedparse = bedparse.bedparse:main'

diff --git a/tests/tests.py b/tests/tests.py
@@ -32,8 +32,8 @@ class KnownValues(unittest.TestCase):
             ["chr1", 1000, 2000, "Name", 0, "+", 1000, 900, ".", 3, "10,10,10,", "0,100,200,"],
             ["chr1", 1000, 2000, "Name", 0, "+", 100, 900, ".", 3, "10,10,10,", "0,100,200,"],
             ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2001, ".", 3, "10,10,10,", "0,100,200,"],
-            ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10", "0,100,200,"],
-            ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10,", "0,100,200"],
+            ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 4, "10,10,10", "0,100,200,"],
+            ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 4, "10,10,10,", "0,100,200"],
             ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10,10,", "0,100,200,"],
             ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,", "0,100,200,"],
             ["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10,", "0,100,"],
@@ -54,14 +54,14 @@ class KnownValues(unittest.TestCase):
             (["chr1", 100, 500, "Name", 0, "+", 200, 300, ".", 1, "400,", "0,"], ["chr1", 100, 200, "Name", 0, "+", 100,100, ".", 1, "100,", "0,"]),
             (["chr1", 100, 500, "Name", 0, "-", 200, 300, ".", 1, "400,", "0,"], ["chr1", 300, 500, "Name", 0, "-", 300,300, ".", 1, "200,", "0,"]),
             (["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 310, 420, "Name", 0, "-", 310,310, ".", 2, "10,20,", "0,90,"]),
-            (["chr1", 100, 420, "Name", 0, "+", 100, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], None),
+            (["chr1", 100, 420, "Name", 0, "+", 100, 310, ".", 4, "20,20,20,20", "0,100,200,300,"], None),
             # This is a case where the 5'UTR end on the last base of an exon
             (["1", 100, 160, "Name", 0, "+", 150,160, ".", 2, "10,10,", "0,50,"], ["1", 100, 110, "Name", 0, "+", 100,100, ".", 1, "10,", "0,"]),
             (["1", 100, 160, "Name!", 0, "+", 109,160, ".", 2, "10,10,", "0,50,"], ["1", 100, 109, "Name!", 0, "+", 100,100, ".", 1, "9,", "0,"])
             )
 
     known_3pUTRs =(
-            (["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 310, 420, "Name", 0, "+", 310,310, ".", 2, "10,20,", "0,90,"]),
+            (["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20", "0,100,200,300"], ["chr1", 310, 420, "Name", 0, "+", 310,310, ".", 2, "10,20,", "0,90,"]),
             (["chr1", 100, 500, "Name", 0, "-", 200, 300, ".", 1, "400,", "0,"], ["chr1", 100, 200, "Name", 0, "-", 100,100, ".", 1, "100,", "0,"]),
             (["chr1", 100, 500, "Name", 0, "+", 200, 300, ".", 1, "400,", "0,"], ["chr1", 300, 500, "Name", 0, "+", 300,300, ".", 1, "200,", "0,"]),
             (["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 100, 210, "Name", 0, "-", 100,100, ".", 2, "20,10,", "0,100,"]),
@@ -72,7 +72,7 @@ class KnownValues(unittest.TestCase):
             )
 
     known_CDSs =(
-            (["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 210, 310, "Name", 0, "+", 210, 310, ".", 2, "10,10,", "0,90,"]),
+            (["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20", "0,100,200,300"], ["chr1", 210, 310, "Name", 0, "+", 210, 310, ".", 2, "10,10,", "0,90,"]),
             (["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 210, 310, "Name", 0, "-", 210, 310, ".", 2, "10,10,", "0,90,"]),
             (["chr1", 100, 500, "Name", 0, "-", 200, 300, ".", 1, "400,", "0,"], ["chr1", 200, 300, "Name", 0, "-", 200,300, ".", 1, "100,", "0,"]),
             (["chr1", 100, 500, "Name", 0, "+", 200, 300, ".", 1, "400,", "0,"], ["chr1", 200, 300, "Name", 0, "+", 200,300, ".", 1, "100,", "0,"]),
-Original file line number
+Diff line change
@@ -1,8 +1,6 @@
     language: python
     python:
-              - 3.2
-              - 3.3
               - 3.4
               - 3.5
               - 3.6
@@ Expand Down @@