Skip to content

Commit

Permalink
Release v0.2.3 (#23)
Browse files Browse the repository at this point in the history
* Added Zenodo DOI badge

* Fix for comma terminator in field 11 and 12 (#21)

* Removed python 3.2 and 3.3 support

* Added Gtf2bed field support (#22)

* gtf2bed can now also convert GFF2 files produced by Pinfish

Co-authored-by: Camilla Ugolini <57759922+camillaugolini-iit@users.noreply.github.com>
Co-authored-by: Tommaso Leonardi <tommaso.leonardi@iit.it>
  • Loading branch information
3 people authored Jan 23, 2020
1 parent 5640b8a commit b283370
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 30 deletions.
2 changes: 0 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
language: python

python:
- 3.2
- 3.3
- 3.4
- 3.5
- 3.6
Expand Down
6 changes: 6 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## [0.2.3] - 2020/01/20

### Added
- Added support for BED files without comma terminator in fields 11 and 12 (@camillaugolini-iit)
- Added --transcript_feature_name argument for gtf2bed (@camillaugolini-iit)

## [0.2.2] - 2019/02/27

### Added
Expand Down
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
[![JOSS Status](http://joss.theoj.org/papers/22763a3b37fde13e548e884edd3221fa/status.svg)](http://joss.theoj.org/papers/22763a3b37fde13e548e884edd3221fa)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2578820.svg)](https://doi.org/10.5281/zenodo.2578820)
[![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)

# Bedparse

![](docs/bedparse.svg)
Expand Down Expand Up @@ -44,3 +43,10 @@ Our documentation is hosted on [Read the Docs](https://bedparse.readthedocs.io/e

We also have a short [tutorial](https://bedparse.readthedocs.io/en/latest/Tutorial.html) to guide you through the basic functions.


## Publications

If you use bedparse, please cite the following [paper](https://joss.theoj.org/papers/10.21105/joss.01228):

_Leonardi, T. (2019). Bedparse: feature extraction from BED files. Journal of Open Source Software, 4(34), 1228, https://doi.org/10.21105/joss.01228_

35 changes: 25 additions & 10 deletions bedparse/bedline.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,21 @@ def __init__(self, line=None):
raise BEDexception("CDSstart is greater than CDSend for transcript "+self.name)
if(self.cdsStart<self.start or self.cdsEnd>self.end):
raise BEDexception("The CDS range is bigger than the transcript for transcript "+self.name)
if(re.search(',$', self.exLengths) == None or re.search(',$', self.exStarts) == None ):
raise BEDexception("Exon lengths or starts do not end with ',' for transcript "+self.name)
# Check that number of blocks corresponds to the conent of fields 11 and 12
if(len(self.exLengths.split(","))-1!=self.nEx):
# Check that number of blocks corresponds to the content of fields 11 and 12
if(re.search(',$', self.exLengths) != None):
self.exLengths = re.sub(',$', '', self.exLengths)
if(re.search(',$', self.exStarts) != None):
self.exStarts = re.sub(',$', '', self.exStarts)
if(len(self.exLengths.split(","))!=self.nEx):
raise BEDexception("Exon lengths and number of exons mismatch for transcript "+self.name)
if(len(self.exStarts.split(","))-1!=self.nEx):
if(len(self.exStarts.split(","))!=self.nEx):
raise BEDexception("Exon starts and number of exons mismatch for transcript "+self.name)
# Check that every element of exLengths and exStarts can be coerced to int
for ex in self.exStarts.split(",")[1:-1]:
for ex in self.exStarts.split(","):
try: int(ex)
except ValueError:
raise BEDexception("Exon starts are not int for transcript "+self.name)
for ex in self.exLengths.split(",")[1:-1]:
for ex in self.exLengths.split(","):
try: int(ex)
except ValueError:
raise BEDexception("Exon lengths are not int for transcript "+self.name)
Expand All @@ -104,6 +106,8 @@ def print(self, end='\n'):
"""
out=[]
for key in self.__fields[:self.bedType]:
if key=="exLengths": self.exLengths+=","
if key=="exStarts": self.exStarts+=","
out.append(self.__dict__[key])
return print(*out, sep="\t", end=end)

Expand Down Expand Up @@ -349,7 +353,18 @@ def introns(self):

intronStarts.append("")
intronLens.append("")
result = [self.chr, self.start+int(exLens[0]), self.end-int(exLens[-2]), self.name, self.score, self.strand, self.start+int(exLens[0]), self.start+int(exLens[0]), self.color, len(intronStarts)-1, ','.join(str(x) for x in intronLens), ','.join(str(x) for x in intronStarts)]
result = [self.chr,
self.start+int(exLens[0]),
self.end-int(exLens[self.nEx-1]),
self.name,
self.score,
self.strand,
self.start+int(exLens[0]),
self.start+int(exLens[0]),
self.color,
len(intronStarts)-1,
','.join(str(x) for x in intronLens),
','.join(str(x) for x in intronStarts)]
return(bedline(result))


Expand Down Expand Up @@ -435,8 +450,8 @@ def bed12tobed6(self, appendExN=False, whichExon="all"):
raise BEDexception("whichExon is only allowed if the transcripts are stranded. %s is not"%self.name)

exons=list()
lengths=[int(x) for x in self.exLengths.split(",")[:-1]]
starts=[int(x) for x in self.exStarts.split(",")[:-1]]
lengths=[int(x) for x in self.exLengths.split(",")]
starts=[int(x) for x in self.exStarts.split(",")]
for n in range(0,self.nEx):
name=self.name
if(appendExN == True): name+="_Exon"+'%03d'%(n+1)
Expand Down
8 changes: 5 additions & 3 deletions bedparse/bedparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,9 @@ def main(args=None):
printed to stdout. For efficiency reasons this command doesn't perform BED validation."""
desc_join="""Adds the content of an annotation file to a BED file as extra columns. The two files are joined by matching the BED Name field (column 4) with
a user-specified field of the annotation file."""
desc_gtf2bed="""Converts a GTF file to BED12 format. This tool supports the Ensembl GTF format. The GTF file must contain 'transcript' and 'exon'
features in field 3. If the GTF file also annotates 'CDS' 'start_codon' or 'stop_codon' these are used to annotate the thickStart and thickEnd in the BED file."""
desc_gtf2bed="""Converts a GTF file to BED12 format. This tool supports the Ensembl GTF format, which uses features of type 'transcript' (field 3) to define transcripts.
In case the GTF file defines transcripts with a different feature type, it is possible to provide the feature name from the command line.
If the GTF file also annotates 'CDS' 'start_codon' or 'stop_codon' these are used to annotate the thickStart and thickEnd in the BED file."""
desc_bed12tobed6="Convert the BED12 format into BED6 by reporting a separate line for each block of the original record."
desc_convertChr="""Convert chromosome names between UCSC and Ensembl formats.
The conversion supports the hg38 assembly up to patch 11 and the mm10 assembly up to patch 4. By default patches
Expand Down Expand Up @@ -220,7 +221,8 @@ def main(args=None):
parser_gtf2bed.add_argument("--extraFields",type=str, default='', help="Comma separated list of extra GTF fields to be added after col 12 (e.g. gene_id,gene_name).")
parser_gtf2bed.add_argument("--filterKey", type=str, default='transcript_biotype', help="GTF extra field on which to apply the filtering")
parser_gtf2bed.add_argument("--filterType",type=str, default='', help="Comma separated list of filterKey field values to retain.")
parser_gtf2bed.set_defaults(func=lambda args: gtf2bed(args.gtf, extra=args.extraFields.split(','), filterKey=args.filterKey, filterType=args.filterType.split(',')))
parser_gtf2bed.add_argument("--transcript_feature_name",type=str, default='transcript', help="Transcript feature name. Features with this string in field 3 of the GTF file will be considered transcripts. (default 'transcript')")
parser_gtf2bed.set_defaults(func=lambda args: gtf2bed(args.gtf, extra=args.extraFields.split(','), filterKey=args.filterKey, filterType=args.filterType.split(','), transcript_feature_name=args.transcript_feature_name))

parser_bed12tobed6 = subparsers.add_parser('bed12tobed6',
help="Converts a BED12 file to BED6 format", description=desc_bed12tobed6)
Expand Down
12 changes: 7 additions & 5 deletions bedparse/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from bedparse import bedline

def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=[''], transcript_feature_name= "transcript"):
gtfRecords={'exon':list(), 'transcript': list(), 'cds':list()}
transcripts=dict()
exons=dict()
Expand All @@ -12,8 +12,8 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
gtfReader = csv.reader((row for row in gtf if not row.startswith('#')), delimiter="\t")
for line in gtfReader:
# Store all transcript lines
if(line[2]=='transcript'):
txName=re.sub('.+transcript_id "([^"]+)";.+', "\\1", line[8])
if(line[2]== transcript_feature_name):
txName=re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
if(line[6]!="+" and line[6]!="-"):
raise BEDexception("Transcript with unrecognized strand: "+txName)
# Start-1 converts from 1-based to 0-based
Expand All @@ -30,7 +30,7 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
if(extrainfo[txName][filterKey] == line[8]): extrainfo[txName][field] = "."
# Parse exon lines
if(line[2]=='exon'):
txName=re.sub('.+transcript_id "([^"]+)";.+', "\\1", line[8])
txName=re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
if(line[6]!=transcripts[txName][5]):
raise BEDexception("Exon has different strand from parent transcript: "+txName)
start=int(line[3])-1
Expand All @@ -40,7 +40,7 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
# Start CDS, start and stop codons
# Any of these features extends the CDS
if(line[2] in ['CDS', 'start_codon', 'stop_codon']):
txName=re.sub('.+transcript_id "([^"]+)";.+', "\\1", line[8])
txName=re.sub('.*transcript_id "([^"]+)";.*', "\\1", line[8])
if(line[6]!=transcripts[txName][5]):
raise BEDexception(("%s has different strand from parent transcript: %s" % line[2], txName))
start=int(line[3])
Expand Down Expand Up @@ -83,6 +83,8 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']):
out=list()
bed = bedline(transcripts[transcript])
for key in bed._bedline__fields[:bed.bedType]:
if key=="exLengths": bed.exLengths+=","
if key=="exStarts": bed.exStarts+=","
out.append(bed.__dict__[key])
if(extra!=['']):
for field in extra:
Expand Down
6 changes: 2 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,21 @@
description='A simple library and CLI tool to manipulate BED files',
long_description=long_description,
long_description_content_type="text/markdown",
version="0.2.2",
version="0.2.3",
url='https://github.com/tleonardi/bedparse',
author='Tommaso Leonardi',
author_email='tom@tleo.io',
license='MIT',
classifiers=[
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6'
],
packages=['bedparse'],
install_requires=['argparse', 'setuptools'],
python_requires='>=3',
python_requires='>=3.4',
entry_points={
'console_scripts': [
'bedparse = bedparse.bedparse:main'
Expand Down
10 changes: 5 additions & 5 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ class KnownValues(unittest.TestCase):
["chr1", 1000, 2000, "Name", 0, "+", 1000, 900, ".", 3, "10,10,10,", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 100, 900, ".", 3, "10,10,10,", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2001, ".", 3, "10,10,10,", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10,", "0,100,200"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 4, "10,10,10", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 4, "10,10,10,", "0,100,200"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10,10,", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,", "0,100,200,"],
["chr1", 1000, 2000, "Name", 0, "+", 1500, 2000, ".", 3, "10,10,10,", "0,100,"],
Expand All @@ -54,14 +54,14 @@ class KnownValues(unittest.TestCase):
(["chr1", 100, 500, "Name", 0, "+", 200, 300, ".", 1, "400,", "0,"], ["chr1", 100, 200, "Name", 0, "+", 100,100, ".", 1, "100,", "0,"]),
(["chr1", 100, 500, "Name", 0, "-", 200, 300, ".", 1, "400,", "0,"], ["chr1", 300, 500, "Name", 0, "-", 300,300, ".", 1, "200,", "0,"]),
(["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 310, 420, "Name", 0, "-", 310,310, ".", 2, "10,20,", "0,90,"]),
(["chr1", 100, 420, "Name", 0, "+", 100, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], None),
(["chr1", 100, 420, "Name", 0, "+", 100, 310, ".", 4, "20,20,20,20", "0,100,200,300,"], None),
# This is a case where the 5'UTR ends on the last base of an exon
(["1", 100, 160, "Name", 0, "+", 150,160, ".", 2, "10,10,", "0,50,"], ["1", 100, 110, "Name", 0, "+", 100,100, ".", 1, "10,", "0,"]),
(["1", 100, 160, "Name!", 0, "+", 109,160, ".", 2, "10,10,", "0,50,"], ["1", 100, 109, "Name!", 0, "+", 100,100, ".", 1, "9,", "0,"])
)

known_3pUTRs =(
(["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 310, 420, "Name", 0, "+", 310,310, ".", 2, "10,20,", "0,90,"]),
(["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20", "0,100,200,300"], ["chr1", 310, 420, "Name", 0, "+", 310,310, ".", 2, "10,20,", "0,90,"]),
(["chr1", 100, 500, "Name", 0, "-", 200, 300, ".", 1, "400,", "0,"], ["chr1", 100, 200, "Name", 0, "-", 100,100, ".", 1, "100,", "0,"]),
(["chr1", 100, 500, "Name", 0, "+", 200, 300, ".", 1, "400,", "0,"], ["chr1", 300, 500, "Name", 0, "+", 300,300, ".", 1, "200,", "0,"]),
(["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 100, 210, "Name", 0, "-", 100,100, ".", 2, "20,10,", "0,100,"]),
Expand All @@ -72,7 +72,7 @@ class KnownValues(unittest.TestCase):
)

known_CDSs =(
(["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 210, 310, "Name", 0, "+", 210, 310, ".", 2, "10,10,", "0,90,"]),
(["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20", "0,100,200,300"], ["chr1", 210, 310, "Name", 0, "+", 210, 310, ".", 2, "10,10,", "0,90,"]),
(["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"], ["chr1", 210, 310, "Name", 0, "-", 210, 310, ".", 2, "10,10,", "0,90,"]),
(["chr1", 100, 500, "Name", 0, "-", 200, 300, ".", 1, "400,", "0,"], ["chr1", 200, 300, "Name", 0, "-", 200,300, ".", 1, "100,", "0,"]),
(["chr1", 100, 500, "Name", 0, "+", 200, 300, ".", 1, "400,", "0,"], ["chr1", 200, 300, "Name", 0, "+", 200,300, ".", 1, "100,", "0,"]),
Expand Down

0 comments on commit b283370

Please sign in to comment.