diff --git a/Changelog.md b/Changelog.md index e64f258..3a3cb34 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,5 +1,21 @@ # Changelog +## [0.2.2] - 2019/02/27 + +### Added +- Better documentation and examples of API methods + +### Fixed +- Fixed shadowing of built-in `all` function by translateChr() + +## [0.2.1] - 2019/02/25 + +### Added +- Added support for strandedness in tx2genome() + +### Fixed +- Fixed bug in tx2genome() strand handling + ## [0.2.0] - 2019/01/19 ### Fixed diff --git a/README.md b/README.md index 1f3129f..f094c96 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,45 @@ [![Build Status](https://travis-ci.org/tleonardi/bedparse.svg?branch=master)](https://travis-ci.org/tleonardi/bedparse) -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) - +[![Docs Status](https://readthedocs.org/projects/bedparse/badge/?version=master&style=flat)](https://bedparse.readthedocs.io/en/master/) +[![JOSS Status](http://joss.theoj.org/papers/22763a3b37fde13e548e884edd3221fa/status.svg)](http://joss.theoj.org/papers/22763a3b37fde13e548e884edd3221fa) +[![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT) + # Bedparse ![](docs/bedparse.svg) Bedparse is a simple python module and CLI tool to perform common operations on BED files. 
-It offers the following functionality: -* Filtering of transcripts based on annotations -* Joining of annotation files based on transcript names -* Conversion from GTF to BED format -* Conversion from UCSC to Ensembl chromosome names (and viceversa) -* Conversion from bed12 to bed6 -* Promoter reporting -* Intron reporting -* CDS reporting -* UTR reporting +It offers 11 sub-commands that implement the following functionality: +* `filter`: Filtering of transcripts based on annotations +* `join`: Joining of annotation files based on transcript names +* `gtf2bed`: Conversion from GTF to BED format +* `convertChr`: Conversion from UCSC to Ensembl chromosome names (and vice versa) +* `bed12tobed6`: Conversion from bed12 to bed6 +* `promoter`: Promoter reporting +* `introns`: Intron reporting +* `cds`: CDS reporting +* `3pUTR` and `5pUTR`: UTR reporting +* `validateFormat`: Check that the file conforms to the BED format ## Installation +Installing is as simple as: + ``` pip install bedparse ``` +## Basic usage + +The basic syntax is of the form: `bedparse subcommand [parameters]`. + +For a list of all subcommands and a brief explanation of what they do, use: `bedparse --help`. + +For a detailed explanation of each subcommand and a list of its parameters, use the `--help` option after the subcommand's name, e.g.: `bedparse promoter --help` + ## Documentation Our documentation is hosted on [Read the Docs](https://bedparse.readthedocs.io/en/latest/). We also have a short [tutorial](https://bedparse.readthedocs.io/en/latest/Tutorial.html) to guide you through the basic functions. 
- diff --git a/bedparse/bedline.py b/bedparse/bedline.py index 6964683..331bf7e 100644 --- a/bedparse/bedline.py +++ b/bedparse/bedline.py @@ -3,9 +3,14 @@ from bedparse import chrnames class bedline(object): - """An object to represent a BED 12 line""" - fields = ("chr", "start", "end", "name", "score", "strand", "cdsStart", "cdsEnd", "color", "nEx", "exLengths", "exStarts") + """The bedline class defines an object that represents a single BED[3,4,6,12] line + """ + __fields = ("chr", "start", "end", "name", "score", "strand", "cdsStart", "cdsEnd", "color", "nEx", "exLengths", "exStarts") def __init__(self, line=None): + """ + :param line: List where each element corresponds to one field of a BED file + :type line: list + """ if(line is None): return None elif(type(line) is not list): @@ -18,7 +23,7 @@ def __init__(self, line=None): self.bedType=len(line) for n in range(self.bedType): - self.__dict__[self.fields[n]] = line[n] + self.__dict__[self.__fields[n]] = line[n] # If the file format is bed3 set the name to "NoName" if(self.bedType<4): @@ -88,29 +93,48 @@ def __init__(self, line=None): def __str__(self): out=[] - for key in self.fields[:self.bedType]: + for key in self.__fields[:self.bedType]: out.append(self.__dict__[key]) return str(out) def print(self, end='\n'): + """Prints a bedline object + + :param end: Line terminator character + """ out=[] - for key in self.fields[:self.bedType]: + for key in self.__fields[:self.bedType]: out.append(self.__dict__[key]) return print(*out, sep="\t", end=end) def pprint(self): + """Prints a bedline object formatted as a python list + """ import pprint pp = pprint.PrettyPrinter(indent=4) out=[] - for key in self.fields[:self.bedType]: + for key in self.__fields[:self.bedType]: out.append(self.__dict__[key]) return pp.pprint(out) def __eq__(self, other): return self.__dict__ == other.__dict__ - def promoter(self, up=500, down=500, strand=1): - """ Returns a bedline of the promoters""" + def promoter(self, up=500, 
down=500, strand=True): + """ Returns the promoter of a bedline object + + Args: + up (int): Number of upstream bases + down (int): Number of downstream bases + strand (bool): If False, strandedness is ignored + Returns: + bedline: The promoter as a bedline object + Examples: + >>> bl = bedline(['chr1', 1000, 2000, 'Tx1', '0', '+']) + >>> print(bl.promoter()) + ['chr1', 500, 1500, 'Tx1'] + """ + if strand and self.bedType<6: raise BEDexception("You requested stranded promoters, but the BED file appears to be unstranded") if not strand or self.strand=="+": @@ -124,7 +148,18 @@ def promoter(self, up=500, down=500, strand=1): return bedline([self.chr, start, end, self.name]) def utr(self, which=None): - """ Returns the 5p UTR of coding transcripts (i.e. those with a CDS) """ + """ Returns the UTR of coding transcripts (i.e. those with a CDS) + + Args: + which (int): Which UTR to return: 3 for 3' UTR or 5 for 5' UTR + Returns: + bedline: The UTR as a bedline object + Examples: + >>> bl = bedline(["chr1", 100, 500, "Tx1", 0, "+", 200, 300, ".", 1, "400,", "0,"]) + >>> print(bl.utr(which=5)) + ['chr1', 100, 200, 'Tx1', 0, '+', 100, 100, '.', 1, '100,', '0,'] + + """ if(not self.stranded): raise BEDexception("UTRs for an unstranded transcript make little sense: "+self.name) if(which!=5 and which!=3): @@ -221,7 +256,17 @@ def utr(self, which=None): def cds(self, ignoreCDSonly=False): - """Return the CDS of a coding transcript. Transcripts that are only CDS are NOT reported.""" + """Return the CDS of a coding transcript. 
Transcripts without CDS are not reported + + Args: + ignoreCDSonly (bool): If True return None when the entire transcript is CDS + Returns: + bedline: The CDS as a bedline object + Examples: + >>> bl = bedline(["chr1", 100, 500, "Tx1", 0, "+", 200, 300, ".", 1, "400,", "0,"]) + >>> print(bl.cds()) + ['chr1', 200, 300, 'Tx1', 0, '+', 200, 300, '.', 1, '100,', '0,'] + """ if(not self.stranded): raise BEDexception("CDS for an unstranded transcript makes little sense: "+self.name) @@ -278,7 +323,18 @@ def cds(self, ignoreCDSonly=False): return result def introns(self): - """ Returns the introns of a transcript """ + """ Returns a bedline object of the introns of a transcript + + Returns: + bedline: The introns of the transcript as a bedline object + Examples: + >>> bl = bedline(["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"]) + >>> print(bl.introns()) + ['chr1', 120, 400, 'Name', 0, '+', 120, 120, '.', 3, '80,80,80,', '0,100,200,'] + >>> bl = bedline(["chr1", 100, 420, "Name", 0, "-", 210, 310, ".", 1, "320,", "0,"]) + >>> print(bl.introns()) + None + """ if(self.bedType<12 or self.nEx<2): return None exStarts=self.exStarts.split(',') @@ -301,7 +357,20 @@ def introns(self): def tx2genome(self, coord, stranded=False): """ Given a position in transcript coordinates returns the equivalent in genome coordinates. The transcript coordinates are considered without regard to strand, i.e. 0 is the leftmost - position for both + and - strand transcripts.""" + position for both + and - strand transcripts, unless the stranded option is set to True. 
+ + Args: + coord (int): Coordinate to convert from transcript-space to genome-space + stranded (bool): If True use the rightmost base of negative strand transcripts as 0 + Returns: + int: Coordinate in genome-space + Examples: + >>> bl = bedline(['chr1', 1000, 2000, 'Tx1', '0', '-']) + >>> bl.tx2genome(10) + 1010 + >>> bl.tx2genome(10, stranded=True) + 1989 + """ if not isinstance(coord, int): raise BEDexception("coord must be of type integer") @@ -343,7 +412,22 @@ def tx2genome(self, coord, stranded=False): def bed12tobed6(self, appendExN=False, whichExon="all"): - """ Returns a list of bedlines (bed6) corresponding to the exons.""" + """ Returns a list of bedlines (bed6) corresponding to the exons. + + Args: + appendExN (bool): Appends the exon number to the transcript name + whichExon (str): Which exon to return. One of ["all", "first", "last"]. First and last respectively report the first or last exon relative to the TSS (i.e. taking strand into account). + Returns: + list: list of bedline objects, one per exon + Examples: + >>> bl = bedline(["chr1", 100, 420, "Name", 0, "+", 210, 310, ".", 4, "20,20,20,20,", "0,100,200,300,"]) + >>> for i in bl.bed12tobed6(appendExN=True): print(i) + ... 
+ ['chr1', 100, 120, 'Name_Exon001', 0, '+'] + ['chr1', 200, 220, 'Name_Exon002', 0, '+'] + ['chr1', 300, 320, 'Name_Exon003', 0, '+'] + ['chr1', 400, 420, 'Name_Exon004', 0, '+'] + """ if(self.bedType!=12): raise BEDexception("Only BED12 lines can be coverted to BED6") if whichExon not in ("all", "first", "last"): raise BEDexception("whichExon has to be one of [all, first, last]") @@ -371,14 +455,31 @@ def bed12tobed6(self, appendExN=False, whichExon="all"): elif self.strand == "-": return([exons[0]]) - def translateChr(self, assembly, target, suppress=False, all=False, patches=False): - """ Convert the chromosome name to Ensembl or UCSC """ + def translateChr(self, assembly, target, suppress=False, ignore=False, patches=False): + """ Convert the chromosome name to Ensembl or UCSC + + Args: + assembly (str): Assembly of the BED file (either hg38 or mm10). + target (str): Desired chromosome name convention (ucsc or ens). + suppress (bool): When a chromosome name can't be matched between UCSC and Ensembl return None so that the line is not reported in the output (by default throws an error) + ignore (bool): When a chromosome name can't be matched between UCSC and Ensembl set it to 'NA' (by default throws an error) + patches (bool): Allows conversion of all patches up to p11 for hg38 and p4 for mm10. 
Without this option, if the BED file contains contigs added by a patch the conversion terminates with an error (unless the -a or -s flags are present) + Returns: + bedline: A bedline object with the converted chromosome + Examples: + >>> bl = bedline(['chr1', 1000, 2000, 'Tx1', '0', '-']) + >>> print(bl.translateChr(assembly="hg38", target="ens")) + ['1', 1000, 2000, 'Tx1', '0', '-'] + >>> bl = bedline(['chr19_GL000209v2_alt', 1000, 2000, 'Tx1', '0', '-']) + >>> print(bl.translateChr(assembly="hg38", target="ens")) + ['CHR_HSCHR19KIR_RP5_B_HAP_CTG3_1', 1000, 2000, 'Tx1', '0', '-'] + """ if(assembly not in ("hg38", "mm10")): raise BEDexception("The specified assembly is not supported") if(target not in ("ucsc", "ens")): raise BEDexception("The specified target naming convention is not supported") - if(all and suppress): + if(ignore and suppress): raise BEDexception("Only one of allowMissing and suppressMissing is allowed") if(assembly=="hg38" and target=="ucsc"): @@ -400,7 +501,7 @@ def translateChr(self, assembly, target, suppress=False, all=False, patches=Fals if(self.chr in convDict.keys()): self.chr=convDict[self.chr] - elif(all): + elif(ignore): self.chr="NA" elif(suppress): return None diff --git a/bedparse/bedparse.py b/bedparse/bedparse.py index a3c4355..e2ce0c9 100755 --- a/bedparse/bedparse.py +++ b/bedparse/bedparse.py @@ -118,7 +118,7 @@ def join(args): def convertChr(args): with args.bedfile as tsvfile: for line in tsvfile: - translatedLine=bedline(line.split('\t')).translateChr(assembly=args.assembly, target=args.target, suppress=args.suppressMissing, all=args.allowMissing, patches=args.patches) + translatedLine=bedline(line.split('\t')).translateChr(assembly=args.assembly, target=args.target, suppress=args.suppressMissing, ignore=args.allowMissing, patches=args.patches) if(translatedLine): translatedLine.print() tsvfile.close() diff --git a/bedparse/converters.py b/bedparse/converters.py index fc3e8df..8a2e3fa 100644 --- a/bedparse/converters.py +++ 
b/bedparse/converters.py @@ -82,7 +82,7 @@ def gtf2bed(gtf, extra=[''], filterKey="transcript_biotype", filterType=['']): # Convert to bedline for format validation out=list() bed = bedline(transcripts[transcript]) - for key in bed.fields[:bed.bedType]: + for key in bed._bedline__fields[:bed.bedType]: out.append(bed.__dict__[key]) if(extra!=['']): for field in extra: diff --git a/docs/Tutorial.md b/docs/Tutorial.md index 51a94fe..765bb7e 100644 --- a/docs/Tutorial.md +++ b/docs/Tutorial.md @@ -124,3 +124,32 @@ chr1 943907 944575 ENST00000342066.7_Exon014 0 + ``` The optional flag --appendExN adds ExonNNN to the end of each transcript name. + +## APIs + +Bedparse can also be imported as a python module. The API documentation contains detailed information on the bedline class and its methods. The following is a simple example of how to use it: + +``` +In [1]: from bedparse import bedline + +In [2]: l = bedline(['chr1', 1000, 2000, 'Tx1', '0', '+']) + +In [3]: prom = l.promoter() + +In [4]: prom.print() +chr1 500 1500 Tx1 + +In [5]: prom.pprint() +['chr1', 500, 1500, 'Tx1'] + +In [6]: ens_prom = prom.translateChr(assembly="hg38", target="ens") + +In [7]: ens_prom.print() +1 500 1500 Tx1 + +``` + + + + + diff --git a/docs/Usage.md b/docs/Usage.md index 1ff0205..b1afe30 100644 --- a/docs/Usage.md +++ b/docs/Usage.md @@ -30,11 +30,11 @@ optional arguments: --version, -v show program's version number and exit ``` -The basic syntax in the form: `bedparse subcommand [parameters]`. +The basic syntax is of the form: `bedparse sub-command [parameters]`. 
-For a list of all subcommands and a brief explanation of what they do, use: `bedparse --help` +For a list of all sub-commands and a brief explanation of what they do, use: `bedparse --help` -For a detailed explanation of each subcommand and a list of its paramters, use the `--help` options after the subcommand's name, e.g.: `bedparse promoter --help` +For a detailed explanation of each subcommand and a list of its parameters, use the `--help` option after the subcommand's name, e.g.: `bedparse promoter --help` --- diff --git a/docs/bedparse.bedline.rst b/docs/bedparse.bedline.rst new file mode 100644 index 0000000..f253fe8 --- /dev/null +++ b/docs/bedparse.bedline.rst @@ -0,0 +1,7 @@ +bedparse.bedline module +======================= + +.. automodule:: bedparse.bedline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index b999d85..1374170 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,10 +14,10 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - +import os +import sys +sys.path.insert(0,os.path.abspath('..')) +print(sys.path) # -- Project information ----------------------------------------------------- @@ -40,8 +40,8 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [ -] +extensions = ['sphinx.ext.napoleon'] +autoclass_content = 'both' # Add any paths that contain templates here, relative to this directory. 
templates_path = ['.templates'] diff --git a/docs/index.rst b/docs/index.rst index ae4a35a..cffa4cb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -30,6 +30,7 @@ Contents Installation Motivation Usage + APIs reference Tutorial diff --git a/setup.py b/setup.py index 15b662c..41710b4 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ description='A simple library and CLI tool to manipulate BED files', long_description=long_description, long_description_content_type="text/markdown", - version="0.2.1", + version="0.2.2", url='https://github.com/tleonardi/bedparse', author='Tommaso Leonardi', author_email='tom@tleo.io',