From e6788af91f63e33ae438f1558b56646169cf4b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 16:50:48 -0400 Subject: [PATCH 1/8] Restructure tests --- tests/Makefile | 37 +++++------------------------------ tests/profiling/Makefile | 15 ++++++++++++++ tests/tests/.gitignore | 2 ++ tests/tests/Makefile | 39 +++++++++++++++++++++++++++++++++++++ tests/{ => tests}/test.bam | Bin tests/{ => tests}/test.sam | 0 6 files changed, 61 insertions(+), 32 deletions(-) create mode 100644 tests/profiling/Makefile create mode 100644 tests/tests/.gitignore create mode 100644 tests/tests/Makefile rename tests/{ => tests}/test.bam (100%) rename tests/{ => tests}/test.sam (100%) diff --git a/tests/Makefile b/tests/Makefile index 769be43..d5f1545 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,39 +1,12 @@ -.PHONY: all clean basic columns id +.PHONY: all SHELL=/usr/bin/env bash -euc -o pipefail .SECONDARY: -SIFT=../samsift/samsift.py -NORM=../samsift/samsift_norm_sam.py -H -SV=samtools view +all: + $(MAKE) -C tests -all: basic columns id +%: + $(MAKE) -C tests $@ -basic: - $(SIFT) -i test.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' - $(SIFT) -i test.bam -o test_out.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' - -id: - $(SIFT) -i test.bam | grep -v samsift > _test.sam - diff <($(NORM) -i test.sam) <($(NORM) -i _test.sam) - -columns: test.debug_samtools.sam test.debug_samsift.sam - diff $^ - -%.debug_samtools.sam: %.bam - $(SV) $< \ - | cut -f -11 \ - > $@ - -%.debug_samsift.sam: %.bam - ($(SIFT) -i $< -d '"\t".join(map(str,[QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL]))' \ - > /dev/null) 2>&1 \ - | grep -v samsift \ - | cut -f 3- \ - | cut -f -11 \ - > $@ - - -clean: - rm -f *.sam test_out.bam diff --git a/tests/profiling/Makefile b/tests/profiling/Makefile new file mode 100644 index 0000000..22c311f --- /dev/null +++ b/tests/profiling/Makefile @@ -0,0 +1,15 @@ +.PHONY: all clean + +SF=../../samsift/samsift.py + +SHELL=/usr/bin/env bash -euc -o pipefail + +.SECONDARY: + +all: + $(SF) -h + + +clean: + + diff --git a/tests/tests/.gitignore b/tests/tests/.gitignore new file mode 100644 index 0000000..1acc04f --- /dev/null +++ b/tests/tests/.gitignore @@ -0,0 +1,2 @@ +!test.bam +!test.sam diff --git a/tests/tests/Makefile b/tests/tests/Makefile new file mode 100644 index 0000000..537ed75 --- /dev/null +++ b/tests/tests/Makefile @@ -0,0 +1,39 @@ +.PHONY: all clean basic columns id + +SHELL=/usr/bin/env bash -euc -o pipefail + +.SECONDARY: + +SIFT=../../samsift/samsift.py +NORM=../../samsift/samsift_norm_sam.py -H +SV=samtools view + +all: basic columns id + +basic: + $(SIFT) -i test.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' + $(SIFT) -i test.bam -o _test_out.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' + +id: + $(SIFT) -i test.bam | grep -v samsift > _test.sam + diff <($(NORM) -i test.sam) <($(NORM) -i _test.sam) + +columns: _test.debug_samtools.sam _test.debug_samsift.sam + diff $^ + +_%.debug_samtools.sam: %.bam + $(SV) $< \ + | cut -f -11 \ + > $@ + +_%.debug_samsift.sam: %.bam + ($(SIFT) -i $< -d '"\t".join(map(str,[QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL]))' \ + > /dev/null) 2>&1 \ + | grep -v samsift \ + | cut -f 3- \ + | cut -f -11 \ + > $@ + + +clean: + rm -f _* diff --git a/tests/test.bam b/tests/tests/test.bam similarity index 100% rename from tests/test.bam rename to tests/tests/test.bam diff --git a/tests/test.sam b/tests/tests/test.sam similarity index 100% rename from tests/test.sam rename to tests/tests/test.sam From f12e860cfaf41b47f77b3937e4aac547efb3f148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 17:14:58 -0400 Subject: [PATCH 2/8] Add a profiling Makefile --- tests/profiling/.gitignore | 1 + tests/profiling/Makefile | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) create mode 100644 tests/profiling/.gitignore diff --git a/tests/profiling/.gitignore b/tests/profiling/.gitignore new file mode 100644 index 0000000..c627000 --- /dev/null +++ b/tests/profiling/.gitignore @@ -0,0 +1 @@ +*.prof diff --git a/tests/profiling/Makefile b/tests/profiling/Makefile index 22c311f..ec8dbf9 100644 --- a/tests/profiling/Makefile +++ b/tests/profiling/Makefile @@ -1,15 +1,24 @@ -.PHONY: all clean +.PHONY: all clean filt SF=../../samsift/samsift.py SHELL=/usr/bin/env bash -euc -o pipefail +BAM=test.bam +SMALLBAM=test.small.bam .SECONDARY: -all: - $(SF) -h +all: filt +filt: $(BAM) + /usr/bin/env python3 -m cProfile -o _filt.prof $(SF) -i "$<" -f 'NM>0' > /dev/null -clean: +$(BAM): + wget -O $(BAM) ftp://ftp.sra.ebi.ac.uk/vol1/ERZ004/ERZ004000/accepted_hits.bam + +$(SMALLBAM): $(BAM) + samtools view -h $< | head -n 10000 | samtools view -b > $@ +clean: + rm -f _* From 78bd7ae218f29eda69717687a6acb8b96ae39e5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 17:43:59 -0400 Subject: [PATCH 3/8] Refactoring --- samsift/samsift.py | 260 ++++++++++++++++++++++++++------------------- 1 file changed, 152 insertions(+), 108 deletions(-) diff --git a/samsift/samsift.py b/samsift/samsift.py index 479d1cf..ff34b1b 100755 --- a/samsift/samsift.py +++ b/samsift/samsift.py @@ -30,11 +30,153 @@ def info(msg): print("[samsift]", fdt, msg, file=sys.stderr) +class SamSift: + + def __init__(self, sam_header, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): + self.sam_header=sam_header + self.output_sam_fn + self.filter=filter + self.code=code + self.dexpr=dexpr + self.dtrig=dtrig + self.mode=mode + self.initialization=initialization + + self.nt=0 + self.np=0 + self.nf=0 + self.ne=0 + + info("Starting.") + self.init_vardict={} + exec(BASIC_INIT + initialization, self.init_vardict) + exec(self.initialization, self.init_vardict) + + def _init_alignment(self, alignment): + self.alignment=alignment + + self.err=False + self.nt+=1 + + #print(alignment.qual) + self.vardict=self.init_vardict + self.vardict.update({ + 'a': alignment, + 'QNAME': alignment.query_name, + 'FLAG': alignment.flag, + # RNAME will be set later, + 'POS': alignment.reference_start+1, + 'MAPQ': alignment.mapping_quality, + 'CIGAR': alignment.cigarstring, + # RNEXT will be set later, + 'PNEXT': alignment.next_reference_start+1, + 'TLEN': alignment.template_length, + 'SEQ': alignment.query_sequence, + + # integer id's + 'RNAMEi': alignment.reference_id, + 'RNEXTi': alignment.next_reference_id, + }) + + # the specific implementation depends on the specific version of PySam, we want the same behaviour + if isinstance(alignment.qual, str): + self.vardict['QUAL']=alignment.qual + self.vardict['QUALa']=[ord(x) for x in alignment.qual] + self.vardict['QUALs']=alignment.qqual + self.vardict['QUALsa']=[ord(x) for x in alignment.qqual] + else: + self.vardict['QUAL']=pysam.qualities_to_qualitystring(alignment.qual, offset=0) + self.vardict['QUALa']=alignment.qual + self.vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) + self.vardict['QUALsa']=alignment.qqual + + if vardict['RNAMEi']==-1: + self.vardict['RNAME']='*' + else: + self.vardict['RNAME']=self.in_sam.get_reference_name(vardict['RNAMEi']) + + if vardict['RNEXTi']==-1: + self.vardict['RNEXT']='*' + else: + self.vardict['RNEXT']=self.in_sam.get_reference_name(vardict['RNEXTi']) + + # clean cache + keys_to_delete=[] + for k in self.vardict: + if len(k)==2: + keys_to_delete.append(k) + for k in keys_to_delete: + del self.vardict[k] + + self.vardict.update(alignment.get_tags()) + + def _filter(self): + try: + passes=eval(filter, vardict) + except: + self.err=True + if mode=="strict": + raise + elif mode=="nonstop-keep": + self.passes=True + self.ne+=1 + elif mode=="nonstop-remove": + sefl.passes=False + self.ne+=1 + else: + raise NotImplementedError + + def _debug(self): + if self.dexpr != "": + trig=eval(str(dtrig), vardict) + if trig: + try: + dbg_res=eval(dexpr, vardict) + except: + # todo: add a better message + dbg_res="evaluation_failed" + print(alignment.query_name, bool(passes), dbg_res, file=sys.stderr, sep="\t") + + def _code(self): + if self.code is not None: + try: + exec(self.code, self.vardict) + except: + if self.mode=="strict": + raise + else: + if self.err==False: + self.ne+=1 + self.err=True + info("Alignment '{}' - code error ('{}')".format(self.alignment.query_name, sys.exc_info()[0])) + + for k, v in vardict.items(): + if len(k)==2: + if isinstance(v, float): + # workaround (see "https://github.com/pysam-developers/pysam/issues/531") + self.alignment.set_tag(k, v, value_type="f") + else: + self.alignment.set_tag(k, v) + + + def process_alignment(self, alignment): + self._init_alignment(alignment) + + + (err, passes)=self._filter() + self._debug(alignment) + + if passes: + self._code() + out_sam.write(alignment) + if not self.err: + self.np+=1 + else: + if not self.err: + self.nf+=1 + + def sam_sift(in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): - info("Starting.") - init_vardict={} - exec(BASIC_INIT + initialization, init_vardict) - exec(initialization, init_vardict) if in_sam_fn=='-': info("Reading from standard input. " + ("Press Ctrl+D to finish reading or run '{} -h' for help.".format(PROGRAM) if in_sam_fn=="-" else"")) @@ -60,115 +202,14 @@ def sam_sift(in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initializa out_mode="w" with pysam.AlignmentFile(out_sam_fn, out_mode, header=header) as out_sam: - - nt,np,nf,ne=0,0,0,0 for alignment in in_sam.fetch(until_eof=True): - err=False - nt+=1 - #print(alignment.qual) - vardict=init_vardict - vardict.update({ - 'a': alignment, - 'QNAME': alignment.query_name, - 'FLAG': alignment.flag, - # RNAME will be set later, - 'POS': alignment.reference_start+1, - 'MAPQ': alignment.mapping_quality, - 'CIGAR': alignment.cigarstring, - # RNEXT will be set later, - 'PNEXT': alignment.next_reference_start+1, - 'TLEN': alignment.template_length, - 'SEQ': alignment.query_sequence, - - # integer id's - 'RNAMEi': alignment.reference_id, - 'RNEXTi': alignment.next_reference_id, - }) - - # the exact implementation depends on specific version of PySam, we want the same behaviour - if isinstance(alignment.qual, str): - vardict['QUAL']=alignment.qual - vardict['QUALa']=[ord(x) for x in alignment.qual] - vardict['QUALs']=alignment.qqual - vardict['QUALsa']=[ord(x) for x in alignment.qqual] - else: - vardict['QUAL']=pysam.qualities_to_qualitystring(alignment.qual, offset=0) - vardict['QUALa']=alignment.qual - vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) - vardict['QUALsa']=alignment.qqual + self.process_alignment(alignment) - if vardict['RNAMEi']==-1: - vardict['RNAME']='*' - else: - vardict['RNAME']=in_sam.get_reference_name(vardict['RNAMEi']) + info("Finishing. {} alignments processed. {} alignments passed. {} alignments filtered out. {} alignments caused errors.".format(self.nt, self.np, self.nf, self.ne)) - if vardict['RNEXTi']==-1: - vardict['RNEXT']='*' - else: - vardict['RNEXT']=in_sam.get_reference_name(vardict['RNEXTi']) - # clean cache - keys_to_delete=[] - for k in vardict: - if len(k)==2: - keys_to_delete.append(k) - for k in keys_to_delete: - del vardict[k] - vardict.update(alignment.get_tags()) - try: - passes=eval(filter, vardict) - except: - err=True - if mode=="strict": - raise - elif mode=="nonstop-keep": - passes=True - ne+=1 - elif mode=="nonstop-remove": - passes=False - ne+=1 - else: - raise NotImplementedError - if dexpr != "": - trig=eval(str(dtrig), vardict) - if trig: - try: - dbg_res=eval(dexpr, vardict) - except: - # todo: add a better message - dbg_res="evaluation_failed" - print(alignment.query_name, bool(passes), dbg_res, file=sys.stderr, sep="\t") - if passes: - if code is not None: - try: - exec(code, vardict) - except: - if mode=="strict": - raise - else: - if err==False: - ne+=1 - err=True - info("Alignment '{}' - code error ('{}')".format(alignment.query_name, sys.exc_info()[0])) - - - for k, v in vardict.items(): - if len(k)==2: - if isinstance(v, float): - # workaround (see "https://github.com/pysam-developers/pysam/issues/531") - alignment.set_tag(k, v, value_type="f") - else: - alignment.set_tag(k, v) - out_sam.write(alignment) - if not err: - np+=1 - else: - if not err: - nf+=1 - info("Finishing. {} alignments processed. {} alignments passed. {} alignments filtered out. {} alignments caused errors.".format(nt, np, nf, ne)) - -def main(): +def parse_args(): class CustomArgumentParser (argparse.ArgumentParser): def print_help(self): @@ -285,6 +326,9 @@ def format_help(self): ) args = parser.parse_args() + +def main (): + args=parse_args() sam_sift( in_sam_fn=args.in_sam_fn, out_sam_fn=args.out_sam_fn, From e4890f1c971385d7a9deafd2aa2d2bd4a565eb91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 18:50:25 -0400 Subject: [PATCH 4/8] Finish refactoring --- samsift/samsift.py | 117 +++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/samsift/samsift.py b/samsift/samsift.py index ff34b1b..f462377 100755 --- a/samsift/samsift.py +++ b/samsift/samsift.py @@ -32,26 +32,37 @@ def info(msg): class SamSift: - def __init__(self, sam_header, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): - self.sam_header=sam_header - self.output_sam_fn + def __init__(self, in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): + self.in_sam_fn=in_sam_fn + self.out_sam_fn=out_sam_fn + self.filter=filter self.code=code self.dexpr=dexpr self.dtrig=dtrig - self.mode=mode self.initialization=initialization + self.mode=mode + + self.nt=0 self.np=0 self.nf=0 self.ne=0 - info("Starting.") + if in_sam_fn=='-': + info("Reading from standard input. " + ("Press Ctrl+D to finish reading or run '{} -h' for help.".format(PROGRAM) if in_sam_fn=="-" else"")) + + if self.out_sam_fn[-4:]==".bam": + self.pysam_out_mode="wb" + else: + self.pysam_out_mode="w" + self.init_vardict={} exec(BASIC_INIT + initialization, self.init_vardict) exec(self.initialization, self.init_vardict) + def _init_alignment(self, alignment): self.alignment=alignment @@ -90,15 +101,15 @@ def _init_alignment(self, alignment): self.vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) self.vardict['QUALsa']=alignment.qqual - if vardict['RNAMEi']==-1: + if self.vardict['RNAMEi']==-1: self.vardict['RNAME']='*' else: - self.vardict['RNAME']=self.in_sam.get_reference_name(vardict['RNAMEi']) + self.vardict['RNAME']=self.in_sam.get_reference_name(self.vardict['RNAMEi']) - if vardict['RNEXTi']==-1: + if self.vardict['RNEXTi']==-1: self.vardict['RNEXT']='*' else: - self.vardict['RNEXT']=self.in_sam.get_reference_name(vardict['RNEXTi']) + self.vardict['RNEXT']=self.in_sam.get_reference_name(self.vardict['RNEXTi']) # clean cache keys_to_delete=[] @@ -110,32 +121,35 @@ def _init_alignment(self, alignment): self.vardict.update(alignment.get_tags()) + def _filter(self): try: - passes=eval(filter, vardict) + self.passes=eval(self.filter, self.vardict) except: self.err=True - if mode=="strict": + if self.mode=="strict": raise - elif mode=="nonstop-keep": + elif self.mode=="nonstop-keep": self.passes=True self.ne+=1 - elif mode=="nonstop-remove": - sefl.passes=False + elif self.mode=="nonstop-remove": + self.passes=False self.ne+=1 else: raise NotImplementedError + def _debug(self): if self.dexpr != "": - trig=eval(str(dtrig), vardict) + trig=eval(str(self.dtrig), self.vardict) if trig: try: - dbg_res=eval(dexpr, vardict) + dbg_res=eval(self.dexpr, self.vardict) except: # todo: add a better message dbg_res="evaluation_failed" - print(alignment.query_name, bool(passes), dbg_res, file=sys.stderr, sep="\t") + print(self.alignment.query_name, bool(self.passes), dbg_res, file=sys.stderr, sep="\t") + def _code(self): if self.code is not None: @@ -150,7 +164,7 @@ def _code(self): self.err=True info("Alignment '{}' - code error ('{}')".format(self.alignment.query_name, sys.exc_info()[0])) - for k, v in vardict.items(): + for k, v in self.vardict.items(): if len(k)==2: if isinstance(v, float): # workaround (see "https://github.com/pysam-developers/pysam/issues/531") @@ -159,16 +173,20 @@ def _code(self): self.alignment.set_tag(k, v) + def _print_alignment(self): + self.out_sam.write(self.alignment) + + + def process_alignment(self, alignment): self._init_alignment(alignment) - - (err, passes)=self._filter() - self._debug(alignment) + self._filter() + self._debug() - if passes: + if self.passes: self._code() - out_sam.write(alignment) + self._print_alignment() if not self.err: self.np+=1 else: @@ -176,37 +194,30 @@ def process_alignment(self, alignment): self.nf+=1 -def sam_sift(in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): - if in_sam_fn=='-': - info("Reading from standard input. " + ("Press Ctrl+D to finish reading or run '{} -h' for help.".format(PROGRAM) if in_sam_fn=="-" else"")) - - with pysam.AlignmentFile(in_sam_fn, "rb") as in_sam: #check_sq=False) - #print("@PG", "ID:{}".format(PROGRAM), "PN:{}".format(PROGRAM), "VN:{}".format(VERSION), "CL:{}".format(" ".join(sys.argv)), sep="\t") - header=in_sam.header + def run(self): + info("SAMsift is starting.") + with pysam.AlignmentFile(self.in_sam_fn, "rb") as in_sam: #check_sq=False) + self.in_sam=in_sam + #print("@PG", "ID:{}".format(PROGRAM), "PN:{}".format(PROGRAM), "VN:{}".format(VERSION), "CL:{}".format(" ".join(sys.argv)), sep="\t") + header=in_sam.header - pg={ - "ID":PROGRAM, - "PN":PROGRAM, - "VN":VERSION, - "CL":" ".join(map(lambda x:"'{}'".format(x),sys.argv)), - } + pg={ + "ID":PROGRAM, + "PN":PROGRAM, + "VN":VERSION, + "CL":" ".join(map(lambda x:"'{}'".format(x),sys.argv)), + } - try: - header['PG'].insert(0,pg) - except KeyError: - header['PG']=[pg] - - if out_sam_fn[-4:]==".bam": - out_mode="wb" - else: - out_mode="w" - - with pysam.AlignmentFile(out_sam_fn, out_mode, header=header) as out_sam: - for alignment in in_sam.fetch(until_eof=True): - self.process_alignment(alignment) - - info("Finishing. {} alignments processed. {} alignments passed. {} alignments filtered out. {} alignments caused errors.".format(self.nt, self.np, self.nf, self.ne)) + try: + header['PG'].insert(0,pg) + except KeyError: + header['PG']=[pg] + with pysam.AlignmentFile(self.out_sam_fn, self.pysam_out_mode, header=header) as out_sam: + self.out_sam=out_sam + for alignment in in_sam.fetch(until_eof=True): + self.process_alignment(alignment) + info("Finishing. {} alignments processed. {} alignments passed. {} alignments filtered out. {} alignments caused errors.".format(self.nt, self.np, self.nf, self.ne)) def parse_args(): @@ -325,11 +336,12 @@ def format_help(self): default=["True"], ) args = parser.parse_args() + return args def main (): args=parse_args() - sam_sift( + sam_sift=SamSift( in_sam_fn=args.in_sam_fn, out_sam_fn=args.out_sam_fn, filter=" ".join(args.filter_l), @@ -339,6 +351,7 @@ def main (): initialization=" ".join(args.init_l), mode=args.mode, ) + sam_sift.run() if __name__ == "__main__": From f06f39c0a32d5c5c6339991429bafa227b459c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 18:54:14 -0400 Subject: [PATCH 5/8] Fix tests --- .gitignore | 2 ++ Makefile | 4 ++-- tests/profiling/Makefile | 4 ++-- tests/test.bam | 1 + tests/test.sam | 1 + 5 files changed, 8 insertions(+), 4 deletions(-) create mode 120000 tests/test.bam create mode 120000 tests/test.sam diff --git a/.gitignore b/.gitignore index 6324c0c..bf916dd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ README.html README.sh *.sam *.bam +!test.sam +!test.bam # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Makefile b/Makefile index e5b3a87..9e1690b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ SAMS=./samsift/samsift.py .SECONDARY: -all: readme tests/test.bam +all: readme tests/tests/test.bam clean: rm -fr build/ dist/ samsift/__pycache__ samsift.egg-info @@ -16,7 +16,7 @@ clean: pypi: /usr/bin/env python3 setup.py sdist bdist_wheel upload -tests/test.bam: tests/test.sam +tests/tests/test.bam: tests/tests/test.sam samtools view -b "$<" > "$@" readme: rst html diff --git a/tests/profiling/Makefile b/tests/profiling/Makefile index ec8dbf9..2dac6bc 100644 --- a/tests/profiling/Makefile +++ b/tests/profiling/Makefile @@ -3,8 +3,8 @@ SF=../../samsift/samsift.py SHELL=/usr/bin/env bash -euc -o pipefail -BAM=test.bam -SMALLBAM=test.small.bam +BAM=_test.bam +SMALLBAM=_test.small.bam .SECONDARY: diff --git a/tests/test.bam b/tests/test.bam new file mode 120000 index 0000000..aaebe4f --- /dev/null +++ b/tests/test.bam @@ -0,0 +1 @@ +tests/test.bam \ No newline at end of file diff --git a/tests/test.sam b/tests/test.sam new file mode 120000 index 0000000..0d08074 --- /dev/null +++ b/tests/test.sam @@ -0,0 +1 @@ +tests/test.sam \ No newline at end of file From 9288e20c16fa640a3be8920518a3f2ed09ec233d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 22:23:19 -0400 Subject: [PATCH 6/8] Add variable filtering --- samsift/samsift.py | 123 ++++++++++++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 40 deletions(-) diff --git a/samsift/samsift.py b/samsift/samsift.py index f462377..899a0ee 100755 --- a/samsift/samsift.py +++ b/samsift/samsift.py @@ -24,12 +24,24 @@ BASIC_INIT="import random;" +ALIGNMENT_VARIABLE_NAMES=set([ + 'a', + 'QNAME', 'FLAG', 'POS', 'MAPQ', 'CIGAR', 'PNEXT', 'TLEN', 'SEQ', + 'RNAMEi', 'RNEXTi', + 'QUAL', 'QUALa', 'QUALs', 'QUALsa', 'QUAL', 'QUALa', 'QUALs', 'QUALsa', + ]) + def info(msg): dt = datetime.datetime.now() fdt = dt.strftime("%Y-%m-%d %H:%M:%S") print("[samsift]", fdt, msg, file=sys.stderr) +def possible_vars(string): + f=filter(lambda x: string.find(x)!=-1, ALIGNMENT_VARIABLE_NAMES) + return set(f) + + class SamSift: def __init__(self, in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): @@ -58,60 +70,90 @@ def __init__(self, in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, init else: self.pysam_out_mode="w" + self.filter_possible_vars=possible_vars(self.filter) + self.code_possible_vars=possible_vars(self.code) + self.possible_vars = self.filter_possible_vars | self.code_possible_vars + self.init_vardict={} exec(BASIC_INIT + initialization, self.init_vardict) exec(self.initialization, self.init_vardict) - def _init_alignment(self, alignment): - self.alignment=alignment + def _init_vardict(self, alignment): + """Init the variable dictionary (context for eval/code exec). - self.err=False - self.nt+=1 + Tricks: + - init only those variable that appear as a substring + """ - #print(alignment.qual) self.vardict=self.init_vardict - self.vardict.update({ - 'a': alignment, - 'QNAME': alignment.query_name, - 'FLAG': alignment.flag, - # RNAME will be set later, - 'POS': alignment.reference_start+1, - 'MAPQ': alignment.mapping_quality, - 'CIGAR': alignment.cigarstring, - # RNEXT will be set later, - 'PNEXT': alignment.next_reference_start+1, - 'TLEN': alignment.template_length, - 'SEQ': alignment.query_sequence, - - # integer id's - 'RNAMEi': alignment.reference_id, - 'RNEXTi': alignment.next_reference_id, - }) + + if 'a' in self.possible_vars: + self.vardict['a'] = alignment + if 'QNAME' in self.possible_vars: + self.vardict['QNAME'] = alignment.query_name + if 'FLAG' in self.possible_vars: + self.vardict['FLAG'] = alignment.flag + if 'POS' in self.possible_vars: + self.vardict['POS'] = alignment.reference_start+1 + if 'MAPQ' in self.possible_vars: + self.vardict['MAPQ'] = alignment.mapping_quality + if 'CIGAR' in self.possible_vars: + self.vardict['CIGAR'] = alignment.cigarstring + if 'PNEXT' in self.possible_vars: + self.vardict['PNEXT'] = alignment.next_reference_start+1 + if 'TLEN' in self.possible_vars: + self.vardict['TLEN'] = alignment.template_length + if 'SEQ' in self.possible_vars: + self.vardict['SEQ'] = alignment.query_sequence + if 'RNAMEi' in self.possible_vars: + self.vardict['RNAMEi'] = alignment.reference_id + if 'RNEXTi' in self.possible_vars: + self.vardict['RNEXTi'] = alignment.next_reference_id # the specific implementation depends on the specific version of PySam, we want the same behaviour if isinstance(alignment.qual, str): - self.vardict['QUAL']=alignment.qual - self.vardict['QUALa']=[ord(x) for x in alignment.qual] - self.vardict['QUALs']=alignment.qqual - self.vardict['QUALsa']=[ord(x) for x in alignment.qqual] + if 'QUAL' in self.possible_vars: + self.vardict['QUAL']=alignment.qual + if 'QUALa' in self.possible_vars: + self.vardict['QUALa']=[ord(x) for x in alignment.qual] + if 'QUALs' in self.possible_vars: + self.vardict['QUALs']=alignment.qqual + if 'QUALsa' in self.possible_vars: + self.vardict['QUALsa']=[ord(x) for x in alignment.qqual] else: - self.vardict['QUAL']=pysam.qualities_to_qualitystring(alignment.qual, offset=0) - self.vardict['QUALa']=alignment.qual - self.vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) - self.vardict['QUALsa']=alignment.qqual + if 'QUAL' in self.possible_vars: + self.vardict['QUAL']=pysam.qualities_to_qualitystring(alignment.qual, offset=0) + if 'QUALa' in self.possible_vars: + self.vardict['QUALa']=alignment.qual + if 'QUALs' in self.possible_vars: + self.vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) + if 'QUALsa' in self.possible_vars: + self.vardict['QUALsa']=alignment.qqual + + if 'RNAME' in self.possible_vars: + if alignment.reference_id==-1: + self.vardict['RNAME']='*' + else: + self.vardict['RNAME']=self.in_sam.get_reference_name(alignment.reference_id) - if self.vardict['RNAMEi']==-1: - self.vardict['RNAME']='*' - else: - self.vardict['RNAME']=self.in_sam.get_reference_name(self.vardict['RNAMEi']) + if 'RNEXT' in self.possible_vars: + if alignment.next_reference_id==-1: + self.vardict['RNEXT']='*' + else: + self.vardict['RNEXT']=self.in_sam.get_reference_name(alignment.next_reference_id) - if self.vardict['RNEXTi']==-1: - self.vardict['RNEXT']='*' - else: - self.vardict['RNEXT']=self.in_sam.get_reference_name(self.vardict['RNEXTi']) - # clean cache + def _init_alignment(self, alignment): + self.alignment=alignment + + self.err=False + self.nt+=1 + + # 1. init variable dictionary + self._init_vardict() + + # 2. remove old tags keys_to_delete=[] for k in self.vardict: if len(k)==2: @@ -119,6 +161,7 @@ def _init_alignment(self, alignment): for k in keys_to_delete: del self.vardict[k] + # 3. load tags self.vardict.update(alignment.get_tags()) @@ -183,7 +226,7 @@ def process_alignment(self, alignment): self._filter() self._debug() - + if self.passes: self._code() self._print_alignment() From df1c242534e9f78c9d879a70e8ccc69fbd36ad9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 22:34:42 -0400 Subject: [PATCH 7/8] Fix variable detetion --- samsift/samsift.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/samsift/samsift.py b/samsift/samsift.py index 899a0ee..2c46cdf 100755 --- a/samsift/samsift.py +++ b/samsift/samsift.py @@ -27,7 +27,7 @@ ALIGNMENT_VARIABLE_NAMES=set([ 'a', 'QNAME', 'FLAG', 'POS', 'MAPQ', 'CIGAR', 'PNEXT', 'TLEN', 'SEQ', - 'RNAMEi', 'RNEXTi', + 'RNAME', 'RNAMEi', 'RNEXT', 'RNEXTi', 'QUAL', 'QUALa', 'QUALs', 'QUALsa', 'QUAL', 'QUALa', 'QUALs', 'QUALsa', ]) @@ -72,14 +72,16 @@ def __init__(self, in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, init self.filter_possible_vars=possible_vars(self.filter) self.code_possible_vars=possible_vars(self.code) - self.possible_vars = self.filter_possible_vars | self.code_possible_vars + self.debug_possible_vars=possible_vars(self.dexpr) | possible_vars(self.dtrig) + self.possible_vars = self.filter_possible_vars | self.code_possible_vars | self.debug_possible_vars + #info(self.possible_vars) self.init_vardict={} exec(BASIC_INIT + initialization, self.init_vardict) exec(self.initialization, self.init_vardict) - def _init_vardict(self, alignment): + def _init_vardict(self): """Init the variable dictionary (context for eval/code exec). Tricks: @@ -88,6 +90,8 @@ def _init_vardict(self, alignment): self.vardict=self.init_vardict + alignment=self.alignment + if 'a' in self.possible_vars: self.vardict['a'] = alignment if 'QNAME' in self.possible_vars: @@ -188,9 +192,9 @@ def _debug(self): if trig: try: dbg_res=eval(self.dexpr, self.vardict) - except: + except Exception as e: # todo: add a better message - dbg_res="evaluation_failed" + dbg_res="evaluation_failed ({})".format(e) print(self.alignment.query_name, bool(self.passes), dbg_res, file=sys.stderr, sep="\t") From ba5e3f3bc4967b3a1b31d2fe3222cf970d3834fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 10 Oct 2017 22:56:21 -0400 Subject: [PATCH 8/8] Another optimization --- README.rst | 2 +- samsift/samsift.py | 81 ++++++++++++++++++++++++++++------------------ samsift/version.py | 2 +- 3 files changed, 51 insertions(+), 34 deletions(-) diff --git a/README.rst b/README.rst index 99b55c2..bdaf8b4 100644 --- a/README.rst +++ b/README.rst @@ -84,7 +84,7 @@ Command-line parameters .. code-block:: Program: samsift (advanced filtering and tagging of SAM/BAM alignments using Python expressions) - Version: 0.2.1 + Version: 0.2.2 Author: Karel Brinda Usage: samsift.py [-i FILE] [-o FILE] [-f PY_EXPR] [-c PY_CODE] [-m STR] diff --git a/samsift/samsift.py b/samsift/samsift.py index 2c46cdf..cc31013 100755 --- a/samsift/samsift.py +++ b/samsift/samsift.py @@ -149,6 +149,8 @@ def _init_vardict(self): def _init_alignment(self, alignment): + """Init context for the alignment. + """ self.alignment=alignment self.err=False @@ -170,6 +172,12 @@ def _init_alignment(self, alignment): def _filter(self): + """Evaluate FILTER. + """ + if self.filter=="": + self.passes=True + return + try: self.passes=eval(self.filter, self.vardict) except: @@ -187,44 +195,53 @@ def _filter(self): def _debug(self): - if self.dexpr != "": - trig=eval(str(self.dtrig), self.vardict) - if trig: - try: - dbg_res=eval(self.dexpr, self.vardict) - except Exception as e: - # todo: add a better message - dbg_res="evaluation_failed ({})".format(e) - print(self.alignment.query_name, bool(self.passes), dbg_res, file=sys.stderr, sep="\t") + """eval(DEBUG_TRIGER) & print(eval(DEBUG_EXPR)) + """ + if self.dexpr=="": + return + + trig=eval(str(self.dtrig), self.vardict) if self.dtrig!="" else True + if trig: + try: + dbg_res=eval(self.dexpr, self.vardict) + except Exception as e: + # todo: add a better message + dbg_res="evaluation_failed ({})".format(e) + print(self.alignment.query_name, bool(self.passes), dbg_res, file=sys.stderr, sep="\t") def _code(self): - if self.code is not None: - try: - exec(self.code, self.vardict) - except: - if self.mode=="strict": - raise - else: - if self.err==False: - self.ne+=1 - self.err=True - info("Alignment '{}' - code error ('{}')".format(self.alignment.query_name, sys.exc_info()[0])) + """exec(CODE) + """ + if self.code=="": + return + + # 1. exec(CODE) + try: + exec(self.code, self.vardict) + except: + if self.mode=="strict": + raise + else: + if self.err==False: + self.ne+=1 + self.err=True + info("Alignment '{}' - code error ('{}')".format(self.alignment.query_name, sys.exc_info()[0])) - for k, v in self.vardict.items(): - if len(k)==2: - if isinstance(v, float): - # workaround (see "https://github.com/pysam-developers/pysam/issues/531") - self.alignment.set_tag(k, v, value_type="f") - else: - self.alignment.set_tag(k, v) + # 2. backpropagate tags + for k, v in self.vardict.items(): + if len(k)==2: + if isinstance(v, float): + # workaround (see "https://github.com/pysam-developers/pysam/issues/531") + self.alignment.set_tag(k, v, value_type="f") + else: + self.alignment.set_tag(k, v) def _print_alignment(self): self.out_sam.write(self.alignment) - def process_alignment(self, alignment): self._init_alignment(alignment) @@ -336,7 +353,7 @@ def format_help(self): help='filter [True]', dest='filter_l', nargs='*', - default=['True'], + default=[], ) parser.add_argument('-c', @@ -345,7 +362,7 @@ def format_help(self): help="code to be executed (e.g., assigning new tags) [None]", dest='code_l', nargs='*', - default=['None'], + default=[], ) parser.add_argument('-m', @@ -362,7 +379,7 @@ def format_help(self): help='initialization [None]', dest='init_l', nargs='*', - default=['None'], + default=[], ) parser.add_argument('-d', @@ -380,7 +397,7 @@ def format_help(self): help='debugging trigger [True]', dest='dtrig_l', nargs='*', - default=["True"], + default=[], ) args = parser.parse_args() return args diff --git a/samsift/version.py b/samsift/version.py index 20827a9..dbb0783 100644 --- a/samsift/version.py +++ b/samsift/version.py @@ -1 +1 @@ -VERSION="0.2.1" +VERSION="0.2.2" \ No newline at end of file