diff --git a/.gitignore b/.gitignore index 6324c0c..bf916dd 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ README.html README.sh *.sam *.bam +!test.sam +!test.bam # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/Makefile b/Makefile index e5b3a87..9e1690b 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ SAMS=./samsift/samsift.py .SECONDARY: -all: readme tests/test.bam +all: readme tests/tests/test.bam clean: rm -fr build/ dist/ samsift/__pycache__ samsift.egg-info @@ -16,7 +16,7 @@ clean: pypi: /usr/bin/env python3 setup.py sdist bdist_wheel upload -tests/test.bam: tests/test.sam +tests/tests/test.bam: tests/tests/test.sam samtools view -b "$<" > "$@" readme: rst html diff --git a/README.rst b/README.rst index 99b55c2..bdaf8b4 100644 --- a/README.rst +++ b/README.rst @@ -84,7 +84,7 @@ Command-line parameters .. code-block:: Program: samsift (advanced filtering and tagging of SAM/BAM alignments using Python expressions) - Version: 0.2.1 + Version: 0.2.2 Author: Karel Brinda Usage: samsift.py [-i FILE] [-o FILE] [-f PY_EXPR] [-c PY_CODE] [-m STR] diff --git a/samsift/samsift.py b/samsift/samsift.py index 479d1cf..cc31013 100755 --- a/samsift/samsift.py +++ b/samsift/samsift.py @@ -24,151 +24,267 @@ BASIC_INIT="import random;" +ALIGNMENT_VARIABLE_NAMES=set([ + 'a', + 'QNAME', 'FLAG', 'POS', 'MAPQ', 'CIGAR', 'PNEXT', 'TLEN', 'SEQ', + 'RNAME', 'RNAMEi', 'RNEXT', 'RNEXTi', + 'QUAL', 'QUALa', 'QUALs', 'QUALsa', 'QUAL', 'QUALa', 'QUALs', 'QUALsa', + ]) + def info(msg): dt = datetime.datetime.now() fdt = dt.strftime("%Y-%m-%d %H:%M:%S") print("[samsift]", fdt, msg, file=sys.stderr) -def sam_sift(in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): - info("Starting.") - init_vardict={} - exec(BASIC_INIT + initialization, init_vardict) - exec(initialization, init_vardict) - if in_sam_fn=='-': - info("Reading from standard input. " + ("Press Ctrl+D to finish reading or run '{} -h' for help.".format(PROGRAM) if in_sam_fn=="-" else"")) +def possible_vars(string): + f=filter(lambda x: string.find(x)!=-1, ALIGNMENT_VARIABLE_NAMES) + return set(f) - with pysam.AlignmentFile(in_sam_fn, "rb") as in_sam: #check_sq=False) - #print("@PG", "ID:{}".format(PROGRAM), "PN:{}".format(PROGRAM), "VN:{}".format(VERSION), "CL:{}".format(" ".join(sys.argv)), sep="\t") - header=in_sam.header - pg={ - "ID":PROGRAM, - "PN":PROGRAM, - "VN":VERSION, - "CL":" ".join(map(lambda x:"'{}'".format(x),sys.argv)), - } +class SamSift: - try: - header['PG'].insert(0,pg) - except KeyError: - header['PG']=[pg] + def __init__(self, in_sam_fn, out_sam_fn, filter, code, dexpr, dtrig, mode, initialization): + self.in_sam_fn=in_sam_fn + self.out_sam_fn=out_sam_fn + + self.filter=filter + self.code=code + self.dexpr=dexpr + self.dtrig=dtrig + self.initialization=initialization + + self.mode=mode - if out_sam_fn[-4:]==".bam": - out_mode="wb" + + self.nt=0 + self.np=0 + self.nf=0 + self.ne=0 + + if in_sam_fn=='-': + info("Reading from standard input. " + ("Press Ctrl+D to finish reading or run '{} -h' for help.".format(PROGRAM) if in_sam_fn=="-" else"")) + + if self.out_sam_fn[-4:]==".bam": + self.pysam_out_mode="wb" else: - out_mode="w" - - with pysam.AlignmentFile(out_sam_fn, out_mode, header=header) as out_sam: - - nt,np,nf,ne=0,0,0,0 - for alignment in in_sam.fetch(until_eof=True): - err=False - nt+=1 - #print(alignment.qual) - vardict=init_vardict - vardict.update({ - 'a': alignment, - 'QNAME': alignment.query_name, - 'FLAG': alignment.flag, - # RNAME will be set later, - 'POS': alignment.reference_start+1, - 'MAPQ': alignment.mapping_quality, - 'CIGAR': alignment.cigarstring, - # RNEXT will be set later, - 'PNEXT': alignment.next_reference_start+1, - 'TLEN': alignment.template_length, - 'SEQ': alignment.query_sequence, - - # integer id's - 'RNAMEi': alignment.reference_id, - 'RNEXTi': alignment.next_reference_id, - }) - - # the exact implementation depends on specific version of PySam, we want the same behaviour - if isinstance(alignment.qual, str): - vardict['QUAL']=alignment.qual - vardict['QUALa']=[ord(x) for x in alignment.qual] - vardict['QUALs']=alignment.qqual - vardict['QUALsa']=[ord(x) for x in alignment.qqual] - else: - vardict['QUAL']=pysam.qualities_to_qualitystring(alignment.qual, offset=0) - vardict['QUALa']=alignment.qual - vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) - vardict['QUALsa']=alignment.qqual + self.pysam_out_mode="w" + + self.filter_possible_vars=possible_vars(self.filter) + self.code_possible_vars=possible_vars(self.code) + self.debug_possible_vars=possible_vars(self.dexpr) | possible_vars(self.dtrig) + self.possible_vars = self.filter_possible_vars | self.code_possible_vars | self.debug_possible_vars + #info(self.possible_vars) + + self.init_vardict={} + exec(BASIC_INIT + initialization, self.init_vardict) + exec(self.initialization, self.init_vardict) + + + def _init_vardict(self): + """Init the variable dictionary (context for eval/code exec). + + Tricks: + - init only those variable that appear as a substring + """ + + self.vardict=self.init_vardict + + alignment=self.alignment + + if 'a' in self.possible_vars: + self.vardict['a'] = alignment + if 'QNAME' in self.possible_vars: + self.vardict['QNAME'] = alignment.query_name + if 'FLAG' in self.possible_vars: + self.vardict['FLAG'] = alignment.flag + if 'POS' in self.possible_vars: + self.vardict['POS'] = alignment.reference_start+1 + if 'MAPQ' in self.possible_vars: + self.vardict['MAPQ'] = alignment.mapping_quality + if 'CIGAR' in self.possible_vars: + self.vardict['CIGAR'] = alignment.cigarstring + if 'PNEXT' in self.possible_vars: + self.vardict['PNEXT'] = alignment.next_reference_start+1 + if 'TLEN' in self.possible_vars: + self.vardict['TLEN'] = alignment.template_length + if 'SEQ' in self.possible_vars: + self.vardict['SEQ'] = alignment.query_sequence + if 'RNAMEi' in self.possible_vars: + self.vardict['RNAMEi'] = alignment.reference_id + if 'RNEXTi' in self.possible_vars: + self.vardict['RNEXTi'] = alignment.next_reference_id + + # the specific implementation depends on the specific version of PySam, we want the same behaviour + if isinstance(alignment.qual, str): + if 'QUAL' in self.possible_vars: + self.vardict['QUAL']=alignment.qual + if 'QUALa' in self.possible_vars: + self.vardict['QUALa']=[ord(x) for x in alignment.qual] + if 'QUALs' in self.possible_vars: + self.vardict['QUALs']=alignment.qqual + if 'QUALsa' in self.possible_vars: + self.vardict['QUALsa']=[ord(x) for x in alignment.qqual] + else: + if 'QUAL' in self.possible_vars: + self.vardict['QUAL']=pysam.qualities_to_qualitystring(alignment.qual, offset=0) + if 'QUALa' in self.possible_vars: + self.vardict['QUALa']=alignment.qual + if 'QUALs' in self.possible_vars: + self.vardict['QUALs']=pysam.qualities_to_qualitystring(alignment.qqual, offset=0) + if 'QUALsa' in self.possible_vars: + self.vardict['QUALsa']=alignment.qqual + + if 'RNAME' in self.possible_vars: + if alignment.reference_id==-1: + self.vardict['RNAME']='*' + else: + self.vardict['RNAME']=self.in_sam.get_reference_name(alignment.reference_id) + + if 'RNEXT' in self.possible_vars: + if alignment.next_reference_id==-1: + self.vardict['RNEXT']='*' + else: + self.vardict['RNEXT']=self.in_sam.get_reference_name(alignment.next_reference_id) + + + def _init_alignment(self, alignment): + """Init context for the alignment. + """ + self.alignment=alignment + + self.err=False + self.nt+=1 + + # 1. init variable dictionary + self._init_vardict() + + # 2. remove old tags + keys_to_delete=[] + for k in self.vardict: + if len(k)==2: + keys_to_delete.append(k) + for k in keys_to_delete: + del self.vardict[k] + + # 3. load tags + self.vardict.update(alignment.get_tags()) + + + def _filter(self): + """Evaluate FILTER. + """ + if self.filter=="": + self.passes=True + return - if vardict['RNAMEi']==-1: - vardict['RNAME']='*' + try: + self.passes=eval(self.filter, self.vardict) + except: + self.err=True + if self.mode=="strict": + raise + elif self.mode=="nonstop-keep": + self.passes=True + self.ne+=1 + elif self.mode=="nonstop-remove": + self.passes=False + self.ne+=1 + else: + raise NotImplementedError + + + def _debug(self): + """eval(DEBUG_TRIGER) & print(eval(DEBUG_EXPR)) + """ + if self.dexpr=="": + return + + trig=eval(str(self.dtrig), self.vardict) if self.dtrig!="" else True + if trig: + try: + dbg_res=eval(self.dexpr, self.vardict) + except Exception as e: + # todo: add a better message + dbg_res="evaluation_failed ({})".format(e) + print(self.alignment.query_name, bool(self.passes), dbg_res, file=sys.stderr, sep="\t") + + + def _code(self): + """exec(CODE) + """ + if self.code=="": + return + + # 1. exec(CODE) + try: + exec(self.code, self.vardict) + except: + if self.mode=="strict": + raise + else: + if self.err==False: + self.ne+=1 + self.err=True + info("Alignment '{}' - code error ('{}')".format(self.alignment.query_name, sys.exc_info()[0])) + + # 2. backpropagate tags + for k, v in self.vardict.items(): + if len(k)==2: + if isinstance(v, float): + # workaround (see "https://github.com/pysam-developers/pysam/issues/531") + self.alignment.set_tag(k, v, value_type="f") else: - vardict['RNAME']=in_sam.get_reference_name(vardict['RNAMEi']) + self.alignment.set_tag(k, v) + + + def _print_alignment(self): + self.out_sam.write(self.alignment) - if vardict['RNEXTi']==-1: - vardict['RNEXT']='*' - else: - vardict['RNEXT']=in_sam.get_reference_name(vardict['RNEXTi']) - - # clean cache - keys_to_delete=[] - for k in vardict: - if len(k)==2: - keys_to_delete.append(k) - for k in keys_to_delete: - del vardict[k] - - vardict.update(alignment.get_tags()) - try: - passes=eval(filter, vardict) - except: - err=True - if mode=="strict": - raise - elif mode=="nonstop-keep": - passes=True - ne+=1 - elif mode=="nonstop-remove": - passes=False - ne+=1 - else: - raise NotImplementedError - if dexpr != "": - trig=eval(str(dtrig), vardict) - if trig: - try: - dbg_res=eval(dexpr, vardict) - except: - # todo: add a better message - dbg_res="evaluation_failed" - print(alignment.query_name, bool(passes), dbg_res, file=sys.stderr, sep="\t") - if passes: - if code is not None: - try: - exec(code, vardict) - except: - if mode=="strict": - raise - else: - if err==False: - ne+=1 - err=True - info("Alignment '{}' - code error ('{}')".format(alignment.query_name, sys.exc_info()[0])) - - - for k, v in vardict.items(): - if len(k)==2: - if isinstance(v, float): - # workaround (see "https://github.com/pysam-developers/pysam/issues/531") - alignment.set_tag(k, v, value_type="f") - else: - alignment.set_tag(k, v) - out_sam.write(alignment) - if not err: - np+=1 - else: - if not err: - nf+=1 - info("Finishing. {} alignments processed. {} alignments passed. {} alignments filtered out. {} alignments caused errors.".format(nt, np, nf, ne)) -def main(): + def process_alignment(self, alignment): + self._init_alignment(alignment) + + self._filter() + self._debug() + + if self.passes: + self._code() + self._print_alignment() + if not self.err: + self.np+=1 + else: + if not self.err: + self.nf+=1 + + + def run(self): + info("SAMsift is starting.") + with pysam.AlignmentFile(self.in_sam_fn, "rb") as in_sam: #check_sq=False) + self.in_sam=in_sam + #print("@PG", "ID:{}".format(PROGRAM), "PN:{}".format(PROGRAM), "VN:{}".format(VERSION), "CL:{}".format(" ".join(sys.argv)), sep="\t") + header=in_sam.header + + pg={ + "ID":PROGRAM, + "PN":PROGRAM, + "VN":VERSION, + "CL":" ".join(map(lambda x:"'{}'".format(x),sys.argv)), + } + + try: + header['PG'].insert(0,pg) + except KeyError: + header['PG']=[pg] + + with pysam.AlignmentFile(self.out_sam_fn, self.pysam_out_mode, header=header) as out_sam: + self.out_sam=out_sam + for alignment in in_sam.fetch(until_eof=True): + self.process_alignment(alignment) + info("Finishing. {} alignments processed. {} alignments passed. {} alignments filtered out. {} alignments caused errors.".format(self.nt, self.np, self.nf, self.ne)) + + +def parse_args(): class CustomArgumentParser (argparse.ArgumentParser): def print_help(self): @@ -237,7 +353,7 @@ def format_help(self): help='filter [True]', dest='filter_l', nargs='*', - default=['True'], + default=[], ) parser.add_argument('-c', @@ -246,7 +362,7 @@ def format_help(self): help="code to be executed (e.g., assigning new tags) [None]", dest='code_l', nargs='*', - default=['None'], + default=[], ) parser.add_argument('-m', @@ -263,7 +379,7 @@ def format_help(self): help='initialization [None]', dest='init_l', nargs='*', - default=['None'], + default=[], ) parser.add_argument('-d', @@ -281,11 +397,15 @@ def format_help(self): help='debugging trigger [True]', dest='dtrig_l', nargs='*', - default=["True"], + default=[], ) args = parser.parse_args() + return args + - sam_sift( +def main (): + args=parse_args() + sam_sift=SamSift( in_sam_fn=args.in_sam_fn, out_sam_fn=args.out_sam_fn, filter=" ".join(args.filter_l), @@ -295,6 +415,7 @@ def format_help(self): initialization=" ".join(args.init_l), mode=args.mode, ) + sam_sift.run() if __name__ == "__main__": diff --git a/samsift/version.py b/samsift/version.py index 20827a9..dbb0783 100644 --- a/samsift/version.py +++ b/samsift/version.py @@ -1 +1 @@ -VERSION="0.2.1" +VERSION="0.2.2" \ No newline at end of file diff --git a/tests/Makefile b/tests/Makefile index 769be43..d5f1545 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,39 +1,12 @@ -.PHONY: all clean basic columns id +.PHONY: all SHELL=/usr/bin/env bash -euc -o pipefail .SECONDARY: -SIFT=../samsift/samsift.py -NORM=../samsift/samsift_norm_sam.py -H -SV=samtools view +all: + $(MAKE) -C tests -all: basic columns id +%: + $(MAKE) -C tests $@ -basic: - $(SIFT) -i test.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' - $(SIFT) -i test.bam -o test_out.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' - -id: - $(SIFT) -i test.bam | grep -v samsift > _test.sam - diff <($(NORM) -i test.sam) <($(NORM) -i _test.sam) - -columns: test.debug_samtools.sam test.debug_samsift.sam - diff $^ - -%.debug_samtools.sam: %.bam - $(SV) $< \ - | cut -f -11 \ - > $@ - -%.debug_samsift.sam: %.bam - ($(SIFT) -i $< -d '"\t".join(map(str,[QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL]))' \ - > /dev/null) 2>&1 \ - | grep -v samsift \ - | cut -f 3- \ - | cut -f -11 \ - > $@ - - -clean: - rm -f *.sam test_out.bam diff --git a/tests/profiling/.gitignore b/tests/profiling/.gitignore new file mode 100644 index 0000000..c627000 --- /dev/null +++ b/tests/profiling/.gitignore @@ -0,0 +1 @@ +*.prof diff --git a/tests/profiling/Makefile b/tests/profiling/Makefile new file mode 100644 index 0000000..2dac6bc --- /dev/null +++ b/tests/profiling/Makefile @@ -0,0 +1,24 @@ +.PHONY: all clean filt + +SF=../../samsift/samsift.py + +SHELL=/usr/bin/env bash -euc -o pipefail +BAM=_test.bam +SMALLBAM=_test.small.bam + +.SECONDARY: + +all: filt + +filt: $(BAM) + /usr/bin/env python3 -m cProfile -o _filt.prof $(SF) -i "$<" -f 'NM>0' > /dev/null + +$(BAM): + wget -O $(BAM) ftp://ftp.sra.ebi.ac.uk/vol1/ERZ004/ERZ004000/accepted_hits.bam + +$(SMALLBAM): $(BAM) + samtools view -h $< | head -n 10000 | samtools view -b > $@ + +clean: + rm -f _* + diff --git a/tests/test.bam b/tests/test.bam deleted file mode 100644 index 84a9d56..0000000 Binary files a/tests/test.bam and /dev/null differ diff --git a/tests/test.bam b/tests/test.bam new file mode 120000 index 0000000..aaebe4f --- /dev/null +++ b/tests/test.bam @@ -0,0 +1 @@ +tests/test.bam \ No newline at end of file diff --git a/tests/test.sam b/tests/test.sam deleted file mode 100644 index 2e1b8d4..0000000 --- a/tests/test.sam +++ /dev/null @@ -1,11 +0,0 @@ -@SQ SN:gi|561108321|ref|NC_018143.2| LN:4411709 -r00001 0 gi|561108321|ref|NC_018143.2| 2798553 60 100M * 0 0 AAAGGAATGCATCACCAATCTCGTCGGCGAGGCTTCGTGCCTCCTCGCCCGGCGCGCGGGTCGCGCCCGGGTCACTCTCGCCGGCGAACCCGACATAGGC 2/454483235524234201022/1252422523223533243212222421513222236220215432131222242315235121223130142647 NM:i:2 MD:Z:51C23C24 AS:i:90 XS:i:0 -r00002 0 gi|561108321|ref|NC_018143.2| 445039 60 100M * 0 0 TCATCGGCAGATGCCCCCCGAGCGCAGCTTCCACGCGGCGCCGCGACGCCGTGTGCTGGTTCGTCACCGCCCGACCGACGCGGGCCCAGTGGTCGAGCTG 11/242522323122622254122142322321121236232321205072426233223223204322022022633343113212322232-462422 NM:i:2 MD:Z:17G45A36 AS:i:90 XS:i:0 -r00003 0 gi|561108321|ref|NC_018143.2| 2014256 60 100M * 0 0 GGCAGCCCCGATGGGGGACCAGATCGGCCCCTGGGCCGCCTCGCTCACCGCGAGGCTCGCGAGGAGTCCGACCAGCGCGGCGCCCACGGCCACAATCACG 2220242203322212123344242421/2-0042322131152322232426231022262222244315/353/227312462213575444122223 NM:i:2 MD:Z:19G50G29 AS:i:90 XS:i:0 -r00004 0 gi|561108321|ref|NC_018143.2| 3675128 60 100M * 0 0 GTTGGGCGGGGTGGAATTCATCCATTTCATTCAGTGCCCGTTGCGAATCCCCAAGCTACCCCGACGGCGACCAGAGGATGTCGATGGGGACGGCGGCGAG 28542253205043252212122151202.212332342342273312355423123122235524002520/2421/242130/623202/22221632 NM:i:0 MD:Z:100 AS:i:100 XS:i:0 -r00005 0 gi|561108321|ref|NC_018143.2| 1622793 60 100M * 0 0 TCAATGCGGGCGAGAATTTGTTCCGGCCTCTGTTGCTGCGGGTGACATCGGAAGGTGTGGGGACAGGGATCAGCCCGAGATCATGCAATCACTGTCCTGC 222112522331421427632224324323225122651242242/423423422222522430212723545/331522025543365/3422221242 NM:i:4 MD:Z:35A25T1A35A0 AS:i:84 XS:i:0 -r00006 16 gi|561108321|ref|NC_018143.2| 3625422 60 100M * 0 0 CCAGCAACTCGATACCCGCCTCCTCGGCCAGGTGGCCCACATTGCCGAGCGCGTTGTTGACCGTTGTGCGCAAGTCGACCGCCTCAAACGACAACTCGGC 5102650324303230122042406020//22242254134302/4225132201343264233465222111342525354/2452222113,542204 NM:i:2 MD:Z:16G70C12 AS:i:90 XS:i:0 -r00007 0 gi|561108321|ref|NC_018143.2| 4211791 60 100M * 0 0 GCACAACCACATCGGGCCGCTGGCGGCGGGCCGCCGCAATCGCCGACGATCCGTCACCGGCGGTGGTGATGTTCCAACCTTCATACCGCAATGCCAGGGA 423422233223252122313420321/1035013262413442/2422121542/.322222223202234124232342.220131262221113112 NM:i:1 MD:Z:96T3 AS:i:96 XS:i:0 -r00008 0 gi|561108321|ref|NC_018143.2| 3766207 60 100M * 0 0 CTGCCCGTCCCCGCATTACCGACGCCGGTGTTGTAATTACCCGAGTTGAACACCCCGAAGTTCCCGGTCCCCGAATTGAAGAACCCCACATTCCCGGTGC 226312132135034333332573.42226224-23/30242232647123433023342321432222.236624143324112235322120224220 NM:i:2 MD:Z:3G19A76 AS:i:91 XS:i:52 -r00009 16 gi|561108321|ref|NC_018143.2| 2626290 0 100M * 0 0 GCGAACCAGCCCGTCACGCACCCCGTGCAGCATGTTCACGATGATGCGAAACGCCTGATTCAACTGGGCCATGGTGTCTAGCGAGGTCGCCTCGGCCATG 2/35303431+2132326023221141010224244613225532532005140222/1252222763242222201423/1333672201216544351 NM:i:2 MD:Z:43T18T37 AS:i:90 XS:i:90 XA:Z:gi|561108321|ref|NC_018143.2|,+1340805,100M,2;gi|561108321|ref|NC_018143.2|,-4060531,100M,3;gi|561108321|ref|NC_018143.2|,-1160916,100M,3;gi|561108321|ref|NC_018143.2|,+2030494,100M,4; -r0000a 16 gi|561108321|ref|NC_018143.2| 2556229 60 100M * 0 0 CACGATCAACGCATATAGGCCGCTGGCGCGCAACGGATTCGCTTCGCTGTGGTCGTGGTTTATTGGCCTGGTGGTTACCGAGTTTCCGTTACCGACGCTG 333422402043/221160023222234525304051215103232534255212/3443422244012223552/400252320223106350240302 NM:i:1 MD:Z:42A57 AS:i:95 XS:i:0 diff --git a/tests/test.sam b/tests/test.sam new file mode 120000 index 0000000..0d08074 --- /dev/null +++ b/tests/test.sam @@ -0,0 +1 @@ +tests/test.sam \ No newline at end of file diff --git a/tests/tests/.gitignore b/tests/tests/.gitignore new file mode 100644 index 0000000..1acc04f --- /dev/null +++ b/tests/tests/.gitignore @@ -0,0 +1,2 @@ +!test.bam +!test.sam diff --git a/tests/tests/Makefile b/tests/tests/Makefile new file mode 100644 index 0000000..537ed75 --- /dev/null +++ b/tests/tests/Makefile @@ -0,0 +1,39 @@ +.PHONY: all clean basic columns id + +SHELL=/usr/bin/env bash -euc -o pipefail + +.SECONDARY: + +SIFT=../../samsift/samsift.py +NORM=../../samsift/samsift_norm_sam.py -H +SV=samtools view + +all: basic columns id + +basic: + $(SIFT) -i test.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' + $(SIFT) -i test.bam -o _test_out.bam -c 'ln=len(SEQ);ab=sum(a.query_qualities)/ln' + +id: + $(SIFT) -i test.bam | grep -v samsift > _test.sam + diff <($(NORM) -i test.sam) <($(NORM) -i _test.sam) + +columns: _test.debug_samtools.sam _test.debug_samsift.sam + diff $^ + +_%.debug_samtools.sam: %.bam + $(SV) $< \ + | cut -f -11 \ + > $@ + +_%.debug_samsift.sam: %.bam + ($(SIFT) -i $< -d '"\t".join(map(str,[QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL]))' \ + > /dev/null) 2>&1 \ + | grep -v samsift \ + | cut -f 3- \ + | cut -f -11 \ + > $@ + + +clean: + rm -f _* diff --git a/tests/tests/test.bam b/tests/tests/test.bam new file mode 100644 index 0000000..84a9d56 Binary files /dev/null and b/tests/tests/test.bam differ diff --git a/tests/tests/test.sam b/tests/tests/test.sam new file mode 100644 index 0000000..2e1b8d4 --- /dev/null +++ b/tests/tests/test.sam @@ -0,0 +1,11 @@ +@SQ SN:gi|561108321|ref|NC_018143.2| LN:4411709 +r00001 0 gi|561108321|ref|NC_018143.2| 2798553 60 100M * 0 0 AAAGGAATGCATCACCAATCTCGTCGGCGAGGCTTCGTGCCTCCTCGCCCGGCGCGCGGGTCGCGCCCGGGTCACTCTCGCCGGCGAACCCGACATAGGC 2/454483235524234201022/1252422523223533243212222421513222236220215432131222242315235121223130142647 NM:i:2 MD:Z:51C23C24 AS:i:90 XS:i:0 +r00002 0 gi|561108321|ref|NC_018143.2| 445039 60 100M * 0 0 TCATCGGCAGATGCCCCCCGAGCGCAGCTTCCACGCGGCGCCGCGACGCCGTGTGCTGGTTCGTCACCGCCCGACCGACGCGGGCCCAGTGGTCGAGCTG 11/242522323122622254122142322321121236232321205072426233223223204322022022633343113212322232-462422 NM:i:2 MD:Z:17G45A36 AS:i:90 XS:i:0 +r00003 0 gi|561108321|ref|NC_018143.2| 2014256 60 100M * 0 0 GGCAGCCCCGATGGGGGACCAGATCGGCCCCTGGGCCGCCTCGCTCACCGCGAGGCTCGCGAGGAGTCCGACCAGCGCGGCGCCCACGGCCACAATCACG 2220242203322212123344242421/2-0042322131152322232426231022262222244315/353/227312462213575444122223 NM:i:2 MD:Z:19G50G29 AS:i:90 XS:i:0 +r00004 0 gi|561108321|ref|NC_018143.2| 3675128 60 100M * 0 0 GTTGGGCGGGGTGGAATTCATCCATTTCATTCAGTGCCCGTTGCGAATCCCCAAGCTACCCCGACGGCGACCAGAGGATGTCGATGGGGACGGCGGCGAG 28542253205043252212122151202.212332342342273312355423123122235524002520/2421/242130/623202/22221632 NM:i:0 MD:Z:100 AS:i:100 XS:i:0 +r00005 0 gi|561108321|ref|NC_018143.2| 1622793 60 100M * 0 0 TCAATGCGGGCGAGAATTTGTTCCGGCCTCTGTTGCTGCGGGTGACATCGGAAGGTGTGGGGACAGGGATCAGCCCGAGATCATGCAATCACTGTCCTGC 222112522331421427632224324323225122651242242/423423422222522430212723545/331522025543365/3422221242 NM:i:4 MD:Z:35A25T1A35A0 AS:i:84 XS:i:0 +r00006 16 gi|561108321|ref|NC_018143.2| 3625422 60 100M * 0 0 CCAGCAACTCGATACCCGCCTCCTCGGCCAGGTGGCCCACATTGCCGAGCGCGTTGTTGACCGTTGTGCGCAAGTCGACCGCCTCAAACGACAACTCGGC 5102650324303230122042406020//22242254134302/4225132201343264233465222111342525354/2452222113,542204 NM:i:2 MD:Z:16G70C12 AS:i:90 XS:i:0 +r00007 0 gi|561108321|ref|NC_018143.2| 4211791 60 100M * 0 0 GCACAACCACATCGGGCCGCTGGCGGCGGGCCGCCGCAATCGCCGACGATCCGTCACCGGCGGTGGTGATGTTCCAACCTTCATACCGCAATGCCAGGGA 423422233223252122313420321/1035013262413442/2422121542/.322222223202234124232342.220131262221113112 NM:i:1 MD:Z:96T3 AS:i:96 XS:i:0 +r00008 0 gi|561108321|ref|NC_018143.2| 3766207 60 100M * 0 0 CTGCCCGTCCCCGCATTACCGACGCCGGTGTTGTAATTACCCGAGTTGAACACCCCGAAGTTCCCGGTCCCCGAATTGAAGAACCCCACATTCCCGGTGC 226312132135034333332573.42226224-23/30242232647123433023342321432222.236624143324112235322120224220 NM:i:2 MD:Z:3G19A76 AS:i:91 XS:i:52 +r00009 16 gi|561108321|ref|NC_018143.2| 2626290 0 100M * 0 0 GCGAACCAGCCCGTCACGCACCCCGTGCAGCATGTTCACGATGATGCGAAACGCCTGATTCAACTGGGCCATGGTGTCTAGCGAGGTCGCCTCGGCCATG 2/35303431+2132326023221141010224244613225532532005140222/1252222763242222201423/1333672201216544351 NM:i:2 MD:Z:43T18T37 AS:i:90 XS:i:90 XA:Z:gi|561108321|ref|NC_018143.2|,+1340805,100M,2;gi|561108321|ref|NC_018143.2|,-4060531,100M,3;gi|561108321|ref|NC_018143.2|,-1160916,100M,3;gi|561108321|ref|NC_018143.2|,+2030494,100M,4; +r0000a 16 gi|561108321|ref|NC_018143.2| 2556229 60 100M * 0 0 CACGATCAACGCATATAGGCCGCTGGCGCGCAACGGATTCGCTTCGCTGTGGTCGTGGTTTATTGGCCTGGTGGTTACCGAGTTTCCGTTACCGACGCTG 333422402043/221160023222234525304051215103232534255212/3443422244012223552/400252320223106350240302 NM:i:1 MD:Z:42A57 AS:i:95 XS:i:0