#!/usr/bin/env python
import sys, csv

# Utils

def getZS(line):
    """Return the ZS tag field from a SAM record (given as a list of fields), or "" if missing."""
    for v in line[11:]:
        if v[:2] == "ZS":
            return v
    return ""
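# Note: bsmap writes the read strand as an optional tag of the form ZS:Z:XY, where XY is one of
# ++, +-, -+ or -- (see the table at the bottom of this file). With that layout, character 5 of
# the tag string is the strand character, which processSAM() below uses to choose between the
# top- and bottom-strand checks.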
# Main class

class CHHFilter(object):
    fastafile = ""
    maxhits = 3
    ref = []

    def usage(self):
        sys.stdout.write("""CHHFilter.py - Filter aligned BS reads based on consecutive unconverted CHH sites

Usage: CHHFilter.py [options] reference.fa

Where reference.fa is the fasta file that the reads were aligned to. Options:

  -m M | Discard a read if M consecutive unconverted sites are found (default: {})

This program is designed to work as a filter, receiving SAM lines on standard input and writing filtered SAM
lines to standard output. To filter a BAM file to another BAM file, use:

  samtools view -h input.bam | CHHFilter.py reference.fa | samtools view -b > output.bam

When processing is done, the program writes the number of input reads and the number and percentage of
filtered reads to standard error.

Notes:
* This program requires reads to have the ZS tag, added by bsmap to indicate the read strand.
* The reference sequence should have a samtools .fai index (generated by samtools faidx reference.fa).
* Information on read pairing may not be correct after filtering: if a read is discarded and its mate
  is not, the mate will still be flagged as being properly paired. This can be fixed with the Picard
  FixMateInformation tool.
""".format(self.maxhits))
    def parseArgs(self, args):
        prev = ""
        for a in args:
            if prev == "-m":
                self.maxhits = int(a)
                prev = ""
            elif a in ["-m"]:
                prev = a
            else:
                self.fastafile = a
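    # For reference, a samtools .fai index is tab-delimited with one line per sequence:
    #   NAME  LENGTH  OFFSET  LINEBASES  LINEWIDTH
    # loadReference() below only needs LENGTH and OFFSET (the byte offset of the
    # sequence's first base in the fasta file).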
    def loadReference(self, chrom):
        """Load the sequence of chromosome `chrom' from the fasta file into self.ref."""
        chromlen = 0
        startpos = 0
        self.ref = []                      # reset so the caller can detect a missing chromosome
        fai = self.fastafile + ".fai"
        with open(fai, "r") as f:
            c = csv.reader(f, delimiter='\t')
            for line in c:
                if line[0] == chrom:
                    chromlen = int(line[1])
                    startpos = int(line[2])
                    break
        if not startpos:
            return
        self.ref = [''] * chromlen
        sys.stderr.write("Loading reference for {}...\n".format(chrom))
        with open(self.fastafile, "r") as f:
            f.seek(startpos)
            idx = 0
            while True:
                line = f.readline()
                if not line or line[0] == '>':   # end of file or start of the next sequence
                    break
                for ch in line.rstrip("\n"):
                    self.ref[idx] = ch.upper()
                    idx += 1
        return self.ref
    def findUnconvertedTop(self, gread, read):
        """Return True if `read' (aligned to the top strand) has maxhits consecutive unconverted CHH sites."""
        nchh = 0
        for i in range(len(gread) - 3):
            # CHH on the top strand: C not followed by G at either of the next two positions
            if gread[i] == 'C' and gread[i+1] != 'G' and gread[i+2] != 'G':
                if read[i] == 'C':         # unconverted cytosine
                    nchh += 1
                    if nchh == self.maxhits:
                        return True
                else:
                    nchh = 0
        return False
    def findUnconvertedBot(self, gread, read):
        """Return True if `read' (aligned to the bottom strand) has maxhits consecutive unconverted CHH sites."""
        nchh = 0
        for i in range(2, len(gread)):
            try:
                # CHH on the bottom strand appears as G not preceded by C at either of the two previous positions
                if gread[i] == 'G' and gread[i-1] != 'C' and gread[i-2] != 'C':
                    if read[i] == 'G':     # unconverted cytosine on the bottom strand
                        nchh += 1
                        if nchh == self.maxhits:
                            return True
                    else:
                        nchh = 0
            except IndexError:
                sys.stderr.write("Error:\n{}\n{}\n".format(gread, read))
        return False
    def processSAM(self):
        """Read SAM records from standard input, write the ones that pass the filter to standard output."""
        current = ""
        nr = 0
        nf = 0
        c = csv.reader(sys.stdin, delimiter="\t")
        for line in c:
            if line[0][0] == '@':          # header lines are passed through unchanged
                sys.stdout.write("\t".join(line) + "\n")
                continue
            chrom = line[2]
            pos = int(line[3]) - 1         # SAM positions are 1-based
            read = line[9]
            if chrom != current:
                self.loadReference(chrom)
                if not self.ref:
                    sys.stderr.write("Error: could not find reference sequence for `{}'\n".format(chrom))
                    break
                current = chrom
            gread = self.ref[pos:pos+len(read)]
            zs = getZS(line)               # the ZS tag (added by bsmap) is required
            nr += 1
            if zs[5] == '+':               # top strand (ZS:Z:++ or ZS:Z:+-)
                filt = self.findUnconvertedTop(gread, read)
            else:                          # bottom strand (ZS:Z:-+ or ZS:Z:--)
                filt = self.findUnconvertedBot(gread, read)
            if filt:
                nf += 1
            else:
                sys.stdout.write("\t".join(line) + "\n")
                #sys.stdout.write("{} {} {}\n{} {}\n\n".format(read, read.count("C"), read.count("G"), "".join(gread), getZS(line)))
        pct = 100.0 * nf / nr if nr else 0.0
        sys.stderr.write("{} reads, {} reads filtered ({:.2f}%)\n".format(nr, nf, pct))
if __name__ == "__main__":
    args = sys.argv[1:]
    F = CHHFilter()
    F.parseArgs(args)
    if F.fastafile:
        F.processSAM()
    else:
        F.usage()
# ZS tag strand values and the corresponding bisulfite conversion expected on the read:
# ++  C->T
# +-  C->T
# -+  G->A
# --  G->A
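
# Illustration only (not used by the filter): a minimal sketch of how the CHH check behaves on a
# made-up reference slice, assuming the default maxhits = 3. The sequences below are hypothetical
# and chosen so that the toy reference contains exactly three top-strand CHH sites.
#
#   f = CHHFilter()
#   f.maxhits = 3
#   ref_slice = "CACTCATTACG"   # CHH cytosines at positions 0, 2 and 4
#   read_unc  = "CACTCATTACG"   # all three CHH cytosines unconverted -> discarded
#   read_conv = "TATTTATTACG"   # CHH cytosines read as T (converted) -> kept
#   f.findUnconvertedTop(ref_slice, read_unc)    # -> True  (read would be filtered)
#   f.findUnconvertedTop(ref_slice, read_conv)   # -> False (read would be written out)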