-
Notifications
You must be signed in to change notification settings - Fork 8
/
pipeclip.py
executable file
·130 lines (122 loc) · 6.92 KB
/
pipeclip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#Main pipeline connects all the scripts together
#Original Programmer: beibei.chen@utsouthwestern.edu
#Refactored by: eric.roos@utsouthwestern.edu
#Usage: python pipeclip.py input.sam output_prefix match_length mismatch_number pcr_rm fdr_cluster clip_type fdr_mutation species
#Required packages: pysam, ghmm, pybedtools
import sys
import argparse
import logging
import os
from time import gmtime, strftime
from lib import *
def prepare_argparser():
description = "Find mutations"
epilog = "For command line options of each command, type %(prog)s COMMAND -h"
argparser = argparse.ArgumentParser(description=description, epilog = epilog)
argparser.add_argument("-i","--input",dest = "infile", type = str, required = True, help = "input bam file")
argparser.add_argument("-t","--control",dest="ctrlfile",type=str,required=False, help = "control bam file")
argparser.add_argument("-o","--output",dest = "outfile", type = str,required = True, help = "output file, default is stdout")
argparser.add_argument("-l","--matchLength",dest = "matchLength", type = int ,required = True, help = "shorted matched segment length")
argparser.add_argument("-m","--mismatch",dest = "mismatch", type = int,required = True, help = "maximum mismatch number")
argparser.add_argument("-c","--clipType",dest = "clipType", type = int,required = True, help = "CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP", choices=[0,1,2,3])
argparser.add_argument("-r","--rmdup",dest = "dupRemove", type = int,required = True, help = "Remove PCR duplicate (0)No removal; (1)Remove by read start; (2)Remove by sequence; ", choices=[0,1,2])
argparser.add_argument("-M","--fdrMutation",dest = "fdrMutation", type = float,required = True, help = "FDR for reliable mutations")
argparser.add_argument("-C","--fdrCluster",dest = "fdrCluster", type = float,required = True, help = "FDR for enriched clusters")
argparser.add_argument("-s","--species",dest = "species", type = str, help = "Species [\"mm10\",\"hg19\"]",choices=["mm10","hg19"])
return(argparser)
def runPipeClip(infile,control,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,clipType,fdrReliableMutation,species):
myClip = CLIP.CLIP(infile,outputPrefix)
controlFlag = False
if control != None:
controlClip = CLIP.CLIP(control,outputPrefix+"Control")
logging.info("Start to run")
if myClip.testInput():#check input
logging.info("Input file OK,start to run PIPE-CLIP")
logging.info("Species info %s" % species)
if control != None: #test control file
if controlClip.testInput():
logging.info("Control file OK. Use control in mutation enrichment.")
controlFlag = True
else:
logging.info("Control file format error. Continue without control.")
if myClip.readfile():
myClip.filter2(matchLength,mismatch,clipType,rmdup)
if controlFlag:
logging.info("Read in control file")
controlClip.readfile()
controlClip.filter2(matchLength,mismatch,clipType,rmdup)
#myClip.printClusters()
#myClip.printMutations()
if myClip.clusterCount>0:
logging.info("Get enriched clusters")
status = Enrich.clusterEnrich_outsource(myClip,fdrEnrichedCluster)
if status:
logging.info("Found %d enriched clusters" % myClip.sigClusterCount)
myClip.printEnrichedClusters()
else:
logging.error("There is no enriched cluster found. Exit program")
sys.exit(1)
else:
logging.error("There is no clusters found. Please check input.Exit program.")
sys.exit(1)
if myClip.mutationCount>0:
logging.info("Get reliable mutations")
if controlFlag: #use control
Enrich.mutationEnrichWCtrl(myClip,controlClip,fdrReliableMutation)
else:
Enrich.mutationEnrich(myClip,fdrReliableMutation)
logging.info("There are %d reliable mutations" % myClip.sigMutationCount)
myClip.printReliableMutations()
else:
logging.warning("There is no mutation found in this BAM file.")
#Start to get crosslinking sites
if myClip.sigClusterCount > 0 and myClip.sigMutationCount>0:
logging.info("Get cross-linking sites")
myClip.getCrosslinking()
if (len(myClip.crosslinking.keys())>0):
outfilelist = myClip.printCrosslinkingSites()
myClip.printCrosslinkingMutations()
else:
logging.warning("There is no crosslinking found. May be caused by no reliable mutations in enriched clusters. Print out enriched clusters instead.")
outfilelist = myClip.printEnrichClusters()
else:
if myClip.sigClusterCount <= 0:
logging.error("There is no enriched clusters for this sample, please check your input file. Exit.")
sys.exit(2)
elif myClip.sigMutationCount <=0:
logging.warning("There is no reliable mutations found. PIPE-CLIP will provide enriched clusters as crosslinking candidates.")
outfilelist = myClip.printEnrichClusters()
#annotation if possible
if species in ["mm10","mm9","hg19"]:
logging.info("Started to annotate cross-linking sits using HOMER")
for name in outfilelist:
#logging.debug("Start to do annotation for %s" % name)
Utils.annotation(name,species)
#output a status log file
logfile = open(outputPrefix+".pipeclip.summary.log","w")
print >>logfile,"PIPE-CLIP run finished. Parameters are:"
print >> logfile,"Input BAM: %s \nOutput prefix: %s \nMinimum matched length: %d \nMaximum mismatch count: %d \nPCR duplicate removal code: %d \nFDR for enriched clusters: %f \nFDR for reliable mutations: %f" % (infile,outputPrefix,matchLength,mismatch,rmdup,fdrEnrichedCluster,fdrReliableMutation)
print >> logfile, "There are %d mapped reads in input BAM file. After filtering,%d reads left" % (myClip.originalMapped,myClip.filteredAlignment)
print >> logfile, "%d out of %d clusters are enriched." % (myClip.sigClusterCount,len(myClip.clusters))
print >> logfile, "%d out of %d mutations are reliable." % (myClip.sigMutationCount,myClip.mutationCount)
print >> logfile, "%d crosslinking site candidates are found, with %d supporting reliable mutations." % (len(myClip.crosslinking.keys()),len(myClip.crosslinkingMutations))
logfile.close()
logging.info("PIPE-CLIP finished the job, please check your results. :)")
else:
logging.error("File corruputed, program exit.")
sys.exit(0)
if __name__=="__main__":
arg_parser = prepare_argparser()
args = arg_parser.parse_args()
OptValidator.opt_validate()
infile = args.infile # Input SAM/BAM file
control = args.ctrlfile
outputPrefix = args.outfile # Output prefix
matchLength = args.matchLength # Shorted matched segment length
mismatch = args.mismatch # Maximum mismatch number
rmcode = args.dupRemove
fdrEnrichedCluster = args.fdrCluster # FDR for enriched clusters
clipType =args.clipType # CLIP type (0)HITS-CLIP; (1)PAR-4SU; (2)PAR-6SG; (3)iCLIP
fdrReliableMutation = args.fdrMutation# FDR for reliable mutations
species = args.species # Species ["mm10","hg19"]
runPipeClip(infile,control,outputPrefix,matchLength,mismatch,rmcode,fdrEnrichedCluster,clipType,fdrReliableMutation,species)