forked from FredHutch/SEACR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SEACR_1.0.sh
193 lines (145 loc) · 7.64 KB
/
SEACR_1.0.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env bash
# SEACR 1.0 driver script: calls enriched regions in a target bedgraph, using
# either a control bedgraph or a numeric threshold (see the usage text below).
# -u: referencing an unset variable is an error; -e: stop at the first failing command.
set -ue

# All five positional arguments are required; otherwise print the usage text
# on stdout and exit with status 1.
if (( $# < 5 )); then
  # The heredoc delimiter is quoted so the text is emitted literally: the double
  # quotes around the mode keywords ("norm", "non", "union", "AUC") are printed
  # as written instead of being consumed by the shell.
  cat <<'USAGE'

SEACR: Sparse Enrichment Analysis for CUT&RUN
Usage: bash SEACR_1.0.sh <experimental bedgraph>.bg [<control bedgraph>.bg | <FDR threshold>] ["norm" | "non"] ["union" | "AUC"] output prefix
Description of input fields:
Field 1: Target data bedgraph file in UCSC bedgraph format (https://genome.ucsc.edu/goldenpath/help/bedgraph.html) that omits regions containing 0 signal.
Field 2: Control (IgG) data bedgraph file to generate an empirical threshold for peak calling. Alternatively, a numeric threshold n between 0 and 1 returns the top n fraction of peaks based on total signal within peaks.
Field 3: “norm” denotes normalization of control to target data, “non” skips this behavior. "norm" is recommended unless experimental and control data are already rigorously normalized to each other (e.g. via spike-in).
Field 4: “union” forces implementation of a maximum signal threshold in addition to the total signal threshold, and corresponds to the “union” mode described in the text, whereas “AUC” avoids this behavior, and corresponds to “AUC only” mode.
Field 5: Output prefix
Output file:
<output prefix>.auc.threshold.merge.bed (Bed file of enriched regions)
Output data structure:
<chr> <start> <end> <AUC> <max signal> <max signal region>
Description of output fields:
Field 1: Chromosome
Field 2: Start coordinate
Field 3: End coordinate
Field 4: Total signal contained within denoted coordinates
Field 5: Maximum bedgraph signal attained at any base pair within denoted coordinates
Field 6: Region representing the farthest upstream and farthest downstream bases within the denoted coordinates that are represented by the maximum bedgraph signal
Examples:
bash SEACR_1.0.sh target.bedgraph IgG.bedgraph norm AUC output
Calls enriched regions in target data using normalized IgG control track with AUC threshold
bash SEACR_1.0.sh target.bedgraph IgG.bedgraph non union output
Calls enriched regions in target data using non-normalized IgG control track with AUC and max signal thresholds
bash SEACR_1.0.sh target.bedgraph 0.01 non AUC output
Calls enriched regions in target data by selecting the top 1% of regions by area under the curve (AUC)

USAGE
  exit 1
fi
# Random 13-character alphanumeric prefixes naming this run's temporary files:
# $password for files derived from the experimental bedgraph, $password2 for
# files derived from the control. The files are created in the current directory.
# LC_ALL=C makes tr treat the random input as plain bytes; some tr
# implementations otherwise reject byte sequences invalid in the active locale.
password=$(head /dev/urandom | LC_ALL=C tr -dc 'A-Za-z0-9' | head -c 13)
password2=$(head /dev/urandom | LC_ALL=C tr -dc 'A-Za-z0-9' | head -c 13)

# Remove any temporary file of this run that still exists. Registered as an EXIT
# trap so an early exit (for example a failing command under set -e) does not
# leave intermediates behind; -f makes already-deleted files a no-op.
remove_leftover_temp_files() {
  rm -f -- \
    "${password}.auc.bed" "${password}.auc" "${password}.threshold.txt" \
    "${password}.auc.threshold.bed" "${password}.norm.txt" \
    "${password2}.auc.bed" "${password2}.auc" "${password2}.auc2.bed" \
    "${password2}.auc.threshold.bed"
}
trap remove_leftover_temp_files EXIT

# File name of the experimental bedgraph without its directory. Quoted so a
# path containing whitespace stays a single operand.
exp=$(basename -- "$1")
# Second argument: either a numeric threshold (digits with an optional decimal
# part) or the path of an existing control bedgraph file. The numeric test runs
# first; anything that is neither is rejected.
if [[ $2 =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Calling enriched regions without control file"
elif [[ -f $2 ]]; then
  echo "Calling enriched regions with control file"
  # Quoted so a control path containing whitespace is handed to basename as a
  # single operand instead of being split (which would abort under set -e).
  ctrl=$(basename -- "$2")
else
  echo "$2 is not a number or a file"
  exit 1
fi
# Third argument: "norm" to normalize the control to the experimental data,
# "non" to skip normalization. Any other value aborts the script.
# The value goes through an unquoted echo, so it is word-split before the match.
norm=$(echo $3)
case "$norm" in
  norm)
    echo "Normalizing control to experimental bedgraph"
    ;;
  non)
    echo "Proceeding without normalization of control to experimental bedgraph"
    ;;
  *)
    echo "Must specify \"norm\" for normalized or \"non\" for non-normalized data processing in third input"
    exit 1
    ;;
esac
# Fourth argument: "union" to apply a max-signal threshold in addition to the
# AUC threshold, "AUC" to use the AUC threshold alone. Any other value aborts.
# The value goes through an unquoted echo, so it is word-split before the match.
height=$(echo $4)
case "$height" in
  union)
    echo "Using peak height in addition to AUC threshold"
    ;;
  AUC)
    echo "Proceeding without peak height threshold"
    ;;
  *)
    echo "Must specify \"union\" to include max signal threshold or \"AUC\" for no max signal threshold in fourth input"
    exit 1
    ;;
esac
#######################################
# Collapse a bedgraph into blocks of abutting intervals and summarize each block.
# Consecutive lines belong to one block while they share a chromosome and each
# start equals the previous end.
# Arguments:
#   $1 - path of a bedgraph file (columns: chr, start, end, signal)
# Outputs:
#   One tab-separated line per block on stdout:
#   chr, start, end, AUC (sum of signal * interval length), max signal, and the
#   region running from the first interval that reached the max signal to the
#   end of the last interval equal to it.
# Returns:
#   Non-zero when the file cannot be read.
#######################################
make_auc_bed() {
  if [[ ! -r $1 ]]; then
    echo "Cannot read bedgraph file: $1" >&2
    return 1
  fi
  awk '
    # Print the block accumulated so far.
    function emit() {
      print chr "\t" start "\t" stop "\t" auc "\t" max "\t" coord
    }
    # Start a new block from the current line.
    function open_block() {
      chr = $1; start = $2; stop = $3; max = $4
      coord = $1 ":" $2 "-" $3
      auc = $4 * ($3 - $2)
    }
    # The first line of the file is always skipped.
    # NOTE(review): presumably a track header line; a bedgraph without a header
    # loses its first interval here - confirm before changing.
    NR == 1 { next }
    !have { open_block(); have = 1; next }
    chr == $1 && $2 == stop {
      stop = $3
      auc = auc + ($4 * ($3 - $2))
      if ($4 > max) {
        max = $4
        coord = $1 ":" $2 "-" $3
      } else if ($4 == max) {
        # Tie with the current max: keep the original start, extend the end.
        split(coord, t, "-")
        coord = t[1] "-" $3
      }
      next
    }
    { emit(); open_block() }
    # Flush the block still open at end of input so the last region is kept.
    END { if (have) emit() }
  ' "$1"
}

echo "Creating experimental AUC file: $(date)"
make_auc_bed "$1" > "${password}.auc.bed"
# Columns 4 and 5 (AUC and max signal) are the input of the threshold step.
cut -f 4,5 "${password}.auc.bed" > "${password}.auc"
if [[ -f $2 ]]; then
  echo "Creating control AUC file: $(date)"
  make_auc_bed "$2" > "${password2}.auc.bed"
  cut -f 4,5 "${password2}.auc.bed" > "${password2}.auc"
fi
# module load R ## For use on cluster
echo "Calculating optimal AUC threshold: $(date)"
# Directory holding this script; SEACR_1.0.R is looked up in the same directory.
# Quoted so an installation path containing whitespace is not split.
path=$(dirname -- "$0")

# Pick the --ctrl and --norm values for the R step:
#   control file + "norm" -> control AUC file, normalization on
#   control file otherwise -> control AUC file, normalization off
#   no control file        -> the second argument itself, normalization off
if [[ -f $2 && $norm == "norm" ]]; then
  echo "Calculating threshold using normalized control: $(date)"
  r_ctrl="${password2}.auc"
  r_norm=yes
elif [[ -f $2 ]]; then
  echo "Calculating threshold using non-normalized control: $(date)"
  r_ctrl="${password2}.auc"
  r_norm=no
else
  echo "Using user-provided threshold: $(date)"
  r_ctrl=$2
  r_norm=no
fi

# Later steps read ${password}.threshold.txt (and ${password}.norm.txt when
# normalizing); presumably the R script writes them under the --output prefix.
Rscript "${path}/SEACR_1.0.R" --exp="${password}.auc" --ctrl="${r_ctrl}" --norm="${r_norm}" --output="${password}"
# Thresholds for this run: line 1 of the threshold file is compared against
# column 4 (AUC) and line 2 against column 5 (max signal).
thresh=$(sed -n '1p' "${password}.threshold.txt")
thresh2=$(sed -n '2p' "${password}.threshold.txt")
# An empty threshold would reach awk as an empty string and turn the comparison
# into a string comparison that every non-empty field satisfies, so every
# region would pass. Fail loudly instead.
if [[ -z $thresh ]]; then
  echo "No AUC threshold found in ${password}.threshold.txt" >&2
  exit 1
fi
echo "Creating thresholded feature file: $(date)"
if [[ $height == "union" ]]; then
  if [[ -z $thresh2 ]]; then
    echo "No max signal threshold found in ${password}.threshold.txt" >&2
    exit 1
  fi
  # Keep a region when either its AUC or its max signal exceeds the respective
  # threshold (behavior as of 2/7/19; the earlier version kept regions with
  # AUC above the threshold and max signal below the second threshold).
  awk -v value="$thresh" -v value2="$thresh2" '$4 > value || $5 > value2' "${password}.auc.bed" > "${password}.auc.threshold.bed"
else
  # AUC-only mode: keep regions whose AUC exceeds the threshold.
  awk -v value="$thresh" '$4 > value' "${password}.auc.bed" > "${password}.auc.threshold.bed"
fi
# With a control file, build the thresholded control feature file.
# The control is filtered on column 4 (AUC) only; no max-signal test is applied
# here, whichever union/AUC mode was selected.
if [[ -f $2 ]]; then
  if [[ $norm == "norm" ]]; then
    # When normalizing, scale column 4 of the control regions by the constant on
    # the first line of the norm file, then replace the control file with the
    # scaled copy. Column 5 is left as it is.
    constant=$(cat "${password}.norm.txt" | sed -n '1p')
    awk -v mult=$constant 'BEGIN { OFS = "\t" } { $4 *= mult; print }' "${password2}.auc.bed" > "${password2}.auc2.bed"
    mv "${password2}.auc2.bed" "${password2}.auc.bed"
  fi
  awk -v value=$thresh '$4 > value' "${password2}.auc.bed" > "${password2}.auc.threshold.bed"
fi
#######################################
# Compute the merge distance: one tenth of the mean feature length.
# Arguments:
#   $1 - feature bed file (columns 2 and 3 are start and end)
# Outputs:
#   The distance on stdout; 0 when the file has no lines, which avoids a
#   division by zero.
#######################################
mean_feature_gap() {
  awk '{ s += $3 - $2; t++ } END { if (t > 0) { print s / (t * 10) } else { print 0 } }' "$1"
}

#######################################
# Merge consecutive features that are on the same chromosome and closer than a
# given distance.
# Arguments:
#   $1 - feature bed file (chr, start, end, AUC, max signal, max signal region)
#   $2 - distance: a feature joins the current one when its start is less than
#        the current end plus this value
# Outputs:
#   Merged features on stdout, tab-separated, same six columns. AUC values are
#   summed; the max signal is the largest of the merged features; on a tie the
#   max signal region runs from the start of the earlier region to the end of
#   the later one.
#######################################
merge_features() {
  awk -v value="$2" '
    # Print the feature accumulated so far.
    function emit() {
      print chr "\t" start "\t" stop "\t" auc "\t" max "\t" coord
    }
    # Start a new feature from the current line.
    function open_feature() {
      chr = $1; start = $2; stop = $3; auc = $4; max = $5; coord = $6
    }
    !have { open_feature(); have = 1; next }
    chr == $1 && $2 < stop + value {
      stop = $3
      auc = auc + $4
      if ($5 > max) {
        max = $5
        coord = $6
      } else if ($5 == max) {
        split(coord, t, "-")
        split($6, u, "-")
        coord = t[1] "-" u[2]
      }
      next
    }
    { emit(); open_feature() }
    # Flush the feature still open at end of input so the last one is kept.
    END { if (have) emit() }
  ' "$1"
}

echo "Merging nearby features and eliminating control-enriched features: $(date)"
# module load bedtools ## For use on cluster
mean=$(mean_feature_gap "${password}.auc.threshold.bed")
if [[ -f $2 ]]; then
  # With a control: keep only merged features that overlap no thresholded
  # control feature (bedtools intersect -v).
  merge_features "${password}.auc.threshold.bed" "$mean" \
    | bedtools intersect -wa -v -a - -b "${password2}.auc.threshold.bed" > "${5}.auc.threshold.merge.bed"
else
  merge_features "${password}.auc.threshold.bed" "$mean" > "${5}.auc.threshold.merge.bed"
fi
#######################################
# Delete the intermediate files of this run.
# Globals:
#   password, password2 (read) - prefixes of the temporary file names
# Outputs:
#   None. Files that do not exist are skipped (rm -f), so the same call works
#   with or without a control file and with or without normalization; in
#   particular a missing norm file no longer aborts the script under set -e.
#######################################
remove_temp_files() {
  rm -f -- \
    "${password}.auc.bed" \
    "${password}.auc" \
    "${password}.threshold.txt" \
    "${password}.auc.threshold.bed" \
    "${password}.norm.txt" \
    "${password2}.auc.bed" \
    "${password2}.auc" \
    "${password2}.auc.threshold.bed"
}

echo "Removing temporary files: $(date)"
remove_temp_files
echo "Done: $(date)"