-
Notifications
You must be signed in to change notification settings - Fork 1
/
00-Bismark.sh
executable file
·140 lines (97 loc) · 3.23 KB
/
00-Bismark.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/bin/bash
## Job Name
#SBATCH --job-name=olyoly
## Allocation Definition
#SBATCH --account=srlab
#SBATCH --partition=srlab
## Resources
## Nodes (We only get 1, so this is fixed)
#SBATCH --nodes=1
## Walltime (days-hours:minutes:seconds format)
#SBATCH --time=6-00:00:00
## Memory per node
#SBATCH --mem=100G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=sr320@uw.edu
## Specify the working directory for this job
#SBATCH --chdir=/gscratch/scrubbed/sr320/020320-oly/
#Script output available at https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/
# Directories and programs
bismark_dir="/gscratch/srlab/programs/Bismark-0.21.0"
bowtie2_dir="/gscratch/srlab/programs/bowtie2-2.3.4.1-linux-x86_64/"
samtools="/gscratch/srlab/programs/samtools-1.9/samtools"
reads_dir="/gscratch/srlab/sr320/data/olurida-bs/decomp/"
genome_folder="/gscratch/srlab/sr320/data/olurida-genomes/v081/"
source /gscratch/srlab/programs/scripts/paths.sh
# ${bismark_dir}/bismark_genome_preparation \
# --verbose \
# --parallel 28 \
# --path_to_aligner ${bowtie2_dir} \
# ${genome_folder}
find ${reads_dir}*_s456_trimmed.fq \
| xargs basename -s _s456_trimmed.fq | xargs -I{} ${bismark_dir}/bismark \
--path_to_bowtie ${bowtie2_dir} \
-genome ${genome_folder} \
-p 4 \
--non_directional \
${reads_dir}/{}_s456_trimmed.fq
find *.bam | \
xargs basename -s .bam | \
xargs -I{} ${bismark_dir}/deduplicate_bismark \
--bam \
{}.bam
${bismark_dir}/bismark_methylation_extractor \
--bedGraph --counts --scaffolds \
--multicore 14 \
--buffer_size 75% \
*deduplicated.bam
# Bismark processing report
${bismark_dir}/bismark2report
#Bismark summary report
${bismark_dir}/bismark2summary
# Sort files for methylkit and IGV
find *deduplicated.bam | \
xargs basename -s .bam | \
xargs -I{} ${samtools} \
sort --threads 28 {}.bam \
-o {}.sorted.bam
# Index sorted files for IGV
# The "-@ 16" below specifies number of CPU threads to use.
find *.sorted.bam | \
xargs basename -s .sorted.bam | \
xargs -I{} ${samtools} \
index -@ 28 {}.sorted.bam
find *deduplicated.bismark.cov.gz \
| xargs basename -s _trimmed_bismark_bt2.deduplicated.bismark.cov.gz \
| xargs -I{} ${bismark_dir}/coverage2cytosine \
--genome_folder ${genome_folder} \
-o {} \
--merge_CpG \
--zero_based \
{}_trimmed_bismark_bt2.deduplicated.bismark.cov.gz
#creating bedgraphs post merge
for f in *merged_CpG_evidence.cov
do
STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4}}' \
> "${STEM}"_10x.bedgraph
done
for f in *merged_CpG_evidence.cov
do
STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4}}' \
> "${STEM}"_5x.bedgraph
done
#creating tab files with raw count for glms
for f in *merged_CpG_evidence.cov
do
STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 10) {print $1, $2, $3, $4, $5, $6}}' \
> "${STEM}"_10x.tab
done
for f in *merged_CpG_evidence.cov
do
STEM=$(basename "${f}" .CpG_report.merged_CpG_evidence.cov)
cat "${f}" | awk -F $'\t' 'BEGIN {OFS = FS} {if ($5+$6 >= 5) {print $1, $2, $3, $4, $5, $6}}' \
> "${STEM}"_5x.tab
done