-
Notifications
You must be signed in to change notification settings - Fork 47
/
preprocess.sh
executable file
·131 lines (111 loc) · 4.21 KB
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash
#
# preprocess.sh
#
# This script preprocesses a FASTQ-formatted file
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
#
# Steps: cutadapt -> crop -> dust removal (uniq runs between cutadapt and crop only when <Y/N uniq> is Y)
#
# 12/20/12 - modified to switch to cutadapt for trimming
# 12/31/12 - modified from Cshell to BASH version for timing
#
# Copyright (C) 2014 Charles Chiu - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
# Script name without its directory part; used as a tag in the usage text and
# in every log line below.
scriptname=${0##*/}

# Exactly ten positional arguments are required. A wrong count is a usage
# error: report it on stderr and exit non-zero so a calling pipeline can tell
# that preprocessing did not run.
if (( $# != 10 )); then
  echo "Usage: $scriptname <R1 FASTQ file> <S/I quality> <Y/N uniq> <length_cutoff; 0 for no length_cutoff> <Y/N keep short reads> <adapter_set> <start_nt> <crop_length> <temporary_files_directory> <quality_cutoff>" >&2
  exit 1
fi
###
# Bind the ten positional parameters to named variables, in the order listed
# by the usage message.
inputfile="${1}"                  # <R1 FASTQ file>
quality="${2}"                    # <S/I quality>
run_uniq="${3}"                   # <Y/N uniq>
length_cutoff="${4}"              # <length_cutoff; 0 for no length_cutoff>
keep_short_reads="${5}"           # <Y/N keep short reads>
adapter_set="${6}"                # <adapter_set>
start_nt="${7}"                   # <start_nt>
crop_length="${8}"                # <crop_length>
temporary_files_directory="${9}"  # <temporary_files_directory>
quality_cutoff="${10}"            # <quality_cutoff>
###
# Stop early when the input FASTQ is not a regular file. The expansion is
# quoted so that an empty path or a path containing spaces cannot slip past
# the test, and the failure is reported on stderr with a non-zero status.
if [[ ! -f "$inputfile" ]]; then
  echo "$inputfile not found!" >&2
  exit 1
fi
# Log which quality encoding was selected: "S" is reported as Sanger, any
# other value as Illumina. Quoting keeps the test well-formed when $quality
# is empty.
if [[ "$quality" == "S" ]]; then
  echo -e "$(date)\t$scriptname\tselected Sanger quality"
else
  echo -e "$(date)\t$scriptname\tselected Illumina quality"
fi
#######################################
# Report whether the first line of a file contains a space character.
# Arguments: $1 - path of the file to inspect
# Outputs:   "SPACE" or "NOSPACE" on stdout (an empty or unreadable file
#            yields "NOSPACE")
#######################################
header_space_status() {
  local first_line=""
  # read returns non-zero when the line has no trailing newline, and the
  # redirection fails for an unreadable file; in both cases first_line holds
  # whatever could be read, so the status is deliberately ignored.
  IFS= read -r first_line < "$1" || true
  if [[ "$first_line" == *" "* ]]; then
    echo "SPACE"
  else
    echo "NOSPACE"
  fi
}

# Detect whether the first header line contains a space; the cutadapt step
# below rewrites the headers when $s is "SPACE".
s=$(header_space_status "$inputfile")
echo -e "$(date)\t$scriptname\t$s in header"

# Intermediate and output files are named after the input's basename with a
# trailing ".fastq" removed.
nopathf=${1##*/}
basef=${nopathf%.fastq}
#################### START OF PREPROCESSING, READ1 #########################
# run cutadapt, Read1
#
# When the header contains a space, the headers are first rewritten into a
# temporary "$basef.modheader.fastq", cutadapt_quality.csh is run on that copy,
# and its "$basef.modheader.cutadapt.fastq" output is renamed to
# "$basef.cutadapt.fastq". Otherwise cutadapt_quality.csh runs directly on the
# input file. Only the cutadapt_quality.csh call (plus the rename/cleanup) is
# timed; the sed rewrite happens before the timer starts.
echo -e "$(date)\t$scriptname\t********** running cutadapt, Read1 **********"
if [[ "$s" == "SPACE" ]]; then
  # Rewrites "<token> <x>:<y>:0:<rest>" as "<token>#<rest>/<x>".
  # NOTE(review): [@HWI|@M00135|@SRR] is a bracket expression, so it matches a
  # single character from that set rather than one of the three prefixes;
  # presumably an alternation was intended -- confirm before changing.
  sed "s/\([@HWI|@M00135|@SRR][^ ]*\) \(.\):.:0:\(.*\)/\1#\3\/\2/g" "$inputfile" > "$basef.modheader.fastq"
  # modified to take into account anything in there [N or Y]
  START1=$(date +%s)
  cutadapt_quality.csh "$basef.modheader.fastq" "$quality" "$length_cutoff" "$keep_short_reads" "$adapter_set" "$temporary_files_directory" "$quality_cutoff"
  mv "$basef.modheader.cutadapt.fastq" "$basef.cutadapt.fastq"
  rm -f "$basef.modheader.fastq"
else
  START1=$(date +%s)
  cutadapt_quality.csh "$inputfile" "$quality" "$length_cutoff" "$keep_short_reads" "$adapter_set" "$temporary_files_directory" "$quality_cutoff"
fi
END1=$(date +%s)
diff=$(( END1 - START1 ))
echo -e "$(date)\t$scriptname\tDone cutadapt: CUTADAPT took $diff seconds"
# run uniq, Read1
#
# Only runs when <Y/N uniq> is "Y". Reads "$basef.cutadapt.fastq" and writes
# "$basef.cutadapt.uniq.fastq", logging the elapsed wall-clock seconds.
if [[ "$run_uniq" == "Y" ]]; then
  echo -e "$(date)\t$scriptname\t********** running uniq, Read1 **********"
  START1=$(date +%s)
  # --adjust is 64 for quality "S" and 32 otherwise. Its meaning is defined by
  # the external `fastq` tool -- TODO confirm this mapping is the intended one.
  if [[ "$quality" == "S" ]]; then
    uniq_adjust=64
  else
    uniq_adjust=32
  fi
  fastq filter --unique --adjust "$uniq_adjust" "$basef.cutadapt.fastq" > "$basef.cutadapt.uniq.fastq"
  END1=$(date +%s)
  diff=$(( END1 - START1 ))
  echo -e "$(date)\t$scriptname\tDone uniq: UNIQ took $diff seconds"
fi
# run crop, Read 1
#
# Crops the output of the previous step with crop_reads.csh, passing
# $start_nt and $crop_length, and logs the elapsed wall-clock seconds.
echo -e "$(date)\t$scriptname\t********** running crop, Read1 **********"
START1=$(date +%s)
# The input is the uniq output when uniq ran, otherwise the cutadapt output;
# the cropped file keeps the same stem with ".cropped" appended.
if [[ "$run_uniq" == "Y" ]]; then
  crop_stem="$basef.cutadapt.uniq"
else
  crop_stem="$basef.cutadapt"
fi
echo -e "$(date)\t$scriptname\tWe will be using $crop_length as the length of the cropped read"
crop_reads.csh "$crop_stem.fastq" "$start_nt" "$crop_length" > "$crop_stem.cropped.fastq"
END1=$(date +%s)
diff=$(( END1 - START1 ))
echo -e "$(date)\t$scriptname\tDone crop: CROP took $diff seconds"
# run dust, Read1
#
# Runs prinseq-lite.pl with "-lc_method dust -lc_threshold 7" on the cropped
# reads, then moves the "<stem>.dusted.fastq" result to
# "$basef.preprocessed.fastq", the final output of this script. Reads written
# via -out_bad go to "<stem>.dusted.bad".
echo -e "$(date)\t$scriptname\t********** running dust, Read1 **********"
START1=$(date +%s)
# The cropped file name depends on whether the uniq step ran.
if [[ "$run_uniq" == "Y" ]]; then
  dust_stem="$basef.cutadapt.uniq.cropped"
else
  dust_stem="$basef.cutadapt.cropped"
fi
prinseq-lite.pl -fastq "$dust_stem.fastq" -out_format 3 -out_good "$dust_stem.dusted" -out_bad "$dust_stem.dusted.bad" -log -lc_method dust -lc_threshold 7
mv -f "$dust_stem.dusted.fastq" "$basef.preprocessed.fastq"
END1=$(date +%s)
diff=$(( END1 - START1 ))
echo -e "$(date)\t$scriptname\tDone dust: DUST took $diff seconds"