forked from chiulab/surpi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cutadapt_quality.csh
executable file
·118 lines (111 loc) · 7.13 KB
/
cutadapt_quality.csh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/csh
#
# cutadapt_quality.csh
#
# runs cutadapt to remove primer sequences from Illumina files
# also accepts a quality argument for Illumina / Sanger quality
# user specifies length cutoff
# user specifies whether short reads less than length cutoff are kept; if so, they are converted to size=1
#
# Chiu Laboratory
# University of California, San Francisco
# January, 2014
#
# *** modified by Scot Federman, 2013 ***
# -- Added TEMPDIR var for portability to AWS
# -- TEMPDIR specifies a directory that can be used for temp storage during the life of this program execution
# -- AWS typically has boot volumes of only 8GB, which is too small for the data created.
#
# Copyright (C) 2014 Charles Chiu - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
if ($#argv != 7) then
echo "Usage: cutadapt_quality.csh <input FASTQ file> <quality S/I> <length cutoff> <keep short reads Y/N> <adapter_set> <temporary_files_directory> <quality cutoff>"
exit(1)
endif
###
set inputfile = $argv[1]
set quality = $argv[2]
set length_cutoff = $argv[3]
set keep_short_reads = $argv[4]
set adapter_set = $argv[5]
set TEMPDIR = $argv[6]
set quality_cutoff = $argv[7]
###
#set numreads_start = `egrep -c "@HWI|@M00|@SRR" $inputfile`
#echo $numreads_start" reads at beginning of cutadapt"
if ($quality == "S") then
echo "Quality is Sanger, quality filtering <15 by default"
set qual = 33
else
echo "Quality is Illumina, quality filtering <15 by default"
set qual = 64
endif
if ($adapter_set == "Truseq") then
if ($keep_short_reads == "N") then # delete short reads
cutadapt -g GTTTCCCACTGGAGGATA -a TATCCTCCAGTGGGAAAC -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT -g GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -a AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATC -n 15 -O 10 -q $quality_cutoff -m $length_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
echo -n "****** removing reads of size less than "
echo -n $length_cutoff
echo " bp ******"
sed 's/^$/N/g' $TEMPDIR{$$} > $inputfile:r.cutadapt.fastq
else
cutadapt -g GTTTCCCACTGGAGGATA -a TATCCTCCAGTGGGAAAC -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT -g GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -a AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATC -n 15 -O 10 -q $quality_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
# convert entries <50 to size=1 ("N")
echo -n "****** converting reads of size less than "
echo -n $length_cutoff
echo " bp to N ******"
sed 's/^$/N/g' $TEMPDIR{$$} | awk 'NR%2==1 {print $0} NR%2==0 {if ( length ( $0 ) < '$length_cutoff') print "N"; else print $0}' > $inputfile:r.cutadapt.fastq
endif
else if ($adapter_set == "Nextera") then
if ($keep_short_reads == "N") then # delete short reads
cutadapt -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -a CTGTCTCTTATACACATCTGACGCTGCCGACGA -a CTGTCTCTTATACACATCT -n 15 -O 15 -q $quality_cutoff -m $length_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
echo -n "****** removing reads of size less than "
echo -n $length_cutoff
echo " bp ******"
sed 's/^$/N/g' $TEMPDIR{$$} > $inputfile:r.cutadapt.fastq
else
cutadapt -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -a CTGTCTCTTATACACATCTGACGCTGCCGACGA -a CTGTCTCTTATACACATCT -n 15 -O 15 -q $quality_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
# convert entries <50 to size=1 ("N")
echo -n "****** converting reads of size less than "
echo -n $length_cutoff
echo " bp to N ******"
sed 's/^$/N/g' $TEMPDIR{$$} | awk 'NR%2==1 {print $0} NR%2==0 {if ( length ( $0 ) < '$length_cutoff') print "N"; else print $0}' > $inputfile:r.cutadapt.fastq
endif
else if ($adapter_set == "NexSolTruseq") then
if ($keep_short_reads == "N") then # delete short reads
cutadapt -g GTTTCCCACTGGAGGATA -a TATCCTCCAGTGGGAAAC -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT -g GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -a AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATC -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -a CTGTCTCTTATACACATCTGACGCTGCCGACGA -a CTGTCTCTTATACACATCT -n 15 -O 15 -q $quality_cutoff -m $length_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
echo -n "****** removing reads of size less than "
echo -n $length_cutoff
echo " bp ******"
sed 's/^$/N/g' $TEMPDIR{$$} > $inputfile:r.cutadapt.fastq
else
cutadapt -g GTTTCCCACTGGAGGATA -a TATCCTCCAGTGGGAAAC -a AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT -g GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -a AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATC -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -a CTGTCTCTTATACACATCTGACGCTGCCGACGA -a CTGTCTCTTATACACATCT -n 15 -O 15 -q $quality_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
# convert entries <50 to size=1 ("N")
echo -n "****** converting reads of size less than "
echo -n $length_cutoff
echo " bp to N ******"
sed 's/^$/N/g' $TEMPDIR{$$} | awk 'NR%2==1 {print $0} NR%2==0 {if ( length ( $0 ) < '$length_cutoff') print "N"; else print $0}' > $inputfile:r.cutadapt.fastq
endif
else if ($adapter_set == "NexSolB") then
if ($keep_short_reads == "N") then # delete short reads
cutadapt -g GTTTCCCACTGGAGGATA -a TATCCTCCAGTGGGAAAC -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -a CTGTCTCTTATACACATCTGACGCTGCCGACGA -a CTGTCTCTTATACACATCT -n 15 -O 15 -q $quality_cutoff -m $length_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
echo -n "****** removing reads of size less than "
echo -n $length_cutoff
echo " bp ******"
sed 's/^$/N/g' $TEMPDIR{$$} > $inputfile:r.cutadapt.fastq
else
cutadapt -g GTTTCCCACTGGAGGATA -a TATCCTCCAGTGGGAAAC -a CTGTCTCTTATACACATCTCCGAGCCCACGAGAC -a CTGTCTCTTATACACATCTGACGCTGCCGACGA -a CTGTCTCTTATACACATCT -n 15 -O 15 -q $quality_cutoff --quality-base=$qual -o $TEMPDIR{$$} --info-file=$inputfile:r.adapterinfo.log $inputfile > $inputfile:r.cutadapt.summary.log
# convert entries <50 to size=1 ("N")
echo -n "****** converting reads of size less than "
echo -n $length_cutoff
echo " bp to N ******"
sed 's/^$/N/g' $TEMPDIR{$$} | awk 'NR%2==1 {print $0} NR%2==0 {if ( length ( $0 ) < '$length_cutoff') print "N"; else print $0}' > $inputfile:r.cutadapt.fastq
endif
else
echo "No adapter set selected!!!!!"
endif
#set numreads_end = `egrep -c "@HWI|@M00|@SRR" $inputfile:r.cutadapt.fastq`
#@ reads_removed = $numreads_start - $numreads_end
#echo $reads_removed" reads removed by cutadapt"
#echo $numreads_end" reads at end of cutadapt"
rm -f $TEMPDIR{$$}