-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcompressTrinity
executable file
·255 lines (227 loc) · 11.9 KB
/
compressTrinity
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#!/usr/bin/env bash
#######################################
# Print the help text for compressTrinity.
# Arguments: none
# Outputs:   writes the help message to stdout
# Returns:   exit status of cat (0 on success)
#######################################
usage() {
  # Quoted delimiter: the help text is emitted literally, without any
  # parameter or command expansion.
  cat <<'EOF'
compressTrinity
Version 0.1 (2020-05-13)
License: GNU GPLv2
To report bugs or errors, please contact Daren Card (daren.card@gmail.com).
This script is provided as-is, with no support and no guarantee of proper or
desirable functioning.
This script will compress the standard output of Trinity, potentially saving
significant amounts of space. Based on (very limited) testing, it appears that
compressing could result in as little as roughly 20% the disk usage of the full
output. This script can also be used to uncompress previously-compressed
Trinity output.
The user must provide the absolute or relative path to the base directory of
the Trinity output. By default this is "trinity_out_dir" but it can also be
manually set by the user and could vary. The script will automatically detect
what mode of Trinity was run (regular vs. genome-guided) and will compress files
accordingly, so the user must ensure that Trinity.fasta (regular) or Trinity-GG.fasta
(genome-guided) are present in the base directory for detection purposes. The user
may also choose to adjust the compression level, though the default of 9 (max
compression) is suggested. The number of compression cores can also be set by the user.
Note that this script relies on pigz for multi-threaded compression, though
the default number of cores to use is 1. Ensure that pigz is installed for
proper functioning. Also note that this script will not compress any additional,
non-standard Trinity output that might be present, such as that created by
the user. It is recommended that the user double check the sizes of all files
and perform additional compression, where necessary. This script should not
modify any non-standard files/directories in the base directory.
compressTrinity -b <base_directory> (-c | -d) [ -l <level> -t <threads> -h ]
OPTIONS:
-h usage information and help (this message)
-b base directory of the Trinity output
-c binary flag to compress the base directory contents [false]
-d binary flag to decompress the base directory contents [false]
-l the level of compression (1-9) [9]
-t number of computer threads to use for compression [1]
EOF
}
# Defaults for the optional settings; each may be overridden on the command line.
THREAD=1            # -t: number of pigz threads
COMPRESS='false'    # -c: set to 'true' to compress
DECOMPRESS='false'  # -d: set to 'true' to decompress
LEVEL=9             # -l: pigz compression level

# Parse the command line. BASE (-b) has no default and is left unset here
# when the option is not supplied.
while getopts "hb:cdl:t:" OPTION
do
  case "${OPTION}" in
    h)
      # Explicit request for help: print it and exit successfully.
      usage
      exit 0
      ;;
    b)
      BASE=$OPTARG
      ;;
    c)
      COMPRESS='true'
      ;;
    d)
      DECOMPRESS='true'
      ;;
    l)
      LEVEL=$OPTARG
      ;;
    t)
      THREAD=$OPTARG
      ;;
    *)
      # getopts sets OPTION to '?' for an unknown option or a missing
      # option argument (and prints its own diagnostic); treat as an error.
      usage
      exit 1
      ;;
  esac
done
# Argument validation. Every failure prints a message to stderr, shows the
# usage text, and exits with status 1.

# -b is mandatory.
if [[ -z ${BASE} ]]
then
  echo -e 'Error: base directory not provided.\n' >&2
  usage
  exit 1
fi

# Fail early with a precise message instead of the later, misleading
# "cannot detect the type of Trinity run" error.
if [[ ! -d ${BASE} ]]
then
  echo -e "Error: base directory '${BASE}' does not exist or is not a directory.\n" >&2
  usage
  exit 1
fi

# -c and -d are mutually exclusive ...
if [[ ${COMPRESS} == 'true' && ${DECOMPRESS} == 'true' ]]
then
  echo -e 'Error: compression and decompression both selected.\n' >&2
  usage
  exit 1
fi

# ... and exactly one of them is required.
if [[ ${COMPRESS} == 'false' && ${DECOMPRESS} == 'false' ]]
then
  echo -e 'Error: compression or decompression not selected.\n' >&2
  usage
  exit 1
fi

# The thread count is handed to "pigz -p", so it must be a positive integer.
thread_re='^[1-9][0-9]*$'
if [[ ! ${THREAD} =~ ${thread_re} ]]
then
  echo -e "Error: number of threads must be a positive integer (got '${THREAD}').\n" >&2
  usage
  exit 1
fi

# The level is spliced into the pigz command line as "-<level>"; it is only
# used when compressing. pigz accepts levels 0-9 and 11.
level_re='^([0-9]|11)$'
if [[ ${COMPRESS} == 'true' && ! ${LEVEL} =~ ${level_re} ]]
then
  echo -e "Error: invalid compression level '${LEVEL}' (expected 0-9 or 11).\n" >&2
  usage
  exit 1
fi

# pigz performs all (de)compression, so it must be on PATH and executable.
if ! [[ -x "$(command -v pigz)" ]]
then
  echo -e 'Error: pigz is not installed.\n' >&2
  usage
  exit 1
fi
# ---------------------------------------------------------------------------
# Compression (-c)
#
# Detects the Trinity mode from the presence of Trinity.fasta (normal) or
# Trinity-GG.fasta (genome-guided) in BASE, then gzips the known output files
# in place with pigz and replaces the known output directories with
# <dir>.tar.gz archives.
#
# Reads:  BASE, LEVEL, THREAD, COMPRESS
# Exits:  1 if the mode cannot be detected or if any step fails.
#
# NOTE: tar stores members under the path exactly as built from BASE, so the
# archives must later be extracted from the same working directory (and with
# the same -b value) to land back in place.
# ---------------------------------------------------------------------------
if [[ ${COMPRESS} == 'true' ]]
then
  # Without pipefail the status of "tar | pigz" is pigz's alone, so a failed
  # tar would go unnoticed and the source directory would still be deleted.
  set -o pipefail

  # Shared pigz invocation, kept as an array so each argument stays one word.
  pigz_cmd=(pigz "-${LEVEL}" -p "${THREAD}")
  compress_ok='true'

  # normal Trinity mode
  if [[ -f "${BASE}/Trinity.fasta" ]]
  then
    # The chain stops at the first failing step; globs are left unquoted on
    # purpose so they expand to the matching files.
    "${pigz_cmd[@]}" "${BASE}/both.fa" &&
    "${pigz_cmd[@]}" "${BASE}/inchworm.fa" &&
    "${pigz_cmd[@]}" "${BASE}/jellyfish.kmers.25.asm.fa" &&
    "${pigz_cmd[@]}" "${BASE}/partitioned_reads.files.list" &&
    "${pigz_cmd[@]}" "${BASE}/recursive_trinity.cmds.completed" &&
    "${pigz_cmd[@]}" "${BASE}/recursive_trinity.cmds" &&
    "${pigz_cmd[@]}" "${BASE}/scaffolding_entries.sam" &&
    "${pigz_cmd[@]}" "${BASE}/Trinity.fasta.gene_trans_map" &&
    "${pigz_cmd[@]}" "${BASE}/Trinity.fasta" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/bundled_iworm_contigs.fasta" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/GraphFromIwormFasta.out" &&
    "${pigz_cmd[@]}" "${BASE}"/chrysalis/inchworm.fa.min100.?.bt2 &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/inchworm.fa.min100" &&
    "${pigz_cmd[@]}" "${BASE}"/chrysalis/inchworm.fa.min100.rev.?.bt2 &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/iworm_cluster_welds_graph.txt" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/iworm_cluster_welds_graph.txt.sorted" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/iworm_cluster_welds_graph.txt.sorted.wIwormNames" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/iworm_scaffolds.txt" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/readsToComponents.out" &&
    "${pigz_cmd[@]}" "${BASE}/chrysalis/readsToComponents.out.sort" &&
    tar cf - "${BASE}/read_partitions" | "${pigz_cmd[@]}" > "${BASE}/read_partitions.tar.gz" &&
    rm -rf "${BASE}/read_partitions" &&
    "${pigz_cmd[@]}" "${BASE}"/insilico_read_normalization/*.fq ||
    compress_ok='false'

  # reference guided Trinity mode
  elif [[ -f "${BASE}/Trinity-GG.fasta" ]]
  then
    # Collect the per-partition directories first (NUL-delimited, so unusual
    # path names survive), searching only inside BASE. -prune stops find from
    # descending into a directory that is about to be archived as a whole.
    gg_dirs=()
    while IFS= read -r -d '' gg_dir
    do
      gg_dirs+=("${gg_dir}")
    done < <(find "${BASE}" -type d \
      -name "Dir_*_merged_stranded.bam.norm_200.bam.?.sam.minC1.gff" \
      -prune -print0)

    # Archive each directory; delete it only once its tarball was written
    # successfully.
    for gg_dir in "${gg_dirs[@]}"
    do
      if tar cf - "${gg_dir}" | "${pigz_cmd[@]}" > "${gg_dir}.tar.gz"
      then
        rm -rf "${gg_dir}"
      else
        compress_ok='false'
      fi
    done

    # These globs are expanded only now, after the Dir_* directories above
    # have been replaced by tarballs, so the *.minC1.gff patterns no longer
    # match those directories.
    "${pigz_cmd[@]}" "${BASE}"/Dir_*_merged_stranded.bam.norm_200.bam.-.sam.minC1.gff.listing &&
    "${pigz_cmd[@]}" "${BASE}"/Dir_*_merged_stranded.bam.norm_200.bam.+.sam.minC1.gff.listing &&
    "${pigz_cmd[@]}" "${BASE}/read_files.list" &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.frag_coords &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.frag_coords &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.frag_coverage.wig &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.frag_coverage.wig &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.minC1.gff &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.minC1.gff &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.read_coords &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.read_coords &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.read_coords.sort_by_readname &&
    "${pigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.read_coords.sort_by_readname &&
    "${pigz_cmd[@]}" "${BASE}/trinity_GG.cmds" &&
    "${pigz_cmd[@]}" "${BASE}/trinity_GG.cmds.completed" &&
    "${pigz_cmd[@]}" "${BASE}/Trinity-GG.fasta" &&
    "${pigz_cmd[@]}" "${BASE}/Trinity-GG.fasta.gene_trans_map" ||
    compress_ok='false'

  else
    echo -e "Error: Cannot detect the type of Trinity run to compress. Please ensure that one of the Trinity files is provided in the base directory:" >&2
    echo -e "1. Trinity.fasta must be present for the normal Trinity mode" >&2
    echo -e "2. Trinity-GG.fasta must be present for the genome-guided Trinity mode.\n" >&2
    usage
    exit 1
  fi

  # Report an incomplete run through the exit status instead of silently
  # finishing with 0.
  if [[ ${compress_ok} != 'true' ]]
  then
    echo -e 'Error: compression did not complete successfully; see the messages above.\n' >&2
    exit 1
  fi
fi
# ---------------------------------------------------------------------------
# Decompression (-d)
#
# Detects the Trinity mode from the presence of Trinity.fasta.gz (normal) or
# Trinity-GG.fasta.gz (genome-guided) in BASE, then gunzips the known output
# files in place with pigz and unpacks the <dir>.tar.gz archives.
#
# Reads:  BASE, THREAD, DECOMPRESS
# Exits:  1 if the mode cannot be detected or if any step fails.
#
# NOTE: the archives are unpacked relative to the current working directory,
# because the compression step stores members under the path built from BASE.
# Run this from the same directory, with the same -b value, as the
# compression run.
# ---------------------------------------------------------------------------
if [[ ${DECOMPRESS} == 'true' ]]
then
  # Without pipefail the status of "pigz | tar" is tar's alone, so a failed
  # pigz could go unnoticed and the tarball would still be deleted.
  set -o pipefail

  # Shared pigz invocation, kept as an array so each argument stays one word.
  unpigz_cmd=(pigz -d -p "${THREAD}")
  decompress_ok='true'

  # normal Trinity mode
  if [[ -f "${BASE}/Trinity.fasta.gz" ]]
  then
    # The chain stops at the first failing step; globs are left unquoted on
    # purpose so they expand to the matching files.
    "${unpigz_cmd[@]}" "${BASE}/both.fa.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/inchworm.fa.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/jellyfish.kmers.25.asm.fa.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/partitioned_reads.files.list.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/recursive_trinity.cmds.completed.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/recursive_trinity.cmds.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/scaffolding_entries.sam.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/Trinity.fasta.gene_trans_map.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/Trinity.fasta.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/bundled_iworm_contigs.fasta.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/GraphFromIwormFasta.out.gz" &&
    "${unpigz_cmd[@]}" "${BASE}"/chrysalis/inchworm.fa.min100.?.bt2.gz &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/inchworm.fa.min100.gz" &&
    "${unpigz_cmd[@]}" "${BASE}"/chrysalis/inchworm.fa.min100.rev.?.bt2.gz &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/iworm_cluster_welds_graph.txt.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/iworm_cluster_welds_graph.txt.sorted.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/iworm_cluster_welds_graph.txt.sorted.wIwormNames.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/iworm_scaffolds.txt.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/readsToComponents.out.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/chrysalis/readsToComponents.out.sort.gz" &&
    # tar writes the extracted tree itself; its stdout is deliberately not
    # redirected, since a redirect would create an empty file exactly where
    # the read_partitions directory has to be restored.
    pigz -dc -p "${THREAD}" "${BASE}/read_partitions.tar.gz" | tar xf - &&
    rm "${BASE}/read_partitions.tar.gz" &&
    "${unpigz_cmd[@]}" "${BASE}"/insilico_read_normalization/*.fq.gz ||
    decompress_ok='false'

  # reference guided Trinity mode
  elif [[ -f "${BASE}/Trinity-GG.fasta.gz" ]]
  then
    # Collect the per-partition tarballs first (NUL-delimited, so unusual
    # path names survive), searching only inside BASE.
    gg_tarballs=()
    while IFS= read -r -d '' gg_tarball
    do
      gg_tarballs+=("${gg_tarball}")
    done < <(find "${BASE}" -type f \
      -name "Dir_*_merged_stranded.bam.norm_200.bam.?.sam.minC1.gff.tar.gz" \
      -print0)

    # Unpack each tarball; delete it only once extraction succeeded.
    for gg_tarball in "${gg_tarballs[@]}"
    do
      if pigz -dc -p "${THREAD}" "${gg_tarball}" | tar xf -
      then
        rm -rf "${gg_tarball}"
      else
        decompress_ok='false'
      fi
    done

    "${unpigz_cmd[@]}" "${BASE}"/Dir_*_merged_stranded.bam.norm_200.bam.-.sam.minC1.gff.listing.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/Dir_*_merged_stranded.bam.norm_200.bam.+.sam.minC1.gff.listing.gz &&
    "${unpigz_cmd[@]}" "${BASE}/read_files.list.gz" &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.frag_coords.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.frag_coords.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.frag_coverage.wig.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.frag_coverage.wig.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.minC1.gff.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.minC1.gff.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.read_coords.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.read_coords.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.-.sam.read_coords.sort_by_readname.gz &&
    "${unpigz_cmd[@]}" "${BASE}"/*_merged_stranded.bam.norm_200.bam.+.sam.read_coords.sort_by_readname.gz &&
    "${unpigz_cmd[@]}" "${BASE}/trinity_GG.cmds.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/trinity_GG.cmds.completed.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/Trinity-GG.fasta.gz" &&
    "${unpigz_cmd[@]}" "${BASE}/Trinity-GG.fasta.gene_trans_map.gz" ||
    decompress_ok='false'

  else
    echo -e "Error: Cannot detect the type of Trinity run to decompress. Please ensure that one of the Trinity files is present in the base directory:" >&2
    echo -e "1. Trinity.fasta.gz must be present for the normal Trinity mode" >&2
    echo -e "2. Trinity-GG.fasta.gz must be present for the genome-guided Trinity mode.\n" >&2
    usage
    exit 1
  fi

  # Report an incomplete run through the exit status instead of silently
  # finishing with 0.
  if [[ ${decompress_ok} != 'true' ]]
  then
    echo -e 'Error: decompression did not complete successfully; see the messages above.\n' >&2
    exit 1
  fi
fi