forked from chiulab/surpi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractSamFromSam.sh
executable file
·67 lines (56 loc) · 1.95 KB
/
extractSamFromSam.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/bash
#
# extractSamFromSam.sh
#
# Extracts from a SAM reference file every line whose first field (the read name) also appears
# as the first field of a line in a SAM header file, and writes those lines to a SAM output file.
# Chiu Laboratory
# University of California, San Francisco
# 3/15/2014
#
# Copyright (C) 2014 Charles Y Chiu - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
# Name of this script without its directory part; used as a tag in log lines.
scriptname=${0##*/}

# Three arguments are mandatory, the fourth (number of cores) is optional.
# A usage error is reported on stderr and ends the script with a non-zero
# status so that a calling pipeline can detect the failed invocation.
if [ $# -lt 3 ]; then
  echo "Usage: $scriptname <SAM header file> <SAM reference file> <SAM output file> <optional: # of cores>" >&2
  exit 1
fi
###
# Bind the positional parameters to the names used by the rest of the script.
# The fourth parameter is optional, so cores is empty when it is not given.
basef="${1}"
baseg="${2}"
output_file="${3}"
cores="${4}"
###
# Log the start line, then remember the start time in epoch seconds so that
# the elapsed run time can be computed at the end of the script.
start_stamp="$(date)"
echo -e "${start_stamp}\t${scriptname}\tstarting: "
START1="$(date +%s)"
# ---------------------------------------------------------------------------
# Extraction step
#
# Copies to $output_file every line of $baseg whose first whitespace-delimited
# field also occurs as the first field of some line in $basef.  With fewer
# than four command-line arguments a single awk process does all the work;
# otherwise $basef is split into chunks (about one per core) and one awk
# process per chunk scans $baseg in the background.
# ---------------------------------------------------------------------------

# Print one tab-separated log line: date, script name, message ($1).
_extract_log() {
  echo -e "$(date)\t$scriptname\t$1"
}

# Write to stdout the lines of file $2 whose first field appears as the first
# field of a line in file $1.  The names from $1 are held in an awk array, so
# the output keeps the line order of $2.
_filter_sam_by_headers() {
  awk 'FNR==NR { a[$1]=$1; next } $1 in a { print $0 }' "$1" "$2"
}

# Single-process extraction.
# Arguments: $1 - header file, $2 - reference file, $3 - output file
# Returns:   status of the awk command (or of the failed output redirection)
extract_sam_single() {
  _filter_sam_by_headers "$1" "$2" > "$3"
}

# Multi-process extraction.
# Arguments: $1 - header file, $2 - reference file, $3 - output file,
#            $4 - number of cores (positive integer)
# Outputs:   progress lines on stdout, error messages on stderr
# Returns:   0 on success, 1 on failure, 2 when $4 is not a positive integer
# Notes:     The body is a subshell "( ... )" so that the EXIT trap removing
#            the scratch directory fires on every way out without replacing
#            any trap of the calling script.
#            Chunk results are concatenated in chunk order, so the output is
#            grouped by header chunk rather than in plain reference order, and
#            a read whose name occurs in more than one chunk is written once
#            per chunk.
extract_sam_parallel() (
  headers=$1
  reference=$2
  out=$3
  ncores=$4

  case $ncores in
    '' | *[!0-9]*)
      echo "$scriptname: number of cores must be a positive integer, got '$ncores'" >&2
      exit 2
      ;;
  esac
  ncores=$((10#$ncores)) # force base 10 so that e.g. "08" is not read as octal
  if [ "$ncores" -lt 1 ]; then
    echo "$scriptname: number of cores must be a positive integer, got '$4'" >&2
    exit 2
  fi

  # delete previous output file, if present, so a failed run leaves no stale result
  rm -f -- "$out"

  # Private scratch directory next to the output file: chunk names cannot
  # collide with unrelated files and cleanup cannot delete anything else.
  workdir=$(mktemp -d "${out}.chunks.XXXXXX") || exit 1
  trap 'rm -rf -- "$workdir"' EXIT

  # splitting input SAM header file by number of cores
  _extract_log "splitting $headers..."
  total=$(wc -l < "$headers") || exit 1
  # Round up so the remainder does not spill into an extra chunk; use at
  # least one line per chunk because split rejects a line count of 0.
  per_chunk=$(( (total + ncores - 1) / ncores ))
  [ "$per_chunk" -ge 1 ] || per_chunk=1
  _extract_log "will use $ncores cores with $per_chunk entries per core"
  split -l "$per_chunk" "$headers" "$workdir/hdr." || exit 1

  _extract_log "extracting reads from $reference using headers from $headers"
  pids=()
  for chunk in "$workdir"/hdr.*; do
    [ -e "$chunk" ] || continue # empty header file: split created no chunk
    # Each worker writes its own part file; several processes appending to
    # one file concurrently may interleave partial lines.
    _filter_sam_by_headers "$chunk" "$reference" > "$workdir/out.${chunk##*.}" &
    pids+=("$!")
  done

  # Wait for every worker individually so that a failure is not lost.
  status=0
  for pid in "${pids[@]}"; do
    wait "$pid" || status=1
  done
  if [ "$status" -ne 0 ]; then
    echo "$scriptname: at least one extraction process failed" >&2
    exit 1
  fi
  _extract_log "done extracting reads for each chunk"

  # Assemble the final output from the part files (glob order = chunk order).
  if [ "${#pids[@]}" -gt 0 ]; then
    cat "$workdir"/out.* > "$out" || exit 1
  else
    : > "$out" || exit 1
  fi
)

# NOTE(review): a failed extraction is reported on stderr but, as before, does
# not stop the script; consider exiting non-zero here once it is confirmed
# that callers handle a failing status.
if [ $# -lt 4 ]; then # using 1 core only
  _extract_log "extracting reads from $baseg using headers from $basef..."
  if extract_sam_single "$basef" "$baseg" "$output_file"; then
    _extract_log "done"
  else
    echo "$scriptname: ERROR: extracting reads from $baseg failed" >&2
  fi
else
  if ! extract_sam_parallel "$basef" "$baseg" "$output_file" "$cores"; then
    echo "$scriptname: ERROR: extracting reads from $baseg failed" >&2
  fi
fi
# Final report: log completion and the elapsed wall-clock time, computed as
# the difference between now and START1 (both in epoch seconds).

# Print one tab-separated log line: date, script name, message ($1).
_report() {
  echo -e "$(date)\t$scriptname\t$1"
}

END1="$(date +%s)"
_report "Done with extractSamFromSam.sh"
diff=$(($END1 - $START1))
_report "extractSamFromSam.sh took $diff seconds"