-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_unify_refs.additional.sh
executable file
·76 lines (67 loc) · 2.84 KB
/
get_unify_refs.additional.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env bash
# Download the reference files used by the Unify step into ./<org>_unify.
#
# Usage: <this script> <org>
#   <org>  reference short name, either hg38 or grcm38
#
# -e: stop at the first failing command; -x: trace every command;
# -u: treat unset variables as errors; pipefail: a pipeline fails when any
# of its stages fails, not only the last one.
set -exuo pipefail
ROOT="https://genome-idx.s3.amazonaws.com/recount/recount-ref"
org=${1:-}
# Without this check an empty argument would create a stray "_unify"
# directory and build download URLs with an empty organism component.
if [[ -z "$org" ]]; then
    echo "usage: $0 <hg38|grcm38>" >&2
    exit 1
fi
mkdir -p "${org}_unify"
# Everything below runs inside the per-organism unify directory.
pushd "${org}_unify"
#first grab all the disjoint exon mapping files (mapping back to annotated exons & genes)
#as well as a stand-in file with 0's for those samples w/o any exon sums (blank_exon_sums)
#finally get the chromosome sizes for the genome reference used in recount-pump
#a file is skipped when its decompressed copy is already present
for f in disjoint2exons.bed.gz disjoint2exons2genes.bed.gz disjoint2exons2genes.rejoin_genes.bed.gz recount_pump.chr_sizes.tsv.gz blank_exon_sums.gz exon_bitmasks.tsv.gz exon_bitmask_coords.tsv.gz ; do
    #strip the trailing .gz to get the name gunzip will produce
    unzipped=${f%.gz}
    if [[ ! -e "$unzipped" ]]; then
        #-O overwrites a leftover "$f" from an interrupted run; without it wget
        #would save the new download as "$f.1" and gunzip would unpack the stale file
        wget -O "$f" "$ROOT/${org}_unify/$f"
        gunzip "$f"
    fi
done
#get row counts for Unify post-run validation
#next get the list of annotated jx's, which is separate from the main annotations used in recount-pump
#annotated junctions stay gzipped
#files already present are skipped: a plain wget on an existing name would
#save the re-download as "<name>.1" and pile up copies on every re-run
for f in gene_exon_annotation_row_counts.tsv annotated_junctions.tsv.gz; do
    if [[ ! -e "$f" ]]; then
        #download under a temporary name and rename on success, so an
        #interrupted transfer is never mistaken for a complete file
        wget -O "${f}.part" "$ROOT/${org}_unify/$f"
        mv -f "${f}.part" "$f"
    fi
done
#now get genome ref FASTA file, this is part of the recount-pump refs
#so just get it from there (skipped when genome.fa is already extracted)
if [[ ! -e "../${org}/fasta/genome.fa" ]]; then
    mkdir -p "../${org}"
    pushd "../${org}"
    #-O replaces a leftover tarball from an earlier attempt; without it wget
    #would save to fasta.tar.gz.1 and tar would extract the stale file
    wget -O fasta.tar.gz "$ROOT/${org}/fasta.tar.gz"
    tar -zxvf fasta.tar.gz
    popd
fi
#hard link: can't be symbolic since the container won't be able to follow it
#-f replaces an existing recount_pump.fa
ln -f "../${org}/fasta/genome.fa" recount_pump.fa
#now get disjoint of annotated exons, which is also part of the recount-pump refs
#(skipped when exons.bed is already extracted)
if [[ ! -e "../${org}/gtf/exons.bed" ]]; then
    mkdir -p "../${org}"
    pushd "../${org}"
    #-O replaces a leftover tarball from an earlier attempt; without it wget
    #would save to gtf.tar.gz.1 and tar would extract the stale file
    wget -O gtf.tar.gz "$ROOT/${org}/gtf.tar.gz"
    tar -zxvf gtf.tar.gz
    popd
fi
#need to add a header to the exons file and gzip it
#slight misnomer in the header, "gene" is really "chromosome" but leave for backwards compatibility
#the header line comes from a process substitution so cat emits it ahead of
#../${org}/gtf/exons.bed; the combined stream is compressed into
#exons.w_header.bed.gz in the current directory (overwritten on every run)
#NOTE(review): confirm the separators inside the header string match the column delimiter of exons.bed
cat <(echo "gene start end name score strand") ../${org}/gtf/exons.bed | gzip > exons.w_header.bed.gz
#finally, grab per-annotation ordering and default annotation disjoin-exon-per-gene BED file for post-run resum check
#the first annotation set applies to every org except grcm38, which overrides it below
annotations=(G026 G029 R109 F006 ERCC SIRV)
default="G026"
if [[ "$org" == "grcm38" ]]; then
    annotations=(M023 ERCC SIRV)
    default="M023"
fi
#BED file for the default annotation, skipped when already decompressed
default_bed="disjoint2exons2genes.${default}.sorted.cut.bed"
if [[ ! -e "$default_bed" ]]; then
    #-O overwrites a leftover .gz from an interrupted run; without it wget
    #would save to "<name>.gz.1" and gunzip would unpack the stale file
    wget -O "${default_bed}.gz" "$ROOT/${org}_unify/${default_bed}.gz"
    gunzip "${default_bed}.gz"
fi
#one gene-order file per annotation, each skipped when already decompressed
for annotation in "${annotations[@]}"; do
    f="${annotation}.gene_sums.gene_order.tsv.gz"
    #strip the trailing .gz to get the name gunzip will produce
    unzipped=${f%.gz}
    if [[ ! -e "$unzipped" ]]; then
        wget -O "$f" "$ROOT/${org}_unify/$f"
        gunzip "$f"
    fi
done
#additional annotation (e.g. gencode v43 or gencode m32)
#pipefail makes a failed "aws s3 cp" abort the script; without it only zcat's
#exit status counts and an empty or truncated output file can go unnoticed
set -o pipefail
for f in exons_new.bed.coords split_exons_new.bed.gene_ids; do
    #stream the gzipped object to stdout ("-") and decompress it on the fly;
    #write to a temporary name and rename on success so a failed transfer
    #never replaces the final file with partial data
    aws s3 cp "s3://monorail-batch/faster_refs/${org}/gtf/${f}.gz" - | zcat > "${f}.part"
    mv -f "${f}.part" "$f"
done
#leave the directory entered by the pushd near the top of the script
popd