-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngs2poppca.sh
137 lines (115 loc) · 7.48 KB
/
ngs2poppca.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash -l
#OUTDIR=/home/dmvelasc/Projects/Prunus/Analysis/angsd_output
#SBATCH -D /home/dmvelasc/Projects/Prunus/Analysis/angsd_output
#SBATCH -o /home/dmvelasc/Projects/Prunus/slurm-log/%j-pca-stdout.txt
#SBATCH -e /home/dmvelasc/Projects/Prunus/slurm-log/%j-pca-stderr.txt
#SBATCH -J 2poppca
#SBATCH -p bigmemm
#SBATCH -n 1
#SBATCH -c 4
#SBATCH --mail-user=dmvelasco@ucdavis.edu
#SBATCH --mail-type=ALL
set -e
set -u
# Array variables
#x=$SLURM_ARRAY_TASK_ID
# Load zlib 1.2.8
module load zlib
# Declare directories
dir1="/home/dmvelasc/bin" # software binary directory
dir2="/home/dmvelasc/Projects/Prunus/Script" # script directory
dir3="/home/dmvelasc/Data/references/persica-SCF" # reference genome FASTA directory
dir4="/home/dmvelasc/Data/references/cerasifera" # ancestral genome FASTA directory
dir5="/home/dmvelasc/Software/angsd/misc" # ANGSD subprogram directory
dir6="/home/dmvelasc/Software/ngsPopGen" # ngsPopGen directory #formerly used in subdirectory of ngsTools
#Declare other variables
taxa1="PD" # file prefix for dulcis
taxa2="PP" # file prefix for persica
taxa3="PE" # file prefix for combined dulcis and persica
#taxa3="PA" # file prefix for combined dulcis and persica
bams1="angsd.dulcis2.txt" # list of BAM files for dulcis
bams2="angsd.persica2.txt" # list of BAM files for persica
bams3="angsd.domesticates.txt" # list of BAM files for dulcis and persica combined
#bams3="angsd.admix.txt" # list of BAM files for dulcis and persica combined
ref="Prunus_persica_v1.0_scaffolds.fa" # file name for reference genome
anc="PC01.fa" # file name for ancestral genome
threads="4" # number of threads used
minMapQ="30" # minimum mapping quality
minQ="20" # mimimum base quality
r="scaffold" # region prefix
# ANGSD resource at http://cgrlucb.wikispaces.com/file/view/CGRL_SNP_workshop.pdf
##### CALCULATING GENOTYPES, GENOTYPE LIKELIHOODS, SFS, MAFS, ETC. WHILE ACCOUNTING FOR INBREEDING
date
echo "Prepping variables..."
N_IND1=$(cat "$dir2"/"$bams1" | wc -l)
N_IND2=$(cat "$dir2"/"$bams2" | wc -l)
N_IND3=$(cat "$dir2"/"$bams3" | wc -l)
echo -e "Number of $taxa1 individuals: $N_IND1 \nNumber of $taxa2 individuals: $N_IND2 \nNumber of $taxa3 individuals: $N_IND3"
CHR1=$(( $N_IND1 * 2 ))
CHR2=$(( $N_IND2 * 2 ))
CHR3=$(( $N_IND3 * 2 ))
echo -e "Number of $taxa1 chromosomes: $CHR1 \nNumber of $taxa2 chromosomes: $CHR2 \nNumber of $taxa3 chromosomes: $CHR3"
date
#######################################
# COMBINED DULCIS AND PERSICA SAMPLES #
#######################################
##### PRINCIPAL COMPONENT ANALYSIS (PCA)
# Covariance matrix
date
echo "Calculating PCA data"
#BELOW works
"$dir1"/angsd -bam "$dir2"/"$bams3" -anc "$dir4"/"$anc" -ref "$dir3"/"$ref" -nInd "$N_IND3" -doMajorMinor 1 -doMaf 1 -doSaf 2 -GL 1 -doPost 1 -doGeno 32 -indF "$taxa3"_F.indF -r scaffold_1 -nThreads "$threads" -minMapQ "$minMapQ" -minQ "$minQ" -out "$taxa3"_s1.pca
# without region
#"$dir1"/angsd -bam "$dir2"/"$bams3" -anc "$dir4"/"$anc" -ref "$dir3"/"$ref" -nInd "$N_IND3" -doMajorMinor 1 -doMaf 1 -doSaf 2 -GL 1 -doPost 1 -doGeno 32 -indF "$taxa3"_F.indF -nThreads "$threads" -minMapQ "$minMapQ" -minQ "$minQ" -out "$taxa3".pca
# entire genome data was too big to unzip without eating all drive space so limiting to chr 1
#BELOW works, once done not needed again. should probably have a whether file exists check.
echo "Unzipping geno.gz file."
gunzip -c "$taxa3"_s1.pca.geno.gz > "$taxa3"_s1.pca.geno
#gunzip -c "$taxa3".pca.geno.gz > "$taxa3".pca.geno
# Use ngsCovar to estimate a covariance matrix between pairs of individuals. Use -norm 0
# for low coverage sequencing data, which disables normalization proposed in Patterson et
# al. (2006). The first way is to compute an approximation of the posterior of the
# covariance matrix, by weighting each site by its probability of being variable, as
# proposed in Fumagalli et al. (2013):
#../ngsCovar -probfile testA.geno -outfile testA.covar1 -nind 24 -nsites 10000 -call 0 -sfsfile testA.rf.saf -norm 0
N_SITES2=$(( `zcat "$taxa3"_s1.pca.mafs.gz | wc -l`-1 ))
#N_SITES2=$(( `zcat "$taxa3".pca.mafs.gz | wc -l`-1 ))
echo $N_SITES2
# below test based on responses to issue on ngsPopGen github repo
"$dir5"/realSFS print "$taxa3"_s1.pca.saf.idx -oldout 1
#"$dir5"/realSFS print "$taxa3".pca.saf.idx -oldout 1
#use below line while testing otherwise ngsCovar errors that there is already a file by that name
#rm "$taxa3"_s1_wtd.covar
echo "Calculating weighted covariance # not this time"
#"$dir6"/ngsCovar -probfile "$taxa3"_s1.pca.geno -outfile "$taxa3"_s1_wtd.covar -nind "$N_IND3" -nsites "$N_SITES2" -block_size 20000 -call 0 -sfsfile "$taxa3"_s1.pca.saf -norm 0
#"$dir6"/ngsCovar -probfile "$taxa3".pca.geno.gz -outfile "$taxa3"_wtd.covar -nind "$N_IND3" -nsites "$N_SITES2" -block_size 20000 -call 0 -sfsfile "$taxa3".pca.saf -norm 0
# -sfsfile "file with sample allele frequency posterior probabilities"
# The example in the main README shows a file named "pop.sfs.ml.norm" for this parameter (2014-01-03)
# The examples directory example shows a file named "testA.rf.saf" for this parameter (2014-01-07)
# The error when using PE.ml generated for another analysis is "Possible error, binary files might be broken",
# which suggests the testA.rf.saf file may be the correct one
# -norm "0 [no normalization, recommended if no SNP calling is performed]"
# "1 [matrix is normalized by p(1-p) as in Patterson et al. (2006)]"
# "2 [normalized by 2p(1-p)]"
# NOTE: 2015-05-19 commit "Added support to read GENO and SFS files directly from STDIN and print results directly to STDOUT"
# Alternatively, one can remove non-variable or low-frequency sites (this is the preferred way under most cases) with the option -minmaf:
#../ngsCovar -probfile testA.geno -outfile testA.covar2 -nind 24 -nsites 10000 -call 0 -minmaf 0.05
#DV NOTE: works, but gives odd results for samples with low coverage.
echo "Calculating covariance with min MAF of 0.05"
#N_SITES2=$(( `zcat "$taxa3"_s1.pca.mafs.gz | wc -l`-1 ))
"$dir6"/ngsCovar -probfile "$taxa3"_s1.pca.geno -outfile "$taxa3"_s1.covar -nind "$N_IND3" -nsites "$N_SITES2" -block_size 20000 -call 0 -minmaf 0.05
#"$dir6"/ngsCovar -probfile "$taxa3".pca.geno.gz -outfile "$taxa3".covar -nind "$N_IND3" -nsites "$N_SITES2" -block_size 20000 -call 0 -minmaf 0.05
# Finally, in case of high sequencing depth, one can call genotypes as the genotype with the highest posterior probability:
#../ngsCovar -probfile testA.geno -outfile testA.covar3 -nind 24 -nsites 10000 -call 1 -minmaf 0.05
#"dir6"/ngsCovar -probfile testA.geno -outfile testA.covar3 -nind 24 -nsites 10000 -call 1 -minmaf 0.05
# These commands will produce text files with a symmetric covariance matrix NxN (for N individuals).
echo "Done calculating PCA"
date
# PCA plot
# From the covariance matrix we can use a simple R script to perform an eigenvalue decomposition and plot the
# PCA results. Please make sure the use updated versions of required R packages. First, let's create a dummy plink cluster file.
#Rscript --vanilla --slave -e 'write.table(cbind(seq(1,24),rep(1,24),c(rep("A",10),rep("B",8),rep("C",6))), row.names=F, sep="\t", col.names=c("FID","IID","CLUSTER"), file="testA.clst", quote=F)'
# Assuming we want to plot the first 2 PCA components from the testA.covar1 file, we can use the plotPCA.R script provided:
#Rscript --vanilla --slave $SCRIPTS/plotPCA.R -i testA.covar1 -c 1-2 -a testA.clst -o testA.pca.SAF.pdf
# Please note that you need 'ggplot2' and 'optparse' R libraries installed. This script will output the
# explained genetic variance for each component and save as output the PCA plot.