-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnATACseq_LDSR.smk
126 lines (108 loc) · 6.56 KB
/
snATACseq_LDSR.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -------------------------------------------------------------------------------------
#
#
# Script for running LDSC on snATAC-seq data peak files
#
#
# -------------------------------------------------------------------------------------
# --------- SET SMK PARAMS ----------
configfile: "../config/config.yaml"
# ------------- RULES ---------------
rule annot2bed:
input: folder = "../resources/ldsc/reference_files/baseline_v1.2_1000G_Phase3"
params: file = "../resources/ldsc/reference_files/baseline_v1.2_1000G_Phase3/baseline.{CHR}.annot.gz"
output: "../results/LDSR/annotation_files/baseline.{CHR}_no_head.bed"
message: "Preparing annotation files for {wildcards.CHR}"
shell:
"zcat {params.file} | tail -n +2 | awk -v OFS=\"\t\" '{{print \"chr\"$1, $2-1, $2, $3, $4}}' "
"| sort -k1,1 -k2,2n > {output}"
rule lift_over:
input: mybed = "../results/peaks/{CELL_TYPE}.hg38.bed",
chain_file = "../resources/liftover/hg38ToHg19.over.chain.gz"
output: "../results/peaks/{CELL_TYPE}.hg19.bed"
message: "Lifting {input.mybed} to hg19"
log: "../results/logs/LDSR/scATACseq.{CELL_TYPE}.hg19_liftOver.log"
params: "../results/archR_data_processing/peaks/{CELL_TYPE}.hg38_unlifted.bed"
shell:
"""
../resources/liftover/liftOver {input.mybed} {input.chain_file} {output} {params} 2> {log}
"""
rule intersect_mybed:
input: annot = rules.annot2bed.output,
mybed = "../results/peaks/{CELL_TYPE}.hg19.bed"
output: "../results/LDSR/annotation_files/snATACseq.{CELL_TYPE}.{CHR}.annot.gz"
params: out = "../results/LDSR/annotation_files/snATACseq.{CELL_TYPE}.{CHR}.annot"
message: "Creating LDSR annotation bed files for {wildcards.CHR}"
shell:
"module load bedtools; "
"echo -e \"CHR\tBP\tSNP\tCM\tANN\" > {params.out}; "
"bedtools intersect -a {input.annot} -b {input.mybed} -c | "
"sed 's/^chr//g' | awk -v OFS=\"\t\" '{{print $1, $2, $4, $5, $6}}' >> {params.out}; "
"gzip {params.out}"
rule ldsr:
input: annot = "../results/LDSR/annotation_files/snATACseq.{CELL_TYPE}.{CHR}.annot.gz",
bfile_folder = "../resources/ldsc/reference_files/1000G_EUR_Phase3_plink",
snps_folder = "../resources/ldsc/reference_files/hapmap3_snps"
output: "../results/LDSR/annotation_files/snATACseq.{CELL_TYPE}.{CHR}.l2.ldscore.gz"
conda: "../envs/ldsc.yml"
params: bfile = "../resources/ldsc/reference_files/1000G_EUR_Phase3_plink/1000G.EUR.QC.{CHR}",
ldscores = "../results/LDSR/annotation_files/snATACseq.{CELL_TYPE}.{CHR}",
snps = "../resources/ldsc/reference_files/w_hm3.snplist_rsIds"
message: "Running LDSR Phase3 {wildcards.CHR}"
log: "../results/logs/LDSR/snATACseq.{CELL_TYPE}.chr{CHR}_ldsr.log"
shell:
"python ../resources/ldsc/ldsc.py --l2 --bfile {params.bfile} --ld-wind-cm 1 "
"--annot {input.annot} --out {params.ldscores} --print-snps {params.snps} 2> {log}"
rule partitioned_heritability:
input: SUMSTATS = "../results/GWAS_for_ldsc/{GWAS}_hg19_ldsc_ready.sumstats.gz",
LDSR = expand(rules.ldsr.output, CELL_TYPE = config['LDSR_CELL_TYPES'], CHR = range(1,23))
output: "../results/LDSR/part_herit/baseline_v1.2/snATACseq_LDSR_{CELL_TYPE}_{GWAS}_baseline.v1.2.results"
conda: "../envs/ldsc.yml"
params: weights = "../resources/ldsc/reference_files/weights_hm3_no_hla/weights.",
baseline = "../resources/ldsc/reference_files/baseline_v1.2_1000G_Phase3/baseline.",
frqfile = "../resources/ldsc/reference_files/1000G_Phase3_frq/1000G.EUR.QC.",
LD_anns = "../results/LDSR/annotation_files/snATACseq.{CELL_TYPE}.",
out_file = "../results/LDSR/part_herit/baseline_v1.2/snATACseq_LDSR_{CELL_TYPE}_{GWAS}_baseline.v1.2"
message: "Running Prt Hrt with {wildcards.CELL_TYPE} and {wildcards.GWAS} GWAS"
log: "../results/logs/LDSR/snATACseq.{CELL_TYPE}.{GWAS}_baseline.v1.2_partHerit.log"
shell:
"python ../resources/ldsc/ldsc.py --h2 {input.SUMSTATS} --w-ld-chr {params.weights} "
"--ref-ld-chr {params.baseline},{params.LD_anns} --overlap-annot "
"--frqfile-chr {params.frqfile} --out {params.out_file} --print-coefficients 2> {log}"
rule create_partHerit_summary:
# Requires list of snATACseq cell types in atac_celltypes.tsv
input: expand("../results/LDSR/part_herit/baseline_v1.2/snATACseq_LDSR_{CELL_TYPE}_{GWAS}_baseline.v1.2.results", CELL_TYPE = config["LDSR_CELL_TYPES"], GWAS = config['LDSC_GWAS'])
output: "../results/LDSR/part_herit/baseline_v1.2/snATACseq_LDSR_baseline.v1.2_summary_{GWAS}.tsv"
message: "Creating summary file for {wildcards.GWAS} GWAS"
params: ph_dir = "../results/LDSR/part_herit/baseline_v1.2/",
results_dir = "../results/LDSR/part_herit/baseline_v1.2/",
cell_types = "../resources/sheets/atac_celltypes.tsv"
log: "../results/logs/LDSR/snATACseq.{GWAS}_baseline_v1.2_partHerit.summary.log"
shell:
"""
head -1 {params.ph_dir}snATACseq_LDSR_FC.ExN_SCZ_baseline.v1.2.results > {output}
File={params.cell_types}
Lines=$(cat $File)
for Line in $Lines
do
grep L2_1 {params.ph_dir}snATACseq_LDSR_"$Line"_{wildcards.GWAS}_baseline.v1.2.results | sed "s/L2_1/$Line/g" >> {output}
done
"""
rule ldsr_mrkdwn_report:
input: summary = expand("../results/LDSR/part_herit/baseline_v1.2/snATACseq_LDSR_baseline.v1.2_summary_{GWAS}.tsv", GWAS = config['LDSC_GWAS']),
markdown = "scripts/snATACseq_LDSR_report.Rmd"
output: "../results/rmarkdown_reports/snATACseq_LDSR_baseline.v1.2_report.html"
message: "Creating LDSR Rmarkdown report"
params: out_dir = "../results/rmarkdown_reports/",
report_file = "snATACseq_LDSR_baseline.v1.2_report.html"
log: "../results/logs/LDSR/snATACseq.LDSR.baseline.v1.2.report.log"
shell:
"""
export R_LIBS_USER=/scratch/c.c1477909/R/library
module load libgit2/1.1.0
module load pandoc/2.7.3
/apps/languages/R/4.0.3/el7/AVX512/gnu-8.1/bin/Rscript --vanilla \
scripts/snATACseq_LDSR_report.R {input.markdown} {params.out_dir} {params.report_file} 2> {log}
"""
# -------------------------------------------------------------------------------------
# -------------------------------------------------------------------------------------