forked from Al-Murphy/MungeSumstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.R
163 lines (156 loc) · 6.7 KB
/
data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#' Summary Statistics Column Headers
#'
#' @description List of uncorrected column headers often found in GWAS Summary
#' Statistics files. Note the effect allele will always be the A2
#' allele, this is the approach used for
#' VCF (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7805039/). This is enforced
#' with the column header corrections here and also the check allele flipping
#' test.
#'
#' @source The code to prepare the .Rda file from the marker file is:
#' \code{
#' # Most of the data in the below table comes from the LDSC github wiki
#' data("sumstatsColHeaders")
#' # Make additions to sumstatsColHeaders using github version of MungeSumstats-
#' # shown is an example of adding columns for Standard Error (SE)
#' #se_cols <- data.frame("Uncorrected"=c("SE","se","STANDARD.ERROR",
#' #                                      "STANDARD_ERROR","STANDARD-ERROR"),
#' #                      "Corrected"=rep("SE",5))
#' #sumstatsColHeaders <- rbind(sumstatsColHeaders,se_cols)
#' #Once additions are made, order & save the new mapping dataset
#' #now sort ordering - important for logic that
#' # uncorrected=corrected comes first
#' sumstatsColHeaders$ordering <-
#'   sumstatsColHeaders$Uncorrected==sumstatsColHeaders$Corrected
#' sumstatsColHeaders <-
#'   sumstatsColHeaders[order(sumstatsColHeaders$Corrected,
#'     sumstatsColHeaders$ordering,decreasing = TRUE),]
#' rownames(sumstatsColHeaders)<-1:nrow(sumstatsColHeaders)
#' sumstatsColHeaders$ordering <- NULL
#' #manually move FREQUENCY to above MAF - github issue 95
#' frequency <- sumstatsColHeaders[sumstatsColHeaders$Uncorrected=="FREQUENCY",]
#' maf <- sumstatsColHeaders[sumstatsColHeaders$Uncorrected=="MAF",]
#' if(as.integer(rownames(frequency))>as.integer(rownames(maf))){
#'   sumstatsColHeaders[as.integer(rownames(frequency)),] <- maf
#'   sumstatsColHeaders[as.integer(rownames(maf)),] <- frequency
#' }
#' usethis::use_data(sumstatsColHeaders,overwrite = TRUE, internal=TRUE)
#' save(sumstatsColHeaders,
#'      file="data/sumstatsColHeaders.rda")
#' # You will need to restart your R session for the changes to take effect
#' }
#' @format data frame with 2 columns (\code{Uncorrected} and \code{Corrected})
#' @usage data("sumstatsColHeaders")
"sumstatsColHeaders"
#' GWAS Educational Attainment Okbay 2016 - Subset
#'
#' @description GWAS Summary Statistics on Educational Attainment by Okbay et
#' al 2016:
#' PMID: 27898078 PMCID: PMC5509058 DOI: 10.1038/ng1216-1587b.
#' A subset of 93 SNPs
#'
#' @details GWAS Summary Statistics on Educational Attainment by Okbay et
#' al 2016 has been subsetted here to act as an example summary statistic file
#' which has some issues in the formatting. MungeSumstats can correct these
#' issues.
#'
#' @name raw_eduAttainOkbay
#' @section eduAttainOkbay.txt
#' @source The summary statistics file was downloaded from
#' https://www.nature.com/articles/ng.3552
#' and formatted to a .txt file with the following:
#' \code{
#' #Get example dataset, use Educational-Attainment_Okbay_2016
#' link<-"Educational-Attainment_Okbay_2016/EduYears_Discovery_5000.txt"
#' eduAttainOkbay<-readLines(link,n=100)
#' #There is an issue where values end with .0, this 0 is removed in func
#' #There are also SNPs not on ref genome or are bi/tri allelic
#' #So need to remove these in this dataset as it's used for testing
#' tmp <- tempfile()
#' writeLines(eduAttainOkbay,con=tmp)
#' eduAttainOkbay <- data.table::fread(tmp) #DT read removes the .0's
#' #remove those not on ref genome and with bi/tri allelic
#' rmv <- c("rs192818565","rs79925071","rs1606974","rs1871109",
#'          "rs73074378","rs7955289")
#' eduAttainOkbay <- eduAttainOkbay[!MarkerName %in% rmv,]
#' data.table::fwrite(eduAttainOkbay,file=tmp,sep="\t")
#' eduAttainOkbay <- readLines(tmp)
#' writeLines(eduAttainOkbay,"inst/extdata/eduAttainOkbay.txt")
#' }
#' @format txt document with 94 items
NULL
#' GWAS Amyotrophic lateral sclerosis ieu open GWAS project - Subset
#'
#' @description VCF (VCFv4.2) of the GWAS Amyotrophic lateral sclerosis ieu
#' open GWAS project Dataset: ebi-a-GCST005647.
#' A subset of 99 SNPs
#'
#' @name raw_ALSvcf
#' @section ALSvcf.vcf
#' @details A VCF file (VCFv4.2) of the GWAS Amyotrophic lateral sclerosis ieu
#' open GWAS project has been subsetted here to act as an example summary
#' statistic file in VCF format which has some issues in the formatting.
#' MungeSumstats can correct these issues and produce a standardised summary
#' statistics format.
#' @source The summary statistics VCF (VCFv4.2) file was downloaded from
#' https://gwas.mrcieu.ac.uk/datasets/ebi-a-GCST005647/
#' and subsetted to a .vcf file with the following:
#' \code{
#' #Get example VCF dataset, use GWAS Amyotrophic lateral sclerosis
#' ALS_GWAS_VCF <- readLines("ebi-a-GCST005647.vcf.gz")
#' #Subset to just the first 99 SNPs
#' ALSvcf <- ALS_GWAS_VCF[1:528]
#' writeLines(ALSvcf,"inst/extdata/ALSvcf.vcf")
#' }
#' @format vcf document with 528 items relating to 99 SNPs
NULL
#' UCSC Chain file hg38 to hg19
#'
#' @description UCSC Chain file hg38 to hg19, .chain.gz file, downloaded from
#' https://hgdownload.cse.ucsc.edu/goldenpath/hg38/liftOver/ on 09/10/21
#'
#' @name hg38ToHg19
#' @section hg38ToHg19.over.chain.gz
#' @details UCSC Chain file hg38 to hg19, .chain.gz file, downloaded on
#' 09/10/21. To be used as a back up if the download from UCSC fails.
#' @source The chain file was downloaded from
#' https://hgdownload.cse.ucsc.edu/goldenpath/hg38/liftOver/
#' \code{
#' utils::download.file(
#'   'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz',
#'   file.path(tempdir(), 'hg38ToHg19.over.chain.gz'))
#' }
#' @format gzipped chain file
NULL
#' UCSC Chain file hg19 to hg38
#'
#' @description UCSC Chain file hg19 to hg38, .chain.gz file, downloaded from
#' https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/ on 09/10/21
#'
#' @name hg19ToHg38
#' @section hg19ToHg38.over.chain.gz
#' @details UCSC Chain file hg19 to hg38, .chain.gz file, downloaded on
#' 09/10/21. To be used as a back up if the download from UCSC fails.
#' @source The chain file was downloaded from
#' https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/
#' \code{
#' utils::download.file(
#'   'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz',
#'   file.path(tempdir(), 'hg19ToHg38.over.chain.gz'))
#' }
#' @format gzipped chain file
NULL
#' Local ieu-a-298 file from IEU Open GWAS
#'
#' @description Local ieu-a-298 file from IEU Open GWAS, downloaded on 09/10/21.
#'
#' @name ieu-a-298
#' @section ieu-a-298.tsv.gz
#' @details Local ieu-a-298 file from IEU Open GWAS, downloaded on 09/10/21.
#' This is done in case the download in the package vignette fails.
#' @source The file was downloaded with:
#' \code{
#' MungeSumstats::import_sumstats(ids = "ieu-a-298",ref_genome = "GRCH37")
#' }
#' @format gzipped tsv file
NULL