Skip to content

Commit

Permalink
updated package function for model robust data generation and model m…
Browse files Browse the repository at this point in the history
…isspecified subsampling
  • Loading branch information
Amalan-ConStat committed Oct 3, 2024
1 parent 76720a5 commit d68d72d
Show file tree
Hide file tree
Showing 65 changed files with 1,139 additions and 1,627 deletions.
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ export(AoptimalGauLMSub)
export(AoptimalMCGLMSub)
export(GenGLMdata)
export(GenModelMissGLMdata)
export(GenModelRobustGLMdata)
export(LCCsampling)
export(LeverageSampling)
export(modelMissLinSub)
Expand Down
179 changes: 0 additions & 179 deletions R/ALoptimalGLMSub.R
Original file line number Diff line number Diff line change
Expand Up @@ -465,182 +465,3 @@ ALoptimalGLMSub <- function(r1,r2,Y,X,N,family){
return(ans)
}
}

#' Generate data for Generalised Linear Models
#'
#' Function to simulate big data under linear, logistic and Poisson regression for sampling.
#' For linear regression the covariate data X is generated from a Normal or Uniform distribution.
#' For logistic regression the covariate data X is generated from an Exponential, Normal or Uniform distribution.
#' For Poisson regression the covariate data X is generated from a Normal or Uniform distribution.
#'
#' @usage
#' GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,family)
#'
#' @param Dist a character value for the distribution "Normal" or "Uniform" or "Exponential"
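#' @param Dist_Par a named list of parameters for the distribution that would generate data for covariate X:
#' \code{Mean} and \code{Variance} for "Normal", \code{Min} and \code{Max} for "Uniform",
#' \code{Rate} for "Exponential", plus \code{Error_Variance} for linear regression.
#' It is not used for Poisson regression, where X is drawn from Normal(0,1) or Uniform(0,1)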
#' @param No_Of_Var number of variables
#' @param Beta a vector for the model parameters, including the intercept
#' @param N the big data size
#' @param family a character vector for "linear", "logistic" and "poisson" regression from Generalised Linear Models
#'
#' @details
#' Big data for the Generalised Linear Models are generated by the "linear", "logistic" and "poisson"
#' regression types.
#'
#' We have limited the covariate data generation for
#' linear regression through normal and uniform distribution,
#' logistic regression through exponential, normal and uniform and
#' Poisson regression through normal and uniform distribution.
#'
#' @return
#' The output of \code{GenGLMdata} gives a list of
#'
#' \code{Basic} a list of outputs based on the inputs and Beta Estimates for all models
#'
#' \code{Complete_Data} a matrix for Y and X
#'
#' @references
#' \insertRef{lee1996hierarchical}{NeEDS4BigData}
#'
#' @examples
#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1,Error_Variance=0.5)
#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"linear"
#' Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
#'
#' Dist<-"Normal"; Dist_Par<-list(Mean=0,Variance=1)
#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"logistic"
#' Results<-GenGLMdata(Dist,Dist_Par,No_Of_Var,Beta,N,Family)
#'
#' Dist<-"Normal";
#' No_Of_Var<-2; Beta<-c(-1,2,1); N<-5000; Family<-"poisson"
#' Results<-GenGLMdata(Dist,NULL,No_Of_Var,Beta,N,Family)
#'
#' @import stats
#' @export
GenGLMdata<-function(Dist,Dist_Par,No_Of_Var,Beta,N,family){
  # Simulates N observations from a linear, logistic or Poisson regression
  # model, fits the matching full-data model and returns the true and
  # estimated parameters together with the generated data.
  #
  # Returns a list with elements "Basic" (inputs and estimates) and
  # "Complete_Data" (matrix with columns Y, X0, ..., Xp), classed as
  # c("A_L_OptimalSubsampling", family).
  # Stops with an error on NA/NaN/infinite inputs, an unsupported family,
  # an unsupported distribution for the family, or a Beta of the wrong length.

  # Each argument is inspected on its own. Combining them with c() would
  # coerce the numeric inputs to character, after which NaN and Inf can no
  # longer be recognised. Zero-length input (e.g. Dist_Par = NULL) is valid.
  has_invalid<-function(x){
    length(x) > 0 && (anyNA(x) || any(is.infinite(x)))
  }

  if(has_invalid(Dist) || has_invalid(Beta) || has_invalid(No_Of_Var) ||
     has_invalid(N) || has_invalid(family)){
    stop("NA or Infinite or NAN values in the Dist,Beta,No_Of_Var,N or family")
  }

  if(has_invalid(unlist(Dist_Par))){
    stop("NA or Infinite or NAN values in the Dist_Par")
  }

  if(!any(family %in% c("linear","logistic","poisson"))){
    stop("Only the regression types 'linear','logistic' or 'poisson' are allowed")
  }

  # Covariate distributions permitted for each regression type.
  Allowed_Dist<-switch(family,
                       linear = c("Normal","Uniform"),
                       logistic = c("Exponential","Normal","Uniform"),
                       poisson = c("Normal","Uniform"))
  if(!any(Dist %in% Allowed_Dist)){
    stop(switch(family,
                linear = "For linear regression select the distribution 'Normal' \n or 'Uniform' to generate the covarate data",
                logistic = "For logistic regression select the distribution 'Exponential', \n 'Normal' or 'Uniform' to generate the covarate data",
                poisson = "For poisson regression select the distribution 'Normal' \n or 'Uniform' to generate the covarate data"))
  }

  # Poisson covariates always come from Normal(0,1) or Uniform(0,1);
  # Dist_Par is not consulted for that family.
  if(family == "poisson"){
    Covariate_Par<-list(Mean=0,Variance=1,Min=0,Max=1)
  } else {
    Covariate_Par<-Dist_Par
  }

  # One column of N draws per covariate.
  X<-switch(Dist,
            Normal = replicate(No_Of_Var,
                               stats::rnorm(n = N, mean = Covariate_Par$Mean,
                                            sd = sqrt(Covariate_Par$Variance))),
            Uniform = replicate(No_Of_Var,
                                stats::runif(n = N, min = Covariate_Par$Min,
                                             max = Covariate_Par$Max)),
            Exponential = replicate(No_Of_Var,
                                    stats::rexp(n = N, rate = Covariate_Par$Rate)))

  # Design matrix with an intercept column X0.
  Design<-cbind(1,X)
  Column_Names<-paste0("X",0:ncol(X))
  colnames(Design)<-Column_Names

  # Fail with a readable message instead of "non-conformable arguments".
  if(length(Beta) != ncol(Design)){
    stop("Beta must have one value for the intercept and one for each covariate (length ",
         ncol(Design),")")
  }

  # The response is drawn after the covariates, so results under a fixed
  # seed are reproducible.
  Linear_Predictor<-Design%*%Beta
  Y<-switch(family,
            linear = Linear_Predictor +
              stats::rnorm(n = N, mean = 0, sd = sqrt(Dist_Par$Error_Variance)),
            logistic = stats::rbinom(n = N, size = 1,
                                     prob = 1-1/(1+exp(Linear_Predictor))),
            poisson = stats::rpois(n = N, lambda = exp(Linear_Predictor)))

  Complete_Data<-cbind(Y,Design)
  colnames(Complete_Data)<-c("Y",Column_Names)

  # Full-data fit without an extra intercept: X0 already plays that role.
  Model_Data<-as.data.frame(Complete_Data)
  Results<-switch(family,
                  linear = stats::lm(Y~.-1,data=Model_Data),
                  logistic = stats::glm(Y~.-1,data=Model_Data,family = "binomial"),
                  poisson = stats::glm(Y~.-1,data=Model_Data,family = "poisson"))
  Beta_Estimate<-stats::coefficients(Results)

  # Only the linear model reports an error-variance estimate; it sits
  # between the coefficient estimates and the distribution details.
  Basic<-c(list("N"=N,"Beta"=Beta,
                "Beta_Estimates"=Beta_Estimate),
           if(family == "linear"){
             list("Variance_Epsilon_Estimates"=
                    mean((Y-stats::fitted.values(Results))^2))
           },
           list("Distribution"=Dist,
                "Distribution_Parameter"=Dist_Par,
                "No_Of_Variables"=No_Of_Var))

  Outputs<-list("Basic"=Basic,"Complete_Data"=Complete_Data)
  class(Outputs)<-c("A_L_OptimalSubsampling",family)
  return(Outputs)
}
Loading

0 comments on commit d68d72d

Please sign in to comment.