From bc20322008f37b4101769de8fb846074e6fe9508 Mon Sep 17 00:00:00 2001 From: dbetebenner Date: Thu, 7 Mar 2013 09:53:55 -0500 Subject: [PATCH] First commit to repository --- .gitignore | 5 ++ New_Jersey_Data_LONG_2011.R | 128 ++++++++++++++++++++++++++++++++++++ New_Jersey_Data_LONG_2012.R | 104 +++++++++++++++++++++++++++++ New_Jersey_SGP_2011.R | 52 +++++++++++++++ New_Jersey_SGP_2012.R | 61 +++++++++++++++++ README.md | 4 +- 6 files changed, 352 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 New_Jersey_Data_LONG_2011.R create mode 100644 New_Jersey_Data_LONG_2012.R create mode 100644 New_Jersey_SGP_2011.R create mode 100644 New_Jersey_SGP_2012.R diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..81eaae7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.Rapp.history +.DS_Store +.RData +.Rdata +/Data diff --git a/New_Jersey_Data_LONG_2011.R b/New_Jersey_Data_LONG_2011.R new file mode 100644 index 0000000..4aaa7a7 --- /dev/null +++ b/New_Jersey_Data_LONG_2011.R @@ -0,0 +1,128 @@ +############################################################################################################################ +#### +#### Code for preparation of New_Jersey data +#### +############################################################################################################################ + +library("SGP") +library("doMC") +#registerDoMC(4) # Can run sequentially if you prefer/can't parallelize +#getDoParWorkers() + +### +### Reading in .csv files as data.tables - pipe (|) seperated files. +### + +my.files <- c("NJASK_2010_2011_Math.csv", "NJASK_2010-2011_ELA.csv") + +setwd("../Data") +New_Jersey_Data <- list(Student = foreach (i = my.files, .combine = "rbind", .packages = "data.table", .inorder=FALSE) %dopar% {data.table(read.csv(i, sep="|"))}) +setwd("../SGP") + + +### +### Re-Name the data - substitute _ for . and capitalize names. Here's a function to do it: +### + +subSpecial <- function(x) { + s <- strsplit(x, split=".", fixed=TRUE)[[1]] + s <- paste(toupper(substring(s, 1,1)), toupper(substring(s, 2)), sep="", collapse="_") + s <- strsplit(s, split="__")[[1]][1] + return(s) +} + + +# rename: + +for (j in 1:dim(New_Jersey_Data[["Student"]])[2]) names(New_Jersey_Data[["Student"]])[j] <- subSpecial(names(New_Jersey_Data$Student)[j]) + +sapply(New_Jersey_Data[["Student"]], class) + +names(New_Jersey_Data[["Student"]])[1] <- "TESTING_YEAR" +New_Jersey_Data[["Student"]]$SCALED_SCORE <- as.numeric(New_Jersey_Data[["Student"]]$SCALED_SCORE) # For SGP function - doesn't always like integers... +New_Jersey_Data[["Student"]]$SUBJECT <- toupper(New_Jersey_Data[["Student"]]$SUBJECT) # Any field that gets 'keyed' on needs to be ALL CAPS + + +############################################################################################################################ +### IDENTIFY VALID CASES +### +### Duplicate rows for individual students may be the only issue: +### ALL scores are in range of 100 - 300, so NO SCALED_SCORE greater than LOSS and less than HOSS +### All students in grades 3 - 8 (assume taking grade level examinations?) +### +############################################################################################################################ + +New_Jersey_Data[["Student"]][["VALID_CASE"]] <- factor(1, levels=1:2, labels=c("VALID_CASE", "INVALID_CASE")) + +### Duplicated Records + +key(New_Jersey_Data[["Student"]]) <- c("STUDENT_ID", "TESTING_YEAR", "SUBJECT") + +# Inspect the dublicates first to see what's going on. +dup.ids<-New_Jersey_Data[["Student"]]$STUDENT_ID[which(duplicated(New_Jersey_Data[["Student"]]))] +dups<-New_Jersey_Data[["Student"]][New_Jersey_Data[["Student"]]$STUDENT_ID %in% dup.ids] +length(dup.ids) # only a handful, but we'll try to keep the best of the lot +dim(dups) +summary(dups) + +# Invalidate lowest score for duplicates. +key(New_Jersey_Data[["Student"]]) <- c("VALID_CASE", "STUDENT_ID", "TESTING_YEAR", "SUBJECT", "SCALED_SCORE") +key(New_Jersey_Data[["Student"]]) <- c("VALID_CASE", "STUDENT_ID", "TESTING_YEAR", "SUBJECT") +New_Jersey_Data[["Student"]][["VALID_CASE"]][which(duplicated(New_Jersey_Data[["Student"]]) & New_Jersey_Data[["Student"]]$VALID_CASE=="VALID_CASE")-1] <- "INVALID_CASE" + +############################################################################################################################ +### +### Create Additional Variables: +### +############################################################################################################################ + +### Prior Performance Levels +# Make the Performance Levels an ORDERED factor +New_Jersey_Data[["Student"]]$PERFORMANCE_LEVEL <- ordered(New_Jersey_Data[["Student"]]$PERFORMANCE_LEVEL, levels=c("Partially Proficient", "Proficient", "Advanced Proficient")) + +# Use data.table to select each "Valid" student record from last year and tack on the scaled score and performance level from that record onto the current year record (as *_PRIOR). +key(New_Jersey_Data[["Student"]]) <- c("STUDENT_ID", "SUBJECT", "TESTING_YEAR", "VALID_CASE") +New_Jersey_Data[["Student"]]$SCALED_SCORE_PRIOR <- New_Jersey_Data[["Student"]][SJ(STUDENT_ID, SUBJECT, TESTING_YEAR-1, "VALID_CASE"), mult="last"][,SCALED_SCORE] +New_Jersey_Data[["Student"]]$PERFORMANCE_LEVEL_PRIOR <- New_Jersey_Data[["Student"]][SJ(STUDENT_ID, SUBJECT, TESTING_YEAR-1, "VALID_CASE"), mult="last"][,PERFORMANCE_LEVEL] + +### Unique School Identifier ? + +summary(New_Jersey_Data[["Student"]]$DISTRICT_CODE[New_Jersey_Data[["Student"]]$SCHOOL_CODE==150]) #tried with 50, 100, 999 - defintely not a unique ID +summary(New_Jersey_Data[["Student"]]$SCHOOL_CODE) # All 3 digit numbers + +New_Jersey_Data[["Student"]]$UNIQUE_SCHOOL_NUMBER <- New_Jersey_Data[["Student"]]$DISTRICT_CODE*1000 + New_Jersey_Data[["Student"]]$SCHOOL_CODE + +summary(New_Jersey_Data[["Student"]]$UNIQUE_SCHOOL_NUMBER) +summary(New_Jersey_Data[["Student"]]$UNIQUE_SCHOOL_NUMBER %% 1000) # Modulo 1000 returns SCHOOL_CODE + +############################################################################################################################ +### +### SGP standard variable names: +### +############################################################################################################################ + +my.names <- c("YEAR", "Testing.Program", "CONTENT_AREA", "GRADE", "ID", "DISTRICT_NUMBER", "School.Code", "County.Name", + "DISTRICT_NAME", "SCHOOL_NAME", "DFG", "Gender", "Race.Ethnicity.Combined", "Title.I.LAL", "Title.I.Math", + "Special.Education..SE.", "General.ED", "Former.LEP", "Current.LEP", "Time.in.District.Less.Than.1.Year", + "Economically.Disadvantaged", "Migrant", "SCALE_SCORE", "ACHIEVEMENT_LEVEL", "VALID_CASE", "SCALE_SCORE_PRIOR", + "ACHIEVEMENT_LEVEL_PRIOR", "SCHOOL_NUMBER") + +names(New_Jersey_Data[["Student"]]) <- my.names +New_Jersey_Data[["Student"]]$SCALE_SCORE_PRIOR <- NULL +New_Jersey_Data[["Student"]]$ID <- factor(New_Jersey_Data[["Student"]]$ID) +levels(New_Jersey_Data[["Student"]]$CONTENT_AREA) <- c("ELA", "MATHEMATICS") +New_Jersey_Data[["Student"]]$Gender[New_Jersey_Data[["Student"]]$Gender==""] <- NA +New_Jersey_Data[["Student"]]$Gender <- droplevels(New_Jersey_Data[["Student"]]$Gender) +levels(New_Jersey_Data[["Student"]]$Gender) <- c("Female", "Male") +levels(New_Jersey_Data[["Student"]]$Race.Ethnicity.Combined) <- c("Asian", "Black", "Hispanic", "Native American", "Other", "Pacific Islander", "White") +levels(New_Jersey_Data[["Student"]]$Title.I.LAL) <- c("No", "Yes") +levels(New_Jersey_Data[["Student"]]$Title.I.Math) <- c("No", "Yes") +levels(New_Jersey_Data[["Student"]]$General.ED) <- c("No", "Yes") +levels(New_Jersey_Data[["Student"]]$Economically.Disadvantaged) <- c("No", "Yes") +levels(New_Jersey_Data[["Student"]]$Migrant) <- c("No", "Yes") + +# Save with original variable names in place + +New_Jersey_Data_LONG_2011 <- New_Jersey_Data$Student +save(New_Jersey_Data_LONG_2011, file="../Data/New_Jersey_Data_LONG_2011.Rdata", compress=TRUE) # + diff --git a/New_Jersey_Data_LONG_2012.R b/New_Jersey_Data_LONG_2012.R new file mode 100644 index 0000000..bdae1e6 --- /dev/null +++ b/New_Jersey_Data_LONG_2012.R @@ -0,0 +1,104 @@ +################################################################################ +### +### Create New Jersey Data LONG for 2012 +### +################################################################################ + +### Load SGP Package + +require(SGP) + + +### Load data + +New_Jersey_Data_ELA <- read.csv("Data/Base_Files/NJASK_2011_2012_ELA.csv", sep="|") +New_Jersey_Data_MATH <- read.csv("Data/Base_Files/NJASK_2011_2012_MATH.csv", sep="|") + + +### Combine ELA and MATH + +New_Jersey_Data_LONG_2012 <- rbind(New_Jersey_Data_ELA, New_Jersey_Data_MATH) + + +### Tidy up data + +names(New_Jersey_Data_LONG_2012) <- c("YEAR", "Testing.Program", "CONTENT_AREA", "GRADE", "ID", "Student.ID..SSID.", "DISTRICT_NUMBER", "School.Code", "County.Name", + "DISTRICT_NAME", "SCHOOL_NAME", "DFG", "Gender", "Race.Ethnicity.Combined","Title.I.LAL", "Title.I.Math", "Special.Education..SE.", "General.ED", "Former.LEP", + "Current.LEP", "Time.in.District.Less.Than.1.Year", "Economically.Disadvantaged", "Migrant", "Homeless", "SCALE_SCORE", "ACHIEVEMENT_LEVEL") + +New_Jersey_Data_LONG_2012$Homeless <- NULL +New_Jersey_Data_LONG_2012$Former.LEP <- NULL +New_Jersey_Data_LONG_2012$Testing.Program <- NULL + +New_Jersey_Data_LONG_2012$CONTENT_AREA <- as.character(New_Jersey_Data_LONG_2012$CONTENT_AREA) +New_Jersey_Data_LONG_2012$CONTENT_AREA[New_Jersey_Data_LONG_2012$CONTENT_AREA=="Math"] <- "MATHEMATICS" + +New_Jersey_Data_LONG_2012$ID <- as.character(New_Jersey_Data_LONG_2012$ID) + +New_Jersey_Data_LONG_2012$Gender[New_Jersey_Data_LONG_2012$Gender==""] <- NA +New_Jersey_Data_LONG_2012$Gender <- factor(New_Jersey_Data_LONG_2012$Gender) +levels(New_Jersey_Data_LONG_2012$Gender) <- c("Female", "Male") + +levels(New_Jersey_Data_LONG_2012$Race.Ethnicity.Combined) <- c("Asian", "Black", "Hispanic", "Native American", "Other", "Pacific Islander", "White") + +levels(New_Jersey_Data_LONG_2012$General.ED) <- c("General Education: No", "General Education: Yes") + +New_Jersey_Data_LONG_2012$Current.LEP[New_Jersey_Data_LONG_2012$Current.LEP==""] <- NA +New_Jersey_Data_LONG_2012$Current.LEP <- factor(New_Jersey_Data_LONG_2012$Current.LEP) +levels(New_Jersey_Data_LONG_2012$Current.LEP) <- c("Less than 1 Year", "1 Year", "2 Years", "3 Years", "Yes") + +levels(New_Jersey_Data_LONG_2012$Time.in.District.Less.Than.1.Year) <- c("Time in District Less than 1 Year: No", "Time in District Less than 1 Year: Yes") + +levels(New_Jersey_Data_LONG_2012$Economically.Disadvantaged) <- c("Economically Disadvantaged: Yes", "Economically Disadvantaged: No", "Economically Disadvantaged: Yes") +New_Jersey_Data_LONG_2012$Economically.Disadvantaged <- as.character(New_Jersey_Data_LONG_2012$Economically.Disadvantaged) +New_Jersey_Data_LONG_2012$Economically.Disadvantaged <- factor(New_Jersey_Data_LONG_2012$Economically.Disadvantaged) + +levels(New_Jersey_Data_LONG_2012$Migrant) <- c("Migrant: No", "Migrant: Yes") + +levels(New_Jersey_Data_LONG_2012$Title.I.LAL) <- "Title I LAL: No" +levels(New_Jersey_Data_LONG_2012$Title.I.Math) <- "Title I Math: No" + +New_Jersey_Data_LONG_2012$ACHIEVEMENT_LEVEL[New_Jersey_Data_LONG_2012$ACHIEVEMENT_LEVEL==""] <- NA +New_Jersey_Data_LONG_2012$ACHIEVEMENT_LEVEL <- factor(New_Jersey_Data_LONG_2012$ACHIEVEMENT_LEVEL) +New_Jersey_Data_LONG_2012$ACHIEVEMENT_LEVEL <- factor(New_Jersey_Data_LONG_2012$ACHIEVEMENT_LEVEL, levels=c("Partially Proficient", "Proficient", "Advanced Proficient"), ordered=TRUE) + +New_Jersey_Data_LONG_2012$ID[New_Jersey_Data_LONG_2012$ID=="NULL"] <- NA + +New_Jersey_Data_LONG_2012$SCHOOL_NUMBER <- New_Jersey_Data_LONG_2012$DISTRICT_NUMBER*1000 + New_Jersey_Data_LONG_2012$School.Code + +levels(New_Jersey_Data_LONG_2012$Title.I.LAL) <- "Title I LAL: No" +levels(New_Jersey_Data_LONG_2012$Title.I.Math) <- "Title I Math: No" + +### Indentify Valid Cases + +New_Jersey_Data_LONG_2012$VALID_CASE <- "VALID_CASE" +New_Jersey_Data_LONG_2012$VALID_CASE[is.na(New_Jersey_Data_LONG_2012$ID)] <- "INVALID_CASE" + +New_Jersey_Data_LONG_2012 <- as.data.table(New_Jersey_Data_LONG_2012) + +setkeyv(New_Jersey_Data_LONG_2012, c("VALID_CASE", "ID", "YEAR", "CONTENT_AREA")) + +# Inspect the dublicates first to see what's going on. +dup.ids<-New_Jersey_Data_LONG_2012$ID[which(duplicated(New_Jersey_Data_LONG_2012))] +dups<-New_Jersey_Data_LONG_2012[New_Jersey_Data_LONG_2012$ID %in% dup.ids] +length(dup.ids) # only a handful, but we'll try to keep the best of the lot +dim(dups) +summary(dups) + +# Invalidate lowest score for duplicates. + +setkeyv(New_Jersey_Data_LONG_2012, c("VALID_CASE", "ID", "YEAR", "CONTENT_AREA", "SCALE_SCORE")) +setkeyv(New_Jersey_Data_LONG_2012, c("VALID_CASE", "ID", "YEAR", "CONTENT_AREA")) +New_Jersey_Data_LONG_2012[["VALID_CASE"]][which(duplicated(New_Jersey_Data_LONG_2012) & New_Jersey_Data_LONG_2012$VALID_CASE=="VALID_CASE")-1] <- "INVALID_CASE" + + +# ENROLLMENT_STATUS + +New_Jersey_Data_LONG_2012$STATE_ENROLLMENT_STATUS <- factor(1, levels=0:1, labels=c("Enrolled State: Yes", "Enrolled State: No")) +New_Jersey_Data_LONG_2012$DISTRICT_ENROLLMENT_STATUS <- factor(1, levels=0:1, labels=c("Enrolled District: Yes", "Enrolled District: No")) +New_Jersey_Data_LONG_2012$SCHOOL_ENROLLMENT_STATUS <- factor(1, levels=0:1, labels=c("Enrolled School: Yes", "Enrolled School: No")) + +# Save the results + +save(New_Jersey_Data_LONG_2012, file="Data/New_Jersey_Data_LONG_2012.Rdata") + diff --git a/New_Jersey_SGP_2011.R b/New_Jersey_SGP_2011.R new file mode 100644 index 0000000..46d6d83 --- /dev/null +++ b/New_Jersey_SGP_2011.R @@ -0,0 +1,52 @@ +#################################################################### +### +### Code to update SGP analyses for New Jersey +### +#################################################################### + +### Load SGP Package + +require(SGP) + + +### Load data + +load("../Data/New_Jersey_Data_LONG_2011.Rdata") +load("../Data/Base_Files/New_Jersey_SGP.Rdata") + + +### Merge files + +New_Jersey_SGP@Data <- as.data.table(rbind.fill(as.data.frame(New_Jersey_Data_LONG_2011), as.data.frame(New_Jersey_SGP@Data))) + + +### prepareSGP + +New_Jersey_SGP <- prepareSGP(New_Jersey_SGP) + +save(New_Jersey_SGP, file="../Data/New_Jersey_SGP.Rdata") + +### analyzeSGP + +New_Jersey_SGP <- analyzeSGP(New_Jersey_SGP, + years=2011, + simulate.sgps=FALSE) + +save(New_Jersey_SGP, file="../Data/New_Jersey_SGP.Rdata") + + +### combineSGP + +New_Jersey_SGP <- combineSGP(New_Jersey_SGP, + years=2011) + +save(New_Jersey_SGP, file="../Data/New_Jersey_SGP.Rdata") + + +### summarizeSGP + +New_Jersey_SGP <- summarizeSGP(New_Jersey_SGP) + +### visualizeSGP + +visualizeSGP(New_Jersey_SGP, sgPlot.demo.report=TRUE) diff --git a/New_Jersey_SGP_2012.R b/New_Jersey_SGP_2012.R new file mode 100644 index 0000000..7481990 --- /dev/null +++ b/New_Jersey_SGP_2012.R @@ -0,0 +1,61 @@ +#################################################################### +### +### Code to update SGP analyses for New Jersey +### +#################################################################### + +### Load SGP Package + +require(SGP) +options(error=recover) + +### Load data + +load("Data/New_Jersey_Data_LONG_2012.Rdata") +load("Data/Base_Files/New_Jersey_SGP.Rdata") + + +### Merge files + +New_Jersey_SGP@Data <- as.data.table(rbind.fill(New_Jersey_Data_LONG_2012, New_Jersey_SGP@Data)) + + +### prepareSGP + +New_Jersey_SGP <- prepareSGP(New_Jersey_SGP) +save(New_Jersey_SGP, file="Data/New_Jersey_SGP.Rdata") + + +### analyzeSGP + +New_Jersey_SGP <- analyzeSGP( + New_Jersey_SGP, + years=2012, + sgp.percentiles=TRUE, + sgp.projections=TRUE, + sgp.projections.lagged=TRUE, + sgp.percentiles.baseline=TRUE, + sgp.projections.baseline=TRUE, + sgp.projections.lagged.baseline=TRUE, + simulate.sgps=FALSE, + parallel.config=list(BACKEND="PARALLEL", WORKERS=list(PERCENTILES=15, BASELINE_PERCENTILES=30, PROJECTIONS=10, LAGGED_PROJECTIONS=8, SUMMARY=30, GA_PLOTS=10, SG_PLOTS=1))) + +save(New_Jersey_SGP, file="Data/New_Jersey_SGP.Rdata") + + +### combineSGP + +New_Jersey_SGP <- combineSGP(New_Jersey_SGP) + +save(New_Jersey_SGP, file="Data/New_Jersey_SGP.Rdata") + + +### summarizeSGP + +New_Jersey_SGP <- summarizeSGP(New_Jersey_SGP, parallel.config=list(BACKEND="PARALLEL", WORKERS=list(SUMMARY=10))) + +save(New_Jersey_SGP, file="Data/New_Jersey_SGP.Rdata") + +### visualizeSGP + +visualizeSGP(New_Jersey_SGP, sgPlot.demo.report=TRUE) diff --git a/README.md b/README.md index 42b6086..297ff0a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -New_Jersey +New Jersey ========== -SGP source code and documentation associated with New Jersey SGP analyses \ No newline at end of file +SGP source code and documentation associated with New Jersey SGP analyses