Skip to content

Commit

Permalink
[R/wb_to_df] enable reading POSIXct dates and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
JanMarvin committed Feb 28, 2022
1 parent c4c0fea commit 38854bf
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 69 deletions.
75 changes: 65 additions & 10 deletions R/wb_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,13 @@ guess_col_type <- function(tt) {
col_dte <- vapply(tt[!col_num], function(x) all(x == "d", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 2

# or even posix
col_dte <- vapply(tt[!col_num], function(x) all(x == "p", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 3

# there are bools as well
col_log <- vapply(tt[!col_num], function(x) any(x == "b", na.rm = TRUE), NA)
types[names(col_log[col_log])] <- 3
types[names(col_log[col_log])] <- 4

types
}
Expand All @@ -92,9 +96,7 @@ numfmt_is_date <- function(numFmt) {
date_fmts <- c(
"yy", "yyyy",
"m", "mm", "mmm", "mmmm", "mmmmm",
"d", "dd", "ddd", "dddd",
"h", "hh", "m", "mm", "s", "ss",
"AM", "PM", "A", "P"
"d", "dd", "ddd", "dddd"
)
date_or_fmt <- paste0(date_fmts, collapse = "|")
maybe_dates <- grepl(pattern = date_or_fmt, x = numFmt_df$formatCode)
Expand All @@ -104,14 +106,59 @@ numfmt_is_date <- function(numFmt) {
z
}

#' check if numFmt is posix. internal function
#' @param numFmt numFmt xml nodes
numfmt_is_posix <- function(numFmt) {

# if numFmt is character(0)
if (length(numFmt) ==0) return(z <- NULL)

numFmt_df <- read_numfmt(read_xml(numFmt))
num_fmts <- c(
"#", as.character(0:9)
)
num_or_fmt <- paste0(num_fmts, collapse = "|")
maybe_num <- grepl(pattern = num_or_fmt, x = numFmt_df$formatCode)

posix_fmts <- c(
"yy", "yyyy",
"m", "mm", "mmm", "mmmm", "mmmmm",
"d", "dd", "ddd", "dddd",
"h", "hh", "m", "mm", "s", "ss",
"AM", "PM", "A", "P"
)
posix_or_fmt <- paste0(posix_fmts, collapse = "|")
maybe_posix <- grepl(pattern = posix_or_fmt, x = numFmt_df$formatCode)

z <- numFmt_df$numFmtId[maybe_posix & !maybe_num]
if (length(z)==0) z <- NULL
z
}

#' check if style is date. internal function
#'
#' @param cellXfs cellXfs xml nodes
#' @param numfmt_date custom numFmtId dates
style_is_date <- function(cellXfs, numfmt_date) {

# numfmt_date: some basic date formats and custom formats
date_numfmts <- as.character(14:22)
date_numfmts <- as.character(14:17)
numfmt_date <- c(numfmt_date, date_numfmts)

cellXfs_df <- read_xf(read_xml(cellXfs))
z <- rownames(cellXfs_df[cellXfs_df$numFmtId %in% numfmt_date,])
if (length(z)==0) z <- NA
z
}

#' check if style is posix. internal function
#'
#' @param cellXfs cellXfs xml nodes
#' @param numfmt_date custom numFmtId dates
style_is_posix <- function(cellXfs, numfmt_date) {

# numfmt_date: some basic date formats and custom formats
date_numfmts <- as.character(18:22)
numfmt_date <- c(numfmt_date, date_numfmts)

cellXfs_df <- read_xf(read_xml(cellXfs))
Expand Down Expand Up @@ -340,13 +387,13 @@ wb_to_df <- function(
numfmt_date <- numfmt_is_date(wb$styles$numFmts)
xlsx_date_style <- style_is_date(wb$styles$cellXfs, numfmt_date)

numfmt_posix <- numfmt_is_posix(wb$styles$numFmts)
xlsx_posix_style <- style_is_posix(wb$styles$cellXfs, numfmt_posix)

This comment has been minimized.

Copy link
@JanMarvin

JanMarvin Feb 28, 2022

Author Owner

Both checks can be combined into a single check is_date_or_posix(), otherwise some numbers identified as date will be converted to date, and if later identified as posix they will be convert again this time as posix. Obviously this slows down things in files with large amounts of dates with custom formats. Date detection should be improved either way, therefore I'm in no hurry to improve things now


# create temporary data frame. hard copy required
z <- dims_to_dataframe(dims)
tt <- dims_to_dataframe(dims)




# tt <- data.frame(matrix(0, nrow = 4, ncol = ncol(z)))
# names(tt) <- names(z)
# rownames(tt) <- c("b", "s", "d", "n")
Expand Down Expand Up @@ -412,6 +459,7 @@ wb_to_df <- function(
cc$val[sel] <- "NA"
cc$typ[sel] <- "na_string"
}

# dates
if (!is.null(cc$c_s)) {
# if a cell is t="s" the content is a sst and not da date
Expand All @@ -421,10 +469,15 @@ wb_to_df <- function(

if (detectDates) {
sel <- (cc$c_s %in% xlsx_date_style) & !cc$is_string & cc$v != "_openxlsx_NA_"
cc$val[sel] <- as.character(convertToDate(cc$v[sel]))
cc$val[sel] <- suppressWarnings(as.character(convertToDate(cc$v[sel])))
cc$typ[sel] <- "d"

sel <- (cc$c_s %in% xlsx_posix_style) & !cc$is_string & cc$v != "_openxlsx_NA_"
cc$val[sel] <- suppressWarnings(as.character(convertToDateTime(cc$v[sel])))

This comment has been minimized.

Copy link
@JanMarvin

JanMarvin Mar 1, 2022

Author Owner

probably should handle this old date format as well. read it from the workbook and add it here as option?

cc$typ[sel] <- "p"
}
}

# remaining values are numeric?
sel <- is.na(cc$typ)
cc$val[sel] <- cc$v[sel]
Expand Down Expand Up @@ -509,11 +562,13 @@ wb_to_df <- function(
if (any(sel)) {
nums <- names( which(types[sel] == 1) )
dtes <- names( which(types[sel] == 2) )
logs <- names( which(types[sel] == 3) )
poxs <- names( which(types[sel] == 3) )
logs <- names( which(types[sel] == 4) )
# convert "#NUM!" to "NaN" -- then converts to NaN
# maybe consider this an option to instead return NA?
z[nums] <- lapply(z[nums], function(i) as.numeric(replace(i, i == "#NUM!", "NaN")))
z[dtes] <- lapply(z[dtes], as.Date)
z[poxs] <- lapply(z[poxs], as.POSIXct)
z[logs] <- lapply(z[logs], as.logical)
} else {
warning("could not convert. All missing in row used for variable names")
Expand Down
100 changes: 41 additions & 59 deletions tests/testthat/test-writing_posixct.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,62 +23,44 @@ test_that("Writing Posixct with writeData & writeDataTable", {
options("openxlsx.datetimeFormat" = "yyyy-mm-dd hh:mm:ss")
})

# # missing datetime is not yet implemented
# test_that("Writing mixed EDT/EST Posixct with writeData & writeDataTable", {
# options("openxlsx.datetimeFormat" = "dd/mm/yy hh:mm")
#
# tstart1 <- as.POSIXct("12/03/2018 08:30", format = "%d/%m/%Y %H:%M")
# tstart2 <- as.POSIXct("10/03/2018 08:30", format = "%d/%m/%Y %H:%M")
# TimeDT1 <- c(NA, 0, 10, 30, 60, 120, 240, 720, 1440) * 60 + tstart1
# TimeDT2 <- c(0, 10, 30, 60, 120, 240, 720, NA, 1440) * 60 + tstart2
#
# df <- data.frame(
# timeval = c(TimeDT1, TimeDT2),
# timetxt = format(c(TimeDT1, TimeDT2), "%Y-%m-%d %H:%M")
# )
#
# wb <- createWorkbook()
# addWorksheet(wb, "writeData")
# addWorksheet(wb, "writeDataTable")
#
# writeData(wb, "writeData", df, startCol = 2, startRow = 3, rowNames = FALSE)
# writeDataTable(wb, "writeDataTable", df, startCol = 2, startRow = 3)
#
# # xlsx file is broken‚ <NA> where some missing value is expected.
# wb_open(wb)
#
# wd <- as.numeric(wb$worksheets[[1]]$sheet_data$v)
# wdt <- as.numeric(wb$worksheets[[2]]$sheet_data$v)
# wd <- wd[wb$worksheets[[1]]$sheet_data$cols == 2]
# wdt <- wdt[wb$worksheets[[2]]$sheet_data$cols == 2]
#
# # drop any integer indexes introduced in write
# wd <- wd[wd != 0 | is.na(wd)]
# wdt <- wdt[wdt != 0 | is.na(wdt)]
#
# # sort everything
# wd <- convertToDateTime(wd[order(wd)])
# wdt <- convertToDateTime(wdt[order(wdt)])
# expected <- df$timeval[order(df$timeval)]
#
# # compare
# expect_equal(
# object = wd,
# expected = expected,
# tolerance = 10 ^ -10,
# check.tzone = FALSE
# )
# expect_equal(
# object = wdt,
# expected = expected,
# tolerance = 10 ^ -10,
# check.tzone = TRUE
# )
# expect_equal(
# object = wd,
# expected = wdt,
# check.tzone = TRUE
# )
#
# options("openxlsx.datetimeFormat" = "yyyy-mm-dd hh:mm:ss")
# })
# missing datetime is not yet implemented
test_that("Writing mixed EDT/EST Posixct with writeData & writeDataTable", {
options("openxlsx.datetimeFormat" = "dd/mm/yy hh:mm")

tstart1 <- as.POSIXct("12/03/2018 08:30", format = "%d/%m/%Y %H:%M")
tstart2 <- as.POSIXct("10/03/2018 08:30", format = "%d/%m/%Y %H:%M")
TimeDT1 <- c(NA, 0, 10, 30, 60, 120, 240, 720, 1440) * 60 + tstart1
TimeDT2 <- c(0, 10, 30, 60, 120, 240, 720, NA, 1440) * 60 + tstart2

df <- data.frame(
timeval = c(TimeDT1, TimeDT2),
timetxt = format(c(TimeDT1, TimeDT2), "%Y-%m-%d %H:%M")
)

wb <- createWorkbook()
addWorksheet(wb, "writeData")
addWorksheet(wb, "writeDataTable")

writeData(wb, "writeData", df, startCol = 2, startRow = 3, rowNames = FALSE)
writeDataTable(wb, "writeDataTable", df, startCol = 2, startRow = 3)

# xlsx file is broken‚ <NA> where some missing value is expected.
# TODO check: looks alright in LibreOffice
# wb_open(wb)
xlsxFile <- temp_xlsx()
saveWorkbook(wb, xlsxFile, TRUE)
wb <- wb_to_df(xlsxFile)

exp <- df$timeval
got <- wb$timeval

# compare
expect_equal(
exp,
got,
tolerance = 10 ^ -10,
check.tzone = FALSE
)

options("openxlsx.datetimeFormat" = "yyyy-mm-dd hh:mm:ss")
})

1 comment on commit 38854bf

@JanMarvin
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: this reads ycphs/openxlsx#288 without issues

Please sign in to comment.