From 2d000ecc081fbbc7137ea90bcbe44284235d8b22 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 13:15:41 +0200 Subject: [PATCH 01/52] add a non exported meta() function which prepares the vectors for storage Signed-off-by: Thierry Onkelinx --- DESCRIPTION | 1 + R/meta.R | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 R/meta.R diff --git a/DESCRIPTION b/DESCRIPTION index 7eef2f125..b34701287 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,6 +44,7 @@ Collate: 'index.R' 'libgit2.R' 'merge.R' + 'meta.R' 'note.R' 'odb.R' 'plot.R' diff --git a/R/meta.R b/R/meta.R new file mode 100644 index 000000000..2a1906451 --- /dev/null +++ b/R/meta.R @@ -0,0 +1,47 @@ +## git2r, R bindings to the libgit2 library. +## Copyright (C) 2013-2018 The git2r contributors +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License, version 2, +## as published by the Free Software Foundation. +## +## git2r is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +##' Optimise a vector for storage in to a git repository and add meta data +##' @param x the vector +##' @noRd +meta <- function(x) { + UseMethod("meta") +} + +meta.character <- function(x) { + attr(x, "meta") <- " class: character" + return(x) +} + +meta.integer <- function(x) { + attr(x, "meta") <- " class: integer" + return(x) +} + +meta.numeric <- function(x) { + attr(x, "meta") <- " class: numeric" + return(x) +} + +meta.factor <- function(x) { + z <- as.integer(x) + attr(z, "meta") <- paste( + " class: factor\n levels:", + paste(" -", levels(x), collapse = "\n"), + sep = "\n" + ) + return(z) +} From 03d8f6ddd1d6e13900a987393d20279fc459d6f0 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 14:25:49 +0200 Subject: [PATCH 02/52] Add write_delim_git() Signed-off-by: Thierry Onkelinx --- DESCRIPTION | 1 + NAMESPACE | 2 ++ R/write_delim_git.R | 56 ++++++++++++++++++++++++++++++++++++++++++ man/write_delim_git.Rd | 21 ++++++++++++++++ 4 files changed, 80 insertions(+) create mode 100644 R/write_delim_git.R create mode 100644 man/write_delim_git.Rd diff --git a/DESCRIPTION b/DESCRIPTION index b34701287..a91ad1e09 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,4 +66,5 @@ Collate: 'time.R' 'tree.R' 'when.R' + 'write_delim_git.R' RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index a8c5a49ff..2cede24f7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -152,6 +152,7 @@ export(tags) export(tree) export(when) export(workdir) +export(write_delim_git) importFrom(graphics,axis) importFrom(graphics,barplot) importFrom(graphics,par) @@ -162,4 +163,5 @@ importFrom(graphics,title) importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,sessionInfo) +importFrom(utils,write.table) useDynLib(git2r, .registration=TRUE) diff --git a/R/write_delim_git.R b/R/write_delim_git.R new file mode 100644 index 000000000..4b7cb57ca --- /dev/null +++ b/R/write_delim_git.R @@ -0,0 +1,56 @@ +## Copyright (C) 2013-2018 The git2r contributors +## +## This program is 
free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License, version 2, +## as published by the Free Software Foundation. +## +## git2r is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +##' Write a \code{data.frame} to a git repository +##' +##' This will create two files. The \code{".tsv"} file contains the raw data. +##' The \code{".yml"} contains the meta data on the columns in YAML format. +##' @param x the \code{data.frame} +##' @param file the name of the file with file extension. Can include a path +##' relative to the path of the \code{repo} +##' @param repo a \code{git_repository} object, created with +##' \code{\link{repository}} +##' @export +##' @include meta.R +##' @importFrom utils write.table +write_delim_git <- function(x, file, repo) { + if (!inherits(x, "data.frame")) { + stop("x is not a 'data.frame'") + } + if (grepl("\\..*$", file)) { + warning("file extensions are stripped") + file <- gsub("\\..*$", "", file) + } + if (!inherits(repo, "git_repository")) { + stop("repo is not a 'git_repository'") + } + raw_file <- sprintf("%s/%s.tsv", dirname(repo$path), file) + meta_file <- sprintf("%s/%s.yml", dirname(repo$path), file) + if (!dir.exists(dirname(raw_file))) { + dir.create(dirname(raw_file), recursive = TRUE) + } + raw_data <- as.data.frame(lapply(x, meta), stringsAsFactors = FALSE) + meta_data <- paste( + colnames(x), + vapply(raw_data, attr, "", which = "meta"), + sep = ":\n" + ) + writeLines(meta_data, meta_file) + write.table( + x = raw_data, file = raw_file, append = FALSE, + quote = FALSE, 
sep = "\t", eol = "\n", dec = ".", + row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" + ) +} diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd new file mode 100644 index 000000000..378729cfa --- /dev/null +++ b/man/write_delim_git.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write_delim_git.R +\name{write_delim_git} +\alias{write_delim_git} +\title{Write a \code{data.frame} to a git repository} +\usage{ +write_delim_git(x, file, repo) +} +\arguments{ +\item{x}{the \code{data.frame}} + +\item{file}{the name of the file with file extension. Can include a path +relative to the path of the \code{repo}} + +\item{repo}{a \code{git_repository} object, created with +\code{\link{repository}}} +} +\description{ +This will create two files. The \code{".tsv"} file contains the raw data. +The \code{".yml"} contains the meta data on the columns in YAML format. +} From ec2b5fd10e965279fd8d462ffbb70dec34afd8dc Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 14:48:21 +0200 Subject: [PATCH 03/52] Add read_delim_git() Signed-off-by: Thierry Onkelinx --- DESCRIPTION | 1 + NAMESPACE | 2 ++ R/read_delim_git.R | 62 ++++++++++++++++++++++++++++++++++++++++++ man/read_delim_file.Rd | 18 ++++++++++++ 4 files changed, 83 insertions(+) create mode 100644 R/read_delim_git.R create mode 100644 man/read_delim_file.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a91ad1e09..820103d22 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,6 +51,7 @@ Collate: 'pull.R' 'punch_card.R' 'push.R' + 'read_delim_git.R' 'reference.R' 'reflog.R' 'refspec.R' diff --git a/NAMESPACE b/NAMESPACE index 2cede24f7..a7d3ba8ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -123,6 +123,7 @@ export(parents) export(pull) export(punch_card) export(push) +export(read_delim_file) export(references) export(reflog) export(remote_add) @@ -162,6 +163,7 @@ importFrom(graphics,symbols) importFrom(graphics,title) importFrom(utils,capture.output) 
importFrom(utils,head) +importFrom(utils,read.table) importFrom(utils,sessionInfo) importFrom(utils,write.table) useDynLib(git2r, .registration=TRUE) diff --git a/R/read_delim_git.R b/R/read_delim_git.R new file mode 100644 index 000000000..91cd05745 --- /dev/null +++ b/R/read_delim_git.R @@ -0,0 +1,62 @@ +## Copyright (C) 2013-2018 The git2r contributors +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License, version 2, +## as published by the Free Software Foundation. +## +## git2r is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +##' Read a \code{data.frame} from a git repository +##' +##' @inheritParams write_delim_git +##' @export +##' @importFrom utils read.table +read_delim_file <- function(file, repo) { + file <- gsub("\\..*$", "", file) + if (!inherits(repo, "git_repository")) { + stop("repo is not a 'git_repository'") + } + raw_file <- sprintf("%s/%s.tsv", dirname(repo$path), file) + meta_file <- sprintf("%s/%s.yml", dirname(repo$path), file) + if (!file.exists(raw_file) || !file.exists(meta_file)) { + stop("raw file and/or meta file missing") + } + meta_data <- readLines(meta_file) + meta_cols <- grep("^\\S*:$", meta_data) + col_names <- gsub(":", "", meta_data[meta_cols]) + raw_data <- read.table( + file = raw_file, header = FALSE, + sep = "\t", quote = "", dec = ".", + as.is = TRUE, col.names = col_names + ) + + col_classes <- gsub(" {4}class: (.*)", "\\1", meta_data[meta_cols + 1]) + col_factor <- which(col_classes == "factor") + level_rows <- grep("^ {8}- .*$", meta_data) + level_value <- gsub("^ {8}- (.*)$", "\\1", meta_data[level_rows]) + level_id <- cumsum(c(TRUE, diff(level_rows) > 1)) + col_factor_level <- vapply( + seq_along(col_factor), + function(id) { + list(level_value[level_id == id]) + }, + list(character(0)) + ) + names(col_factor_level) <- col_names[col_factor] + for (id in names(col_factor_level)) { + raw_data[[id]] <- factor( + raw_data[[id]], + levels = seq_along(col_factor_level[[id]]), + labels = col_factor_level[[id]] + ) + } + + return(raw_data) +} diff --git a/man/read_delim_file.Rd b/man/read_delim_file.Rd new file mode 100644 index 000000000..f9f8d527e --- /dev/null +++ b/man/read_delim_file.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_delim_git.R +\name{read_delim_file} +\alias{read_delim_file} +\title{Read a \code{data.frame} from a git repository} +\usage{ +read_delim_file(file, repo) +} +\arguments{ +\item{file}{the name of the file with file extension. 
Can include a path +relative to the path of the \code{repo}} + +\item{repo}{a \code{git_repository} object, created with +\code{\link{repository}}} +} +\description{ +Read a \code{data.frame} from a git repository +} From e2d0e0e80f256d221fc9b457f2cabb6efe93259b Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 16:34:10 +0200 Subject: [PATCH 04/52] write_delim_git() and read_delim_git() handle directories with dots correctly Signed-off-by: Thierry Onkelinx --- R/read_delim_git.R | 7 ++++--- R/write_delim_git.R | 11 +++++++---- man/read_delim_file.Rd | 3 +++ man/write_delim_git.Rd | 3 +++ 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/R/read_delim_git.R b/R/read_delim_git.R index 91cd05745..f99ccdff2 100644 --- a/R/read_delim_git.R +++ b/R/read_delim_git.R @@ -16,15 +16,16 @@ ##' Read a \code{data.frame} from a git repository ##' ##' @inheritParams write_delim_git +##' @return The \code{data.frame} ##' @export ##' @importFrom utils read.table read_delim_file <- function(file, repo) { - file <- gsub("\\..*$", "", file) + file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) if (!inherits(repo, "git_repository")) { stop("repo is not a 'git_repository'") } - raw_file <- sprintf("%s/%s.tsv", dirname(repo$path), file) - meta_file <- sprintf("%s/%s.yml", dirname(repo$path), file) + raw_file <- file.path(dirname(repo$path), paste0(file, ".tsv")) + meta_file <- file.path(dirname(repo$path), paste0(file, ".yml")) if (!file.exists(raw_file) || !file.exists(meta_file)) { stop("raw file and/or meta file missing") } diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 4b7cb57ca..6464aa4d1 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -22,6 +22,7 @@ ##' relative to the path of the \code{repo} ##' @param repo a \code{git_repository} object, created with ##' \code{\link{repository}} +##' @return The relative path to the file ##' @export ##' @include meta.R ##' @importFrom utils write.table @@ -29,15 +30,15 @@ 
write_delim_git <- function(x, file, repo) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") } - if (grepl("\\..*$", file)) { + if (grepl("\\..*$", basename(file))) { warning("file extensions are stripped") - file <- gsub("\\..*$", "", file) + file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) } if (!inherits(repo, "git_repository")) { stop("repo is not a 'git_repository'") } - raw_file <- sprintf("%s/%s.tsv", dirname(repo$path), file) - meta_file <- sprintf("%s/%s.yml", dirname(repo$path), file) + raw_file <- file.path(dirname(repo$path), paste0(file, ".tsv")) + meta_file <- file.path(dirname(repo$path), paste0(file, ".yml")) if (!dir.exists(dirname(raw_file))) { dir.create(dirname(raw_file), recursive = TRUE) } @@ -53,4 +54,6 @@ write_delim_git <- function(x, file, repo) { quote = FALSE, sep = "\t", eol = "\n", dec = ".", row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" ) + + return(file) } diff --git a/man/read_delim_file.Rd b/man/read_delim_file.Rd index f9f8d527e..6c3e874b7 100644 --- a/man/read_delim_file.Rd +++ b/man/read_delim_file.Rd @@ -13,6 +13,9 @@ relative to the path of the \code{repo}} \item{repo}{a \code{git_repository} object, created with \code{\link{repository}}} } +\value{ +The \code{data.frame} +} \description{ Read a \code{data.frame} from a git repository } diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 378729cfa..dc8a143cb 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -15,6 +15,9 @@ relative to the path of the \code{repo}} \item{repo}{a \code{git_repository} object, created with \code{\link{repository}}} } +\value{ +The relative path to the file +} \description{ This will create two files. The \code{".tsv"} file contains the raw data. The \code{".yml"} contains the meta data on the columns in YAML format. 
From ecbf3f4f1ec89795f1e6ce9a90dfe4054e58c02f Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 17:06:13 +0200 Subject: [PATCH 05/52] repository() gains a "project" argument Signed-off-by: Thierry Onkelinx --- R/repository.R | 21 +++++++++++++++++++-- man/repository.Rd | 9 ++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/R/repository.R b/R/repository.R index d8e954bfe..9bec266a8 100644 --- a/R/repository.R +++ b/R/repository.R @@ -84,11 +84,17 @@ as.data.frame.git_repository <- function(x, ...) { ##' ##' @param path A path to an existing local git repository. ##' @param discover Discover repository from path. Default is TRUE. +##' @param project The name of of project. Refers to a local path in case of a +##' data repository. Defaults to \code{NULL}, indicating a standard repository. ##' @return A \code{git_repository} object with entries: ##' \describe{ ##' \item{path}{ ##' Path to a git repository ##' } +##' \item{project}{ +##' If set, the local path to the project starting from the root of the +##' repository +##' } ##' } ##' @export ##' @examples @@ -150,7 +156,7 @@ as.data.frame.git_repository <- function(x, ...) 
{ ##' ## List all tags in repository ##' tags(repo) ##' } -repository <- function(path = ".", discover = TRUE) { +repository <- function(path = ".", discover = TRUE, project = NULL) { if (isTRUE(discover)) { path <- discover_repository(path) if (is.null(path)) @@ -164,7 +170,18 @@ repository <- function(path = ".", discover = TRUE) { if (!isTRUE(.Call(git2r_repository_can_open, path))) stop("Unable to open repository at 'path'") - structure(list(path = path), class = "git_repository") + if (is.null(project)) { + return(structure(list(path = path), class = "git_repository")) + } + + stopifnot(is.character(project)) + stopifnot(length(project) == 1) + + local_path <- file.path(path, project) + if (!dir.exists(local_path)) { + dir.create(local_path, recursive = TRUE) + } + structure(list(path = path, project = project), class = "git_repository") } ##' Init a repository diff --git a/man/repository.Rd b/man/repository.Rd index 6fc2e4d14..9db0f29f7 100644 --- a/man/repository.Rd +++ b/man/repository.Rd @@ -4,12 +4,15 @@ \alias{repository} \title{Open a repository} \usage{ -repository(path = ".", discover = TRUE) +repository(path = ".", discover = TRUE, project = NULL) } \arguments{ \item{path}{A path to an existing local git repository.} \item{discover}{Discover repository from path. Default is TRUE.} + +\item{project}{The name of of project. Refers to a local path in case of a +data repository. 
Defaults to \code{NULL}, indicating a standard repository.} } \value{ A \code{git_repository} object with entries: @@ -17,6 +20,10 @@ A \code{git_repository} object with entries: \item{path}{ Path to a git repository } + \item{project}{ + If set, the local path to the project starting from the root of the + repository + } } } \description{ From 908619484295163f6169460a5137bbe63f679f83 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 17:18:22 +0200 Subject: [PATCH 06/52] write_delim_git() and read_delim_git() use the project concept Signed-off-by: Thierry Onkelinx --- R/read_delim_git.R | 9 +++++++-- R/write_delim_git.R | 15 +++++++++++---- man/read_delim_file.Rd | 5 +++-- man/write_delim_git.Rd | 5 +++-- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/R/read_delim_git.R b/R/read_delim_git.R index f99ccdff2..e639c2aa7 100644 --- a/R/read_delim_git.R +++ b/R/read_delim_git.R @@ -24,8 +24,13 @@ read_delim_file <- function(file, repo) { if (!inherits(repo, "git_repository")) { stop("repo is not a 'git_repository'") } - raw_file <- file.path(dirname(repo$path), paste0(file, ".tsv")) - meta_file <- file.path(dirname(repo$path), paste0(file, ".yml")) + if (is.null(repo$project)) { + project_path <- dirname(repo$path) + } else { + project_path <- file.path(dirname(repo$path), repo$project) + } + raw_file <- file.path(project_path, paste0(file, ".tsv")) + meta_file <- file.path(project_path, paste0(file, ".yml")) if (!file.exists(raw_file) || !file.exists(meta_file)) { stop("raw file and/or meta file missing") } diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 6464aa4d1..c5ad6bdd3 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -18,8 +18,9 @@ ##' This will create two files. The \code{".tsv"} file contains the raw data. ##' The \code{".yml"} contains the meta data on the columns in YAML format. ##' @param x the \code{data.frame} -##' @param file the name of the file with file extension. 
Can include a path -##' relative to the path of the \code{repo} +##' @param file the name of the file without file extension. Can include a relative +##' path. It is relative to the "project" when set in the \code{repo}. Otherwise +##' it is relative to the root of the \code{repo}. ##' @param repo a \code{git_repository} object, created with ##' \code{\link{repository}} ##' @return The relative path to the file @@ -37,8 +38,14 @@ write_delim_git <- function(x, file, repo) { if (!inherits(repo, "git_repository")) { stop("repo is not a 'git_repository'") } - raw_file <- file.path(dirname(repo$path), paste0(file, ".tsv")) - meta_file <- file.path(dirname(repo$path), paste0(file, ".yml")) + + if (is.null(repo$project)) { + project_path <- dirname(repo$path) + } else { + project_path <- file.path(dirname(repo$path), repo$project) + } + raw_file <- file.path(project_path, paste0(file, ".tsv")) + meta_file <- file.path(project_path, paste0(file, ".yml")) if (!dir.exists(dirname(raw_file))) { dir.create(dirname(raw_file), recursive = TRUE) } diff --git a/man/read_delim_file.Rd b/man/read_delim_file.Rd index 6c3e874b7..7df66afcf 100644 --- a/man/read_delim_file.Rd +++ b/man/read_delim_file.Rd @@ -7,8 +7,9 @@ read_delim_file(file, repo) } \arguments{ -\item{file}{the name of the file with file extension. Can include a path -relative to the path of the \code{repo}} +\item{file}{the name of the file without file extension. Can include a relative +path. It is relative to the "project" when set in the \code{repo}. Otherwise +it is relative to the root of the \code{repo}.} \item{repo}{a \code{git_repository} object, created with \code{\link{repository}}} diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index dc8a143cb..8f16a3b24 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -9,8 +9,9 @@ write_delim_git(x, file, repo) \arguments{ \item{x}{the \code{data.frame}} -\item{file}{the name of the file with file extension. 
Can include a path -relative to the path of the \code{repo}} +\item{file}{the name of the file without file extension. Can include a relative +path. It is relative to the "project" when set in the \code{repo}. Otherwise +it is relative to the root of the \code{repo}.} \item{repo}{a \code{git_repository} object, created with \code{\link{repository}}} From 18b38bbc39fbc8aab45d224df212608385b39ad5 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 19:52:52 +0200 Subject: [PATCH 07/52] init() and clone() gain the "project" argument Signed-off-by: Thierry Onkelinx --- R/repository.R | 9 ++++++--- man/clone.Rd | 5 ++++- man/init.Rd | 5 ++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/R/repository.R b/R/repository.R index 9bec266a8..1d303b2d6 100644 --- a/R/repository.R +++ b/R/repository.R @@ -191,6 +191,7 @@ repository <- function(path = ".", discover = TRUE, project = NULL) { ##' is created at the pointed path. If FALSE, provided path will ##' be considered as the working directory into which the .git ##' directory will be created. +##' @inheritParams repository ##' @return A \code{git_repository} object ##' @export ##' @seealso \link{repository} @@ -208,12 +209,12 @@ repository <- function(path = ".", discover = TRUE, project = NULL) { ##' repo_bare <- init(path_bare, bare = TRUE) ##' is_bare(repo_bare) ##' } -init <- function(path = ".", bare = FALSE) { +init <- function(path = ".", bare = FALSE, project = NULL) { path <- normalizePath(path, winslash = "/", mustWork = TRUE) if (!file.info(path)$isdir) stop("'path' is not a directory") .Call(git2r_repository_init, path, bare) - repository(path) + repository(path, project = project) } ##' Clone a remote repository @@ -229,6 +230,7 @@ init <- function(path = ".", bare = FALSE) { ##' access. Default is NULL. To use and query an ssh-agent for the ##' ssh key credentials, let this parameter be NULL (the default). ##' @param progress Show progress. Default is TRUE. 
+##' @inheritParams repository ##' @return A \code{git_repository} object. ##' @seealso \link{repository}, \code{\link{cred_user_pass}}, ##' \code{\link{cred_ssh_key}} @@ -279,11 +281,12 @@ clone <- function(url = NULL, branch = NULL, checkout = TRUE, credentials = NULL, + project = NULL, progress = TRUE) { .Call(git2r_clone, url, local_path, bare, branch, checkout, credentials, progress) - repository(local_path) + repository(local_path, project = project) } ##' Get HEAD for a repository diff --git a/man/clone.Rd b/man/clone.Rd index 97afb89a6..c6544d3d8 100644 --- a/man/clone.Rd +++ b/man/clone.Rd @@ -5,7 +5,7 @@ \title{Clone a remote repository} \usage{ clone(url = NULL, local_path = NULL, bare = FALSE, branch = NULL, - checkout = TRUE, credentials = NULL, progress = TRUE) + checkout = TRUE, credentials = NULL, project = NULL, progress = TRUE) } \arguments{ \item{url}{The remote repository to clone} @@ -24,6 +24,9 @@ is TRUE.} access. Default is NULL. To use and query an ssh-agent for the ssh key credentials, let this parameter be NULL (the default).} +\item{project}{The name of of project. Refers to a local path in case of a +data repository. Defaults to \code{NULL}, indicating a standard repository.} + \item{progress}{Show progress. Default is TRUE.} } \value{ diff --git a/man/init.Rd b/man/init.Rd index 0d10d4a80..ff8d24bd6 100644 --- a/man/init.Rd +++ b/man/init.Rd @@ -4,7 +4,7 @@ \alias{init} \title{Init a repository} \usage{ -init(path = ".", bare = FALSE) +init(path = ".", bare = FALSE, project = NULL) } \arguments{ \item{path}{A path to where to init a git repository} @@ -13,6 +13,9 @@ init(path = ".", bare = FALSE) is created at the pointed path. If FALSE, provided path will be considered as the working directory into which the .git directory will be created.} + +\item{project}{The name of of project. Refers to a local path in case of a +data repository. 
Defaults to \code{NULL}, indicating a standard repository.} } \value{ A \code{git_repository} object From 7b490df5adcff241852d4aa5fbfdf05c8988aa23 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 20:13:01 +0200 Subject: [PATCH 08/52] export meta() Signed-off-by: Thierry Onkelinx --- NAMESPACE | 5 +++++ R/meta.R | 6 +++++- man/meta.Rd | 14 ++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 man/meta.Rd diff --git a/NAMESPACE b/NAMESPACE index a7d3ba8ec..26cfd2c3f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,10 @@ S3method(length,git_tree) S3method(merge,character) S3method(merge,git_branch) S3method(merge,git_repository) +S3method(meta,character) +S3method(meta,factor) +S3method(meta,integer) +S3method(meta,numeric) S3method(plot,git_repository) S3method(print,git_blob) S3method(print,git_branch) @@ -113,6 +117,7 @@ export(lookup) export(ls_tree) export(merge) export(merge_base) +export(meta) export(note_create) export(note_default_ref) export(note_remove) diff --git a/R/meta.R b/R/meta.R index 2a1906451..caca2fdd9 100644 --- a/R/meta.R +++ b/R/meta.R @@ -16,26 +16,30 @@ ##' Optimise a vector for storage in to a git repository and add meta data ##' @param x the vector -##' @noRd +##' @export meta <- function(x) { UseMethod("meta") } +##' @export meta.character <- function(x) { attr(x, "meta") <- " class: character" return(x) } +##' @export meta.integer <- function(x) { attr(x, "meta") <- " class: integer" return(x) } +##' @export meta.numeric <- function(x) { attr(x, "meta") <- " class: numeric" return(x) } +##' @export meta.factor <- function(x) { z <- as.integer(x) attr(z, "meta") <- paste( diff --git a/man/meta.Rd b/man/meta.Rd new file mode 100644 index 000000000..32ffaceac --- /dev/null +++ b/man/meta.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/meta.R +\name{meta} +\alias{meta} +\title{Optimise a vector for storage in to a git repository and add meta 
data} +\usage{ +meta(x) +} +\arguments{ +\item{x}{the vector} +} +\description{ +Optimise a vector for storage in to a git repository and add meta data +} From a19f3a34a286175229cda7735ce2a8d00e33c9f1 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 20:46:13 +0200 Subject: [PATCH 09/52] workdir() takes "project" into account when set Signed-off-by: Thierry Onkelinx --- R/read_delim_git.R | 9 ++------- R/repository.R | 6 +++++- R/write_delim_git.R | 9 ++------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/R/read_delim_git.R b/R/read_delim_git.R index e639c2aa7..0cd4f1b11 100644 --- a/R/read_delim_git.R +++ b/R/read_delim_git.R @@ -24,13 +24,8 @@ read_delim_file <- function(file, repo) { if (!inherits(repo, "git_repository")) { stop("repo is not a 'git_repository'") } - if (is.null(repo$project)) { - project_path <- dirname(repo$path) - } else { - project_path <- file.path(dirname(repo$path), repo$project) - } - raw_file <- file.path(project_path, paste0(file, ".tsv")) - meta_file <- file.path(project_path, paste0(file, ".yml")) + raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) + meta_file <- file.path(workdir(repo), paste0(file, ".yml")) if (!file.exists(raw_file) || !file.exists(meta_file)) { stop("raw file and/or meta file missing") } diff --git a/R/repository.R b/R/repository.R index 1d303b2d6..cb2458301 100644 --- a/R/repository.R +++ b/R/repository.R @@ -719,7 +719,11 @@ strip_trailing_slash <- function(path) { ##' } workdir <- function(repo = ".") { path <- .Call(git2r_repository_workdir, lookup_repository(repo)) - strip_trailing_slash(path) + path <- strip_trailing_slash(path) + if (!inherits(repo, "git_repository") || is.null(repo$project)) { + return(path) + } + strip_trailing_slash(file.path(path, repo$project)) } ##' Find path to repository for any file diff --git a/R/write_delim_git.R b/R/write_delim_git.R index c5ad6bdd3..3374cf4d3 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -39,13 +39,8 @@ 
write_delim_git <- function(x, file, repo) { stop("repo is not a 'git_repository'") } - if (is.null(repo$project)) { - project_path <- dirname(repo$path) - } else { - project_path <- file.path(dirname(repo$path), repo$project) - } - raw_file <- file.path(project_path, paste0(file, ".tsv")) - meta_file <- file.path(project_path, paste0(file, ".yml")) + raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) + meta_file <- file.path(workdir(repo), paste0(file, ".yml")) if (!dir.exists(dirname(raw_file))) { dir.create(dirname(raw_file), recursive = TRUE) } From 66817dc11dea14b5db2b98b2bc16ea197d7b1334 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 21:17:15 +0200 Subject: [PATCH 10/52] move meta() and read_delim_git() and use 4 spaces per tab Signed-off-by: Thierry Onkelinx --- DESCRIPTION | 2 - R/meta.R | 51 --------------- R/read_delim_git.R | 63 ------------------ R/write_delim_git.R | 144 ++++++++++++++++++++++++++++++++--------- man/meta.Rd | 2 +- man/read_delim_file.Rd | 2 +- 6 files changed, 116 insertions(+), 148 deletions(-) delete mode 100644 R/meta.R delete mode 100644 R/read_delim_git.R diff --git a/DESCRIPTION b/DESCRIPTION index 820103d22..6b0a48ba0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,14 +44,12 @@ Collate: 'index.R' 'libgit2.R' 'merge.R' - 'meta.R' 'note.R' 'odb.R' 'plot.R' 'pull.R' 'punch_card.R' 'push.R' - 'read_delim_git.R' 'reference.R' 'reflog.R' 'refspec.R' diff --git a/R/meta.R b/R/meta.R deleted file mode 100644 index caca2fdd9..000000000 --- a/R/meta.R +++ /dev/null @@ -1,51 +0,0 @@ -## git2r, R bindings to the libgit2 library. -## Copyright (C) 2013-2018 The git2r contributors -## -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License, version 2, -## as published by the Free Software Foundation. 
-## -## git2r is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. -## -## You should have received a copy of the GNU General Public License along -## with this program; if not, write to the Free Software Foundation, Inc., -## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -##' Optimise a vector for storage in to a git repository and add meta data -##' @param x the vector -##' @export -meta <- function(x) { - UseMethod("meta") -} - -##' @export -meta.character <- function(x) { - attr(x, "meta") <- " class: character" - return(x) -} - -##' @export -meta.integer <- function(x) { - attr(x, "meta") <- " class: integer" - return(x) -} - -##' @export -meta.numeric <- function(x) { - attr(x, "meta") <- " class: numeric" - return(x) -} - -##' @export -meta.factor <- function(x) { - z <- as.integer(x) - attr(z, "meta") <- paste( - " class: factor\n levels:", - paste(" -", levels(x), collapse = "\n"), - sep = "\n" - ) - return(z) -} diff --git a/R/read_delim_git.R b/R/read_delim_git.R deleted file mode 100644 index 0cd4f1b11..000000000 --- a/R/read_delim_git.R +++ /dev/null @@ -1,63 +0,0 @@ -## Copyright (C) 2013-2018 The git2r contributors -## -## This program is free software; you can redistribute it and/or modify -## it under the terms of the GNU General Public License, version 2, -## as published by the Free Software Foundation. -## -## git2r is distributed in the hope that it will be useful, -## but WITHOUT ANY WARRANTY; without even the implied warranty of -## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -## GNU General Public License for more details. 
-## -## You should have received a copy of the GNU General Public License along -## with this program; if not, write to the Free Software Foundation, Inc., -## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -##' Read a \code{data.frame} from a git repository -##' -##' @inheritParams write_delim_git -##' @return The \code{data.frame} -##' @export -##' @importFrom utils read.table -read_delim_file <- function(file, repo) { - file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) - if (!inherits(repo, "git_repository")) { - stop("repo is not a 'git_repository'") - } - raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) - meta_file <- file.path(workdir(repo), paste0(file, ".yml")) - if (!file.exists(raw_file) || !file.exists(meta_file)) { - stop("raw file and/or meta file missing") - } - meta_data <- readLines(meta_file) - meta_cols <- grep("^\\S*:$", meta_data) - col_names <- gsub(":", "", meta_data[meta_cols]) - raw_data <- read.table( - file = raw_file, header = FALSE, - sep = "\t", quote = "", dec = ".", - as.is = TRUE, col.names = col_names - ) - - col_classes <- gsub(" {4}class: (.*)", "\\1", meta_data[meta_cols + 1]) - col_factor <- which(col_classes == "factor") - level_rows <- grep("^ {8}- .*$", meta_data) - level_value <- gsub("^ {8}- (.*)$", "\\1", meta_data[level_rows]) - level_id <- cumsum(c(TRUE, diff(level_rows) > 1)) - col_factor_level <- vapply( - seq_along(col_factor), - function(id) { - list(level_value[level_id == id]) - }, - list(character(0)) - ) - names(col_factor_level) <- col_names[col_factor] - for (id in names(col_factor_level)) { - raw_data[[id]] <- factor( - raw_data[[id]], - levels = seq_along(col_factor_level[[id]]), - labels = col_factor_level[[id]] - ) - } - - return(raw_data) -} diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 3374cf4d3..76cad4752 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -25,37 +25,121 @@ ##' \code{\link{repository}} ##' @return The relative path to 
the file ##' @export -##' @include meta.R ##' @importFrom utils write.table write_delim_git <- function(x, file, repo) { - if (!inherits(x, "data.frame")) { - stop("x is not a 'data.frame'") - } - if (grepl("\\..*$", basename(file))) { - warning("file extensions are stripped") + if (!inherits(x, "data.frame")) { + stop("x is not a 'data.frame'") + } + if (grepl("\\..*$", basename(file))) { + warning("file extensions are stripped") + file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) + } + if (!inherits(repo, "git_repository")) { + stop("repo is not a 'git_repository'") + } + + raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) + meta_file <- file.path(workdir(repo), paste0(file, ".yml")) + if (!dir.exists(dirname(raw_file))) { + dir.create(dirname(raw_file), recursive = TRUE) + } + raw_data <- as.data.frame(lapply(x, meta), stringsAsFactors = FALSE) + meta_data <- paste( + colnames(x), + vapply(raw_data, attr, "", which = "meta"), + sep = ":\n" + ) + writeLines(meta_data, meta_file) + write.table( + x = raw_data, file = raw_file, append = FALSE, + quote = FALSE, sep = "\t", eol = "\n", dec = ".", + row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" + ) + + return(file) +} + +##' Read a \code{data.frame} from a git repository +##' +##' @inheritParams write_delim_git +##' @return The \code{data.frame} +##' @export +##' @importFrom utils read.table +read_delim_file <- function(file, repo) { file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) - } - if (!inherits(repo, "git_repository")) { - stop("repo is not a 'git_repository'") - } - - raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) - meta_file <- file.path(workdir(repo), paste0(file, ".yml")) - if (!dir.exists(dirname(raw_file))) { - dir.create(dirname(raw_file), recursive = TRUE) - } - raw_data <- as.data.frame(lapply(x, meta), stringsAsFactors = FALSE) - meta_data <- paste( - colnames(x), - vapply(raw_data, attr, "", which = "meta"), - sep = ":\n" - ) 
- writeLines(meta_data, meta_file) - write.table( - x = raw_data, file = raw_file, append = FALSE, - quote = FALSE, sep = "\t", eol = "\n", dec = ".", - row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" - ) - - return(file) + if (!inherits(repo, "git_repository")) { + stop("repo is not a 'git_repository'") + } + raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) + meta_file <- file.path(workdir(repo), paste0(file, ".yml")) + if (!file.exists(raw_file) || !file.exists(meta_file)) { + stop("raw file and/or meta file missing") + } + meta_data <- readLines(meta_file) + meta_cols <- grep("^\\S*:$", meta_data) + col_names <- gsub(":", "", meta_data[meta_cols]) + raw_data <- read.table( + file = raw_file, header = FALSE, + sep = "\t", quote = "", dec = ".", + as.is = TRUE, col.names = col_names + ) + + col_classes <- gsub(" {4}class: (.*)", "\\1", meta_data[meta_cols + 1]) + col_factor <- which(col_classes == "factor") + level_rows <- grep("^ {8}- .*$", meta_data) + level_value <- gsub("^ {8}- (.*)$", "\\1", meta_data[level_rows]) + level_id <- cumsum(c(TRUE, diff(level_rows) > 1)) + col_factor_level <- vapply( + seq_along(col_factor), + function(id) { + list(level_value[level_id == id]) + }, + list(character(0)) + ) + names(col_factor_level) <- col_names[col_factor] + for (id in names(col_factor_level)) { + raw_data[[id]] <- factor( + raw_data[[id]], + levels = seq_along(col_factor_level[[id]]), + labels = col_factor_level[[id]] + ) + } + + return(raw_data) +} + +##' Optimise a vector for storage in to a git repository and add meta data +##' @param x the vector +##' @export +meta <- function(x) { + UseMethod("meta") +} + +##' @export +meta.character <- function(x) { + attr(x, "meta") <- " class: character" + return(x) +} + +##' @export +meta.integer <- function(x) { + attr(x, "meta") <- " class: integer" + return(x) +} + +##' @export +meta.numeric <- function(x) { + attr(x, "meta") <- " class: numeric" + return(x) +} + +##' @export +meta.factor <- 
function(x) { + z <- as.integer(x) + attr(z, "meta") <- paste( + " class: factor\n levels:", + paste(" -", levels(x), collapse = "\n"), + sep = "\n" + ) + return(z) } diff --git a/man/meta.Rd b/man/meta.Rd index 32ffaceac..c241cc064 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/meta.R +% Please edit documentation in R/write_delim_git.R \name{meta} \alias{meta} \title{Optimise a vector for storage in to a git repository and add meta data} diff --git a/man/read_delim_file.Rd b/man/read_delim_file.Rd index 7df66afcf..dc4e2a88c 100644 --- a/man/read_delim_file.Rd +++ b/man/read_delim_file.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/read_delim_git.R +% Please edit documentation in R/write_delim_git.R \name{read_delim_file} \alias{read_delim_file} \title{Read a \code{data.frame} from a git repository} From 7dfa1310542871b8322ad870f19e920b3c925549 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 21:39:06 +0200 Subject: [PATCH 11/52] write_delim_git() stages the files Signed-off-by: Thierry Onkelinx --- R/index.R | 3 +++ R/write_delim_git.R | 10 ++++++---- man/read_delim_file.Rd | 6 +++--- man/write_delim_git.Rd | 8 +++++--- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/R/index.R b/R/index.R index 3a299d724..e4881a273 100644 --- a/R/index.R +++ b/R/index.R @@ -118,6 +118,9 @@ add <- function(repo = ".", path = NULL, force = FALSE) ## directory. 
Substitute common prefix with "" sub(paste0("^", repo_wd), "", np) }, character(1)) + if (inherits(repo, "git_repository") && !is.null(repo$project)) { + path <- file.path(repo$project, path) + } .Call(git2r_index_add_all, repo, path, isTRUE(force)) diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 76cad4752..1a88fc392 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -21,12 +21,12 @@ ##' @param file the name of the file without file extension. Can include a relative ##' path. It is relative to the "project" when set in the \code{repo}. Otherwise ##' it is relative to the root of the \code{repo}. -##' @param repo a \code{git_repository} object, created with -##' \code{\link{repository}} +##' @template repo-param +##' @inheritParams add ##' @return The relative path to the file ##' @export ##' @importFrom utils write.table -write_delim_git <- function(x, file, repo) { +write_delim_git <- function(x, file, repo = ".", force = FALSE) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") } @@ -55,17 +55,19 @@ write_delim_git <- function(x, file, repo) { quote = FALSE, sep = "\t", eol = "\n", dec = ".", row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" ) + add(repo, path = paste0(file, c(".tsv", ".yml")), force = force) return(file) } ##' Read a \code{data.frame} from a git repository ##' +##' @template repo-param ##' @inheritParams write_delim_git ##' @return The \code{data.frame} ##' @export ##' @importFrom utils read.table -read_delim_file <- function(file, repo) { +read_delim_file <- function(file, repo = ".") { file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) if (!inherits(repo, "git_repository")) { stop("repo is not a 'git_repository'") diff --git a/man/read_delim_file.Rd b/man/read_delim_file.Rd index dc4e2a88c..6bc255fc1 100644 --- a/man/read_delim_file.Rd +++ b/man/read_delim_file.Rd @@ -4,15 +4,15 @@ \alias{read_delim_file} \title{Read a \code{data.frame} from a git repository} \usage{ 
-read_delim_file(file, repo) +read_delim_file(file, repo = ".") } \arguments{ \item{file}{the name of the file without file extension. Can include a relative path. It is relative to the "project" when set in the \code{repo}. Otherwise it is relative to the root of the \code{repo}.} -\item{repo}{a \code{git_repository} object, created with -\code{\link{repository}}} +\item{repo}{a path to a repository or a \code{git_repository} +object. Default is '.'} } \value{ The \code{data.frame} diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 8f16a3b24..3c1b6adff 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -4,7 +4,7 @@ \alias{write_delim_git} \title{Write a \code{data.frame} to a git repository} \usage{ -write_delim_git(x, file, repo) +write_delim_git(x, file, repo = ".", force = FALSE) } \arguments{ \item{x}{the \code{data.frame}} @@ -13,8 +13,10 @@ write_delim_git(x, file, repo) path. It is relative to the "project" when set in the \code{repo}. Otherwise it is relative to the root of the \code{repo}.} -\item{repo}{a \code{git_repository} object, created with -\code{\link{repository}}} +\item{repo}{a path to a repository or a \code{git_repository} +object. Default is '.'} + +\item{force}{Add ignored files. 
Default is FALSE.} } \value{ The relative path to the file From ebcc9c8b8de4a6b51d744ef54def7b0e7e4d60e4 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Mon, 23 Jul 2018 22:56:17 +0200 Subject: [PATCH 12/52] status() takes "project" into account Signed-off-by: Thierry Onkelinx --- R/status.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/status.R b/R/status.R index 2127c509d..1968e0ff1 100644 --- a/R/status.R +++ b/R/status.R @@ -77,9 +77,17 @@ status <- function(repo = ".", ignored = FALSE, all_untracked = FALSE) { - structure(.Call(git2r_status_list, lookup_repository(repo), staged, + s <- structure(.Call(git2r_status_list, lookup_repository(repo), staged, unstaged, untracked, all_untracked, ignored), class = "git_status") + if (!inherits(repo, "git_repository") || is.null(repo$project)) { + return(s) + } + rgx <- paste0("^", repo$project, "/") + s <- lapply(s, function(x){lapply(x, gsub, pattern = rgx, replacement = "")}) + class(s) <- "git_status" + attr(s, "project") <- repo$project + return(s) } ##' @export @@ -98,6 +106,11 @@ print.git_status <- function(x, ...) 
invisible(NULL) } + project <- attr(x, "project", exact = TRUE) + if (!is.null(project)) { + cat("Project folder:", project, "\n\n") + } + if (max(sapply(x, length)) == 0L) cat("working directory clean\n") From a9349cae7307a47739bbb15b143c838a88109bad Mon Sep 17 00:00:00 2001 From: ThierryO Date: Tue, 24 Jul 2018 12:34:23 +0200 Subject: [PATCH 13/52] rm_file() handles data repos Signed-off-by: Thierry Onkelinx --- R/index.R | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/R/index.R b/R/index.R index e4881a273..c23df6a3f 100644 --- a/R/index.R +++ b/R/index.R @@ -159,11 +159,25 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' status(repo) ##' } rm_file <- function(repo = ".", path = NULL) { - if (is.null(path) || !is.character(path)) - stop("'path' must be a character vector") + if (is.null(path) || !is.character(path)) { + if (!inherits(repo, "git_repository") || is.null(repo$project)) { + stop("'path' must be a character vector") + } + path <- list.files( + workdir(repo), + pattern = "\\.(tsv|yml)$", + recursive = TRUE + ) + } repo <- lookup_repository(repo) + if (inherits(repo, "git_repository") & !is.null(repo$project)) { + path <- file.path(dirname(path), gsub("\\..*$", "", basename(path))) + path <- normalizePath(unique(path), mustWork = FALSE) + path <- c(paste0(path, ".tsv"), paste0(path, ".yml")) + } + if (length(path)) { wd <- workdir(repo) From 6cf073753a638dae06077766f304d2311b7d9126 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Tue, 24 Jul 2018 14:31:09 +0200 Subject: [PATCH 14/52] add is_data_repo() Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/index.R | 6 +++--- R/status.R | 2 +- R/write_delim_git.R | 18 ++++++++++++------ man/is_data_repo.Rd | 17 +++++++++++++++++ 5 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 man/is_data_repo.Rd diff --git a/NAMESPACE b/NAMESPACE index 26cfd2c3f..d52335e45 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -103,6 +103,7 @@ export(is_binary) 
export(is_blob) export(is_branch) export(is_commit) +export(is_data_repo) export(is_detached) export(is_empty) export(is_head) diff --git a/R/index.R b/R/index.R index c23df6a3f..9785e81d5 100644 --- a/R/index.R +++ b/R/index.R @@ -118,7 +118,7 @@ add <- function(repo = ".", path = NULL, force = FALSE) ## directory. Substitute common prefix with "" sub(paste0("^", repo_wd), "", np) }, character(1)) - if (inherits(repo, "git_repository") && !is.null(repo$project)) { + if (is_data_repo(repo)) { path <- file.path(repo$project, path) } @@ -160,7 +160,7 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' } rm_file <- function(repo = ".", path = NULL) { if (is.null(path) || !is.character(path)) { - if (!inherits(repo, "git_repository") || is.null(repo$project)) { + if (!is_data_repo(repo)) { stop("'path' must be a character vector") } path <- list.files( @@ -172,7 +172,7 @@ rm_file <- function(repo = ".", path = NULL) { repo <- lookup_repository(repo) - if (inherits(repo, "git_repository") & !is.null(repo$project)) { + if (is_data_repo(repo)) { path <- file.path(dirname(path), gsub("\\..*$", "", basename(path))) path <- normalizePath(unique(path), mustWork = FALSE) path <- c(paste0(path, ".tsv"), paste0(path, ".yml")) diff --git a/R/status.R b/R/status.R index 1968e0ff1..e18253cd7 100644 --- a/R/status.R +++ b/R/status.R @@ -80,7 +80,7 @@ status <- function(repo = ".", s <- structure(.Call(git2r_status_list, lookup_repository(repo), staged, unstaged, untracked, all_untracked, ignored), class = "git_status") - if (!inherits(repo, "git_repository") || is.null(repo$project)) { + if (!is_data_repo(repo)) { return(s) } rgx <- paste0("^", repo$project, "/") diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 1a88fc392..dc7315df5 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -34,9 +34,7 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { warning("file extensions are stripped") file <- file.path(dirname(file), gsub("\\..*$", 
"", basename(file))) } - if (!inherits(repo, "git_repository")) { - stop("repo is not a 'git_repository'") - } + repo <- lookup_repository(repo) raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) meta_file <- file.path(workdir(repo), paste0(file, ".yml")) @@ -69,9 +67,8 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { ##' @importFrom utils read.table read_delim_file <- function(file, repo = ".") { file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) - if (!inherits(repo, "git_repository")) { - stop("repo is not a 'git_repository'") - } + repo <- lookup_repository(repo) + raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) meta_file <- file.path(workdir(repo), paste0(file, ".yml")) if (!file.exists(raw_file) || !file.exists(meta_file)) { @@ -145,3 +142,12 @@ meta.factor <- function(x) { ) return(z) } + +##' Check if object is a data repository +##' @param object the object to check +##' @value TRUE is a data \code{git_repository}, else FALSE +##' @seealso repo init +##' @export +is_data_repo <- function(object) { + inherits(object, "git_repository") && !is.null(object$project) +} diff --git a/man/is_data_repo.Rd b/man/is_data_repo.Rd new file mode 100644 index 000000000..783898fe0 --- /dev/null +++ b/man/is_data_repo.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write_delim_git.R +\name{is_data_repo} +\alias{is_data_repo} +\title{Check if object is a data repository} +\usage{ +is_data_repo(object) +} +\arguments{ +\item{object}{the object to check} +} +\description{ +Check if object is a data repository +} +\seealso{ +repo init +} From 3bfceb9444b65475f982d386cea8cea4ba1353c4 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Tue, 24 Jul 2018 14:33:11 +0200 Subject: [PATCH 15/52] Fix typo Signed-off-by: Thierry Onkelinx --- NAMESPACE | 2 +- R/write_delim_git.R | 2 +- man/{read_delim_file.Rd => read_delim_git.Rd} | 6 +++--- 3 files changed, 5 insertions(+), 5 
deletions(-) rename man/{read_delim_file.Rd => read_delim_git.Rd} (87%) diff --git a/NAMESPACE b/NAMESPACE index d52335e45..90c8b23af 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -129,7 +129,7 @@ export(parents) export(pull) export(punch_card) export(push) -export(read_delim_file) +export(read_delim_git) export(references) export(reflog) export(remote_add) diff --git a/R/write_delim_git.R b/R/write_delim_git.R index dc7315df5..5f270fece 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -65,7 +65,7 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { ##' @return The \code{data.frame} ##' @export ##' @importFrom utils read.table -read_delim_file <- function(file, repo = ".") { +read_delim_git <- function(file, repo = ".") { file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) repo <- lookup_repository(repo) diff --git a/man/read_delim_file.Rd b/man/read_delim_git.Rd similarity index 87% rename from man/read_delim_file.Rd rename to man/read_delim_git.Rd index 6bc255fc1..526fab49b 100644 --- a/man/read_delim_file.Rd +++ b/man/read_delim_git.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/write_delim_git.R -\name{read_delim_file} -\alias{read_delim_file} +\name{read_delim_git} +\alias{read_delim_git} \title{Read a \code{data.frame} from a git repository} \usage{ -read_delim_file(file, repo = ".") +read_delim_git(file, repo = ".") } \arguments{ \item{file}{the name of the file without file extension. 
Can include a relative From cdb3e8122babe0f3ae7bd6fb2b7cf0252a1d264d Mon Sep 17 00:00:00 2001 From: ThierryO Date: Tue, 24 Jul 2018 15:40:10 +0200 Subject: [PATCH 16/52] reset() handles data repositories Signed-off-by: Thierry Onkelinx --- R/index.R | 4 +--- R/reset.R | 4 ++++ R/write_delim_git.R | 42 ++++++++++++++++++++++++++---------------- man/is_data_repo.Rd | 3 +++ 4 files changed, 34 insertions(+), 19 deletions(-) diff --git a/R/index.R b/R/index.R index 9785e81d5..efbf82c97 100644 --- a/R/index.R +++ b/R/index.R @@ -173,9 +173,7 @@ rm_file <- function(repo = ".", path = NULL) { repo <- lookup_repository(repo) if (is_data_repo(repo)) { - path <- file.path(dirname(path), gsub("\\..*$", "", basename(path))) - path <- normalizePath(unique(path), mustWork = FALSE) - path <- c(paste0(path, ".tsv"), paste0(path, ".yml")) + path <- clean_data_path(path) } if (length(path)) { diff --git a/R/reset.R b/R/reset.R index d60842312..059fe0cfe 100644 --- a/R/reset.R +++ b/R/reset.R @@ -93,6 +93,10 @@ reset <- function(object, reset_type = c("soft", "mixed", "hard"), path = NULL) if (is_empty(object)) { .Call(git2r_index_remove_bypath, object, path) } else { + if (is_data_repo(object)) { + path <- file.path(object$project, path) + path <- clean_data_path(path) + } .Call(git2r_reset_default, object, path) } } diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 5f270fece..669d4c420 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -30,16 +30,17 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") } + repo <- lookup_repository(repo) + if (is_data_repo(repo)) { + file <- file.path(workdir(repo), file) + } if (grepl("\\..*$", basename(file))) { warning("file extensions are stripped") - file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) + file <- clean_data_path(file) } - repo <- lookup_repository(repo) - raw_file <- file.path(workdir(repo), paste0(file, 
".tsv")) - meta_file <- file.path(workdir(repo), paste0(file, ".yml")) - if (!dir.exists(dirname(raw_file))) { - dir.create(dirname(raw_file), recursive = TRUE) + if (!dir.exists(dirname(file["raw_file"]))) { + dir.create(dirname(file["raw_file"]), recursive = TRUE) } raw_data <- as.data.frame(lapply(x, meta), stringsAsFactors = FALSE) meta_data <- paste( @@ -47,13 +48,13 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { vapply(raw_data, attr, "", which = "meta"), sep = ":\n" ) - writeLines(meta_data, meta_file) + writeLines(meta_data, file["meta_file"]) write.table( - x = raw_data, file = raw_file, append = FALSE, + x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, sep = "\t", eol = "\n", dec = ".", row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" ) - add(repo, path = paste0(file, c(".tsv", ".yml")), force = force) + add(repo, path = file, force = force) return(file) } @@ -66,19 +67,17 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { ##' @export ##' @importFrom utils read.table read_delim_git <- function(file, repo = ".") { - file <- file.path(dirname(file), gsub("\\..*$", "", basename(file))) repo <- lookup_repository(repo) + file <- clean_data_path(file) - raw_file <- file.path(workdir(repo), paste0(file, ".tsv")) - meta_file <- file.path(workdir(repo), paste0(file, ".yml")) - if (!file.exists(raw_file) || !file.exists(meta_file)) { + if (!all(file.exists(file))) { stop("raw file and/or meta file missing") } - meta_data <- readLines(meta_file) + meta_data <- readLines(file["meta_file"]) meta_cols <- grep("^\\S*:$", meta_data) col_names <- gsub(":", "", meta_data[meta_cols]) raw_data <- read.table( - file = raw_file, header = FALSE, + file = file["raw_file"], header = FALSE, sep = "\t", quote = "", dec = ".", as.is = TRUE, col.names = col_names ) @@ -145,9 +144,20 @@ meta.factor <- function(x) { ##' Check if object is a data repository ##' @param object the object to check -##' @value TRUE is 
a data \code{git_repository}, else FALSE +##' @return TRUE is a data \code{git_repository}, else FALSE ##' @seealso repo init ##' @export is_data_repo <- function(object) { inherits(object, "git_repository") && !is.null(object$project) } + +##' Clean the data path +##' Strips any file extension from the path and adds the ".tsv" and ".yml" file extensions +##' @param path the paths +##' @return a named vector with "raw_file" and "meta_file", refering to the ".tsv" and ".yml" files +##' @noRd +clean_data_path <- function(path) { + path <- file.path(dirname(path), gsub("\\..*$", "", basename(path))) + path <- normalizePath(unique(path), mustWork = FALSE) + c(raw_file = paste0(path, ".tsv"), meta_file = paste0(path, ".yml")) +} diff --git a/man/is_data_repo.Rd b/man/is_data_repo.Rd index 783898fe0..0d29678c4 100644 --- a/man/is_data_repo.Rd +++ b/man/is_data_repo.Rd @@ -9,6 +9,9 @@ is_data_repo(object) \arguments{ \item{object}{the object to check} } +\value{ +TRUE is a data \code{git_repository}, else FALSE +} \description{ Check if object is a data repository } From 57df0333172fc33917b8d331c9ec2d2291be3115 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Tue, 24 Jul 2018 17:42:05 +0200 Subject: [PATCH 17/52] fix bugs Signed-off-by: Thierry Onkelinx --- R/index.R | 3 +++ R/write_delim_git.R | 16 ++++++++++++---- man/write_delim_git.Rd | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/R/index.R b/R/index.R index efbf82c97..910e01ce0 100644 --- a/R/index.R +++ b/R/index.R @@ -213,6 +213,9 @@ rm_file <- function(repo = ".", path = NULL) { .Call(git2r_index_remove_bypath, repo, x) }) } + if (is_data_repo(repo)) { + add(repo = repo, path = path) + } invisible(NULL) } diff --git a/R/write_delim_git.R b/R/write_delim_git.R index 669d4c420..8f7cc8091 100644 --- a/R/write_delim_git.R +++ b/R/write_delim_git.R @@ -23,7 +23,7 @@ ##' it is relative to the root of the \code{repo}. 
##' @template repo-param ##' @inheritParams add -##' @return The relative path to the file +##' @return NULL (invisible) ##' @export ##' @importFrom utils write.table write_delim_git <- function(x, file, repo = ".", force = FALSE) { @@ -36,8 +36,8 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { } if (grepl("\\..*$", basename(file))) { warning("file extensions are stripped") - file <- clean_data_path(file) } + file <- clean_data_path(file) if (!dir.exists(dirname(file["raw_file"]))) { dir.create(dirname(file["raw_file"]), recursive = TRUE) @@ -56,7 +56,7 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { ) add(repo, path = file, force = force) - return(file) + return(invisible(NULL)) } ##' Read a \code{data.frame} from a git repository @@ -68,6 +68,9 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { ##' @importFrom utils read.table read_delim_git <- function(file, repo = ".") { repo <- lookup_repository(repo) + if (is_data_repo(repo)) { + file <- file.path(workdir(repo), file) + } file <- clean_data_path(file) if (!all(file.exists(file))) { @@ -157,7 +160,12 @@ is_data_repo <- function(object) { ##' @return a named vector with "raw_file" and "meta_file", refering to the ".tsv" and ".yml" files ##' @noRd clean_data_path <- function(path) { - path <- file.path(dirname(path), gsub("\\..*$", "", basename(path))) + dir_name <- dirname(path) + not_root <- dir_name != "." + path <- gsub("\\..*$", "", basename(path)) + if (any(not_root)) { + path[not_root] <- file.path(dir_name[not_root], path[not_root]) + } path <- normalizePath(unique(path), mustWork = FALSE) c(raw_file = paste0(path, ".tsv"), meta_file = paste0(path, ".yml")) } diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 3c1b6adff..afd781e1d 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -19,7 +19,7 @@ object. Default is '.'} \item{force}{Add ignored files. 
Default is FALSE.} } \value{ -The relative path to the file +NULL (invisible) } \description{ This will create two files. The \code{".tsv"} file contains the raw data. From 8b90f133060354454852ffc054a0e2196025505e Mon Sep 17 00:00:00 2001 From: ThierryO Date: Tue, 24 Jul 2018 18:14:31 +0200 Subject: [PATCH 18/52] add unit tests for data repositories Signed-off-by: Thierry Onkelinx --- tests/data_repository.R | 80 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 tests/data_repository.R diff --git a/tests/data_repository.R b/tests/data_repository.R new file mode 100644 index 000000000..9ec2342b1 --- /dev/null +++ b/tests/data_repository.R @@ -0,0 +1,80 @@ +## git2r, R bindings to the libgit2 library. +## Copyright (C) 2013-2018 The git2r contributors +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License, version 2, +## as published by the Free Software Foundation. +## +## git2r is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +library("git2r") + +## For debugging +sessionInfo() + +## Create a directory in tempdir +path <- tempfile(pattern = "git2r-") +dir.create(path) + +## Initialize a data repository +data_repo <- init(path, project = "test") +stopifnot(inherits(data_repo, "git_repository")) +stopifnot(all.equal(data_repo$path, file.path(path, ".git"))) +stopifnot(all.equal(data_repo$project, "test")) +config(data_repo, user.name = "Alice", user.email = "alice@example.org") + +stopifnot(all.equal( + tools::assertError(write_delim_git(NA, "test", data_repo))[[1]][["message"]], + "x is not a 'data.frame'" +)) + +x <- data.frame( + x = LETTERS, + y = factor( + sample(c("a", "b", NA), 26, replace = TRUE), + levels = c("a", "b", "c") + ), + z = c(NA, 1:25), + abc = c(rnorm(25), NA), + stringsAsFactors = FALSE +) +tools::assertWarning(wdg <- write_delim_git(x, "test.txt", data_repo)) +z <- status(data_repo) +print(z) +stopifnot( + all.equal(z$s, list(new = "test.tsv", new = "test.yml")) +) +all.equal( + x, + read_delim_git("test", data_repo) +) +stopifnot(all.equal( + tools::assertError(read_delim_git(NA, data_repo))[[1]][["message"]], + "raw file and/or meta file missing" +)) +write_delim_git(x, "junk/test", data_repo) +commit(data_repo, "test") +rm_file(data_repo, "junk/test") +stopifnot( + all.equal( + status(data_repo)$s, + list(deleted = "junk/test.tsv", deleted = "junk/test.yml") + ) +) +rm_file(data_repo) +stopifnot( + all.equal( + status(data_repo)$s, + list( + deleted = "junk/test.tsv", deleted = "junk/test.yml", + deleted = "test.tsv", deleted = "test.yml" + ) + ) +) From 8566c4128790f8fa7249b556f569e790f7b9ac47 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Wed, 25 Jul 2018 10:05:24 +0200 Subject: [PATCH 19/52] Create a "data_repository" class Signed-off-by: Thierry Onkelinx --- DESCRIPTION | 2 +- NAMESPACE | 5 ++++ R/{write_delim_git.R => data_repository.R} | 16 +++++++------ R/merge.R | 7 +++++- R/plot.R | 8 +++++++ R/repository.R | 28 ++++++++++++++++++---- 
man/is_data_repo.Rd | 2 +- man/meta.Rd | 2 +- man/read_delim_git.Rd | 2 +- man/repository.Rd | 11 ++++++--- man/write_delim_git.Rd | 2 +- 11 files changed, 65 insertions(+), 20 deletions(-) rename R/{write_delim_git.R => data_repository.R} (93%) diff --git a/DESCRIPTION b/DESCRIPTION index 6b0a48ba0..63010414b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,6 +38,7 @@ Collate: 'config.R' 'contributions.R' 'credential.R' + 'data_repository.R' 'diff.R' 'fetch.R' 'git2r.R' @@ -65,5 +66,4 @@ Collate: 'time.R' 'tree.R' 'when.R' - 'write_delim_git.R' RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 90c8b23af..a33728f32 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method("[",git_tree) S3method(as.POSIXct,git_time) S3method(as.character,git_time) +S3method(as.data.frame,data_repository) S3method(as.data.frame,git_commit) S3method(as.data.frame,git_repository) S3method(as.data.frame,git_tree) @@ -20,13 +21,16 @@ S3method(length,git_blob) S3method(length,git_diff) S3method(length,git_tree) S3method(merge,character) +S3method(merge,data_repository) S3method(merge,git_branch) S3method(merge,git_repository) S3method(meta,character) S3method(meta,factor) S3method(meta,integer) S3method(meta,numeric) +S3method(plot,data_repository) S3method(plot,git_repository) +S3method(print,data_repository) S3method(print,git_blob) S3method(print,git_branch) S3method(print,git_commit) @@ -53,6 +57,7 @@ S3method(sha,git_reference) S3method(sha,git_reflog_entry) S3method(sha,git_tag) S3method(sha,git_tree) +S3method(summary,data_repository) S3method(summary,git_commit) S3method(summary,git_diff) S3method(summary,git_repository) diff --git a/R/write_delim_git.R b/R/data_repository.R similarity index 93% rename from R/write_delim_git.R rename to R/data_repository.R index 8f7cc8091..20bf096ab 100644 --- a/R/write_delim_git.R +++ b/R/data_repository.R @@ -30,13 +30,13 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { if (!inherits(x, "data.frame")) { 
stop("x is not a 'data.frame'") } - repo <- lookup_repository(repo) - if (is_data_repo(repo)) { - file <- file.path(workdir(repo), file) + if (!is_data_repo(repo)) { + stop("repo is not a 'data_repository'") } if (grepl("\\..*$", basename(file))) { warning("file extensions are stripped") } + file <- file.path(workdir(repo), file) file <- clean_data_path(file) if (!dir.exists(dirname(file["raw_file"]))) { @@ -67,10 +67,10 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { ##' @export ##' @importFrom utils read.table read_delim_git <- function(file, repo = ".") { - repo <- lookup_repository(repo) - if (is_data_repo(repo)) { - file <- file.path(workdir(repo), file) + if (!is_data_repo(repo)) { + stop("repo is not a 'data_repository'") } + file <- file.path(workdir(repo), file) file <- clean_data_path(file) if (!all(file.exists(file))) { @@ -151,7 +151,9 @@ meta.factor <- function(x) { ##' @seealso repo init ##' @export is_data_repo <- function(object) { - inherits(object, "git_repository") && !is.null(object$project) + inherits(object, "git_repository") && + inherits(object, "data_repository") && + !is.null(object$project) } ##' Clean the data path diff --git a/R/merge.R b/R/merge.R index 548df6c48..c43a02c9f 100644 --- a/R/merge.R +++ b/R/merge.R @@ -95,7 +95,12 @@ merge.git_repository <- function(x, y = NULL, commit_on_success = TRUE, merge.git_branch(b, commit_on_success = commit_on_success, merger = merger, fail = fail) } - +##' @export +merge.data_repository <- function(x, y = NULL, commit_on_success = TRUE, + merger = NULL, fail = FALSE, ...) +{ + NextMethod() +} ##' @export ##' @rdname merge merge.character <- function(x = ".", y = NULL, commit_on_success = TRUE, diff --git a/R/plot.R b/R/plot.R index e9c207aec..312ac4a43 100644 --- a/R/plot.R +++ b/R/plot.R @@ -65,3 +65,11 @@ plot.git_repository <- function(x, mp <- barplot(df$n, xlab = xlab, ylab = ylab, main = main, ...) 
axis(1, at = mp, labels = seq(min(df$when), max(df$when), breaks)) } +##' @export +plot.data_repository <- function(x, + breaks = c("month", "year", "quarter", "week", "day"), + main = NULL, + ...) +{ + NextMethod() +} diff --git a/R/repository.R b/R/repository.R index cb2458301..f5f2269d7 100644 --- a/R/repository.R +++ b/R/repository.R @@ -79,6 +79,10 @@ as.data.frame.git_repository <- function(x, ...) { do.call("rbind", lapply(commits(x), as.data.frame)) } +##' @export +as.data.frame.data_repository <- function(x, ...) { + NextMethod() +} ##' Open a repository ##' @@ -86,14 +90,19 @@ as.data.frame.git_repository <- function(x, ...) { ##' @param discover Discover repository from path. Default is TRUE. ##' @param project The name of of project. Refers to a local path in case of a ##' data repository. Defaults to \code{NULL}, indicating a standard repository. -##' @return A \code{git_repository} object with entries: +##' @return Either a \code{git_repository} object with entries: +##' \describe{ +##' \item{path}{ +##' Path to a git repository +##' } +##' } +##' or a \code{data_repository} object with entries: ##' \describe{ ##' \item{path}{ ##' Path to a git repository ##' } ##' \item{project}{ -##' If set, the local path to the project starting from the root of the -##' repository +##' The local path to the project starting from the root of the repository ##' } ##' } ##' @export @@ -181,7 +190,10 @@ repository <- function(path = ".", discover = TRUE, project = NULL) { if (!dir.exists(local_path)) { dir.create(local_path, recursive = TRUE) } - structure(list(path = path, project = project), class = "git_repository") + structure( + list(path = path, project = project), + class = c("data_repository", "git_repository") + ) } ##' Init a repository @@ -605,6 +617,10 @@ print.git_repository <- function(x, ...) { h$summary)) } } +##' @export +print.data_repository <- function(x, ...) 
{ + NextMethod() +} ##' Summary of repository ##' @@ -690,6 +706,10 @@ summary.git_repository <- function(object, ...) { invisible(NULL) } +##' @export +summary.data_repository <- function(object, ...) { + NextMethod() +} ## Strip trailing slash or backslash, unless it's the current drive ## root (/) or a Windows drive, for example, 'c:\'. diff --git a/man/is_data_repo.Rd b/man/is_data_repo.Rd index 0d29678c4..1a1e1d6fb 100644 --- a/man/is_data_repo.Rd +++ b/man/is_data_repo.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/write_delim_git.R +% Please edit documentation in R/data_repository.R \name{is_data_repo} \alias{is_data_repo} \title{Check if object is a data repository} diff --git a/man/meta.Rd b/man/meta.Rd index c241cc064..cdb2de3de 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/write_delim_git.R +% Please edit documentation in R/data_repository.R \name{meta} \alias{meta} \title{Optimise a vector for storage in to a git repository and add meta data} diff --git a/man/read_delim_git.Rd b/man/read_delim_git.Rd index 526fab49b..fd760dccc 100644 --- a/man/read_delim_git.Rd +++ b/man/read_delim_git.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/write_delim_git.R +% Please edit documentation in R/data_repository.R \name{read_delim_git} \alias{read_delim_git} \title{Read a \code{data.frame} from a git repository} diff --git a/man/repository.Rd b/man/repository.Rd index 9db0f29f7..6a81a5cde 100644 --- a/man/repository.Rd +++ b/man/repository.Rd @@ -15,14 +15,19 @@ repository(path = ".", discover = TRUE, project = NULL) data repository. 
Defaults to \code{NULL}, indicating a standard repository.} } \value{ -A \code{git_repository} object with entries: +Either a \code{git_repository} object with entries: +\describe{ + \item{path}{ + Path to a git repository + } +} +or a \code{data_repository} object with entries: \describe{ \item{path}{ Path to a git repository } \item{project}{ - If set, the local path to the project starting from the root of the - repository + The local path to the project starting from the root of the repository } } } diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index afd781e1d..9b5bbd387 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/write_delim_git.R +% Please edit documentation in R/data_repository.R \name{write_delim_git} \alias{write_delim_git} \title{Write a \code{data.frame} to a git repository} From 50df269fce78f7582be2e17c549dcd422ea418bc Mon Sep 17 00:00:00 2001 From: ThierryO Date: Wed, 25 Jul 2018 11:05:19 +0200 Subject: [PATCH 20/52] write_delim_git() maintains order of variables Swapping two variables when rewritten a data.frame results in a large diff while the information content of the data hasn't changed. Therefore the variables will be reordered to match the original order. Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 42 ++++++++++++++++++++++++++++++++++++++++-- man/write_delim_git.Rd | 6 +++++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index 20bf096ab..92d7bffe1 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -22,11 +22,14 @@ ##' path. It is relative to the "project" when set in the \code{repo}. Otherwise ##' it is relative to the root of the \code{repo}. ##' @template repo-param +##' @param override Ignore existing meta data. This is required when new +##' variables are added or variables are deleted. 
Setting this to TRUE can +##' potentially lead to large diffs. Defaults to FALSE. ##' @inheritParams add ##' @return NULL (invisible) ##' @export ##' @importFrom utils write.table -write_delim_git <- function(x, file, repo = ".", force = FALSE) { +write_delim_git <- function(x, file, repo = ".", override = FALSE, force = FALSE) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") } @@ -48,7 +51,15 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { vapply(raw_data, attr, "", which = "meta"), sep = ":\n" ) - writeLines(meta_data, file["meta_file"]) + if (override || !file.exists(file["meta_file"])) { + writeLines(meta_data, file["meta_file"]) + } else { + meta_data <- compare_meta( + meta_data, + old_meta_data = readLines(file["meta_file"]) + ) + raw_data <- raw_data[gsub("(\\S*?):.*", "\\1", meta_data)] + } write.table( x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, sep = "\t", eol = "\n", dec = ".", @@ -59,6 +70,33 @@ write_delim_git <- function(x, file, repo = ".", force = FALSE) { return(invisible(NULL)) } +compare_meta <- function(meta_data, old_meta_data) { + meta_cols <- grep("^\\S*:$", old_meta_data) + if (length(meta_cols) != length(meta_data)) { + stop("old data has different number of variables, use override = TRUE") + } + old_col_names <- gsub(":", "", old_meta_data[meta_cols]) + col_names <- gsub("(\\S*?):.*", "\\1", meta_data) + if (!all(sort(col_names) == sort(old_col_names))) { + stop("old data has different variables, use override = TRUE") + } + positions <- cbind( + start = meta_cols, + end = c(tail(meta_cols, -1) - 1, length(old_meta_data)) + ) + old_meta_data <- apply( + positions, + 1, + function(i) { + paste(old_meta_data[i["start"]:i["end"]], collapse = "\n") + } + ) + if (!all(sort(meta_data) == sort(old_meta_data))) { + stop("old data has different variable types, use override = TRUE") + } + return(old_meta_data) +} + ##' Read a \code{data.frame} from a git repository ##' ##' @template 
repo-param diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 9b5bbd387..313749333 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -4,7 +4,7 @@ \alias{write_delim_git} \title{Write a \code{data.frame} to a git repository} \usage{ -write_delim_git(x, file, repo = ".", force = FALSE) +write_delim_git(x, file, repo = ".", override = FALSE, force = FALSE) } \arguments{ \item{x}{the \code{data.frame}} @@ -16,6 +16,10 @@ it is relative to the root of the \code{repo}.} \item{repo}{a path to a repository or a \code{git_repository} object. Default is '.'} +\item{override}{Ignore existing meta data. This is required when new +variables are added or variables are deleted. Setting this to TRUE can +potentially lead to large diffs. Defaults to FALSE.} + \item{force}{Add ignored files. Default is FALSE.} } \value{ From 886979fcce7280d6700e901c32d6d8a4f058be72 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Wed, 25 Jul 2018 11:58:34 +0200 Subject: [PATCH 21/52] write_delim_git() sorts the data prior to writing When a line is moved in a file, the resulting diff is a deletion at the original location and an addition at the new location. Changing the order of the observations in a data.frame does not change the information content. Sorting the data before writing avoids unnecessary diffs. 
Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 73 ++++++++++++++++++++++++++++++------------ man/write_delim_git.Rd | 7 +++- 3 files changed, 59 insertions(+), 22 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index a33728f32..21d0bf78a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -176,5 +176,6 @@ importFrom(utils,capture.output) importFrom(utils,head) importFrom(utils,read.table) importFrom(utils,sessionInfo) +importFrom(utils,tail) importFrom(utils,write.table) useDynLib(git2r, .registration=TRUE) diff --git a/R/data_repository.R b/R/data_repository.R index 92d7bffe1..07bd62a34 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -22,20 +22,33 @@ ##' path. It is relative to the "project" when set in the \code{repo}. Otherwise ##' it is relative to the root of the \code{repo}. ##' @template repo-param +##' @param sorting a vector of column names defining which columns to use for +##' sorting \code{x} and in what order to use them. Defaults to +##' \code{colnames(x)} ##' @param override Ignore existing meta data. This is required when new ##' variables are added or variables are deleted. Setting this to TRUE can ##' potentially lead to large diffs. Defaults to FALSE. 
##' @inheritParams add ##' @return NULL (invisible) ##' @export -##' @importFrom utils write.table -write_delim_git <- function(x, file, repo = ".", override = FALSE, force = FALSE) { +##' @importFrom utils tail write.table +write_delim_git <- function( + x, file, repo = ".", sorting, override = FALSE, force = FALSE +) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") } if (!is_data_repo(repo)) { stop("repo is not a 'data_repository'") } + if (!missing(sorting)) { + if (length(sorting) == 0) { + stop("at least one variable is required for sorting") + } + if (!all(sorting %in% colnames(x))) { + stop("use only variables of 'x' for sorting") + } + } if (grepl("\\..*$", basename(file))) { warning("file extensions are stripped") } @@ -51,15 +64,45 @@ write_delim_git <- function(x, file, repo = ".", override = FALSE, force = FALSE vapply(raw_data, attr, "", which = "meta"), sep = ":\n" ) + names(meta_data) <- colnames(x) if (override || !file.exists(file["meta_file"])) { + if (missing(sorting)) { + sorting <- colnames(x) + } + to_sort <- colnames(x) %in% sorting + meta_data <- meta_data[c(sorting, colnames(x)[!to_sort])] + meta_data[sorting] <- paste0(meta_data[sorting], "\n sort") writeLines(meta_data, file["meta_file"]) } else { - meta_data <- compare_meta( - meta_data, - old_meta_data = readLines(file["meta_file"]) + old_meta_data <- readLines(file["meta_file"]) + meta_cols <- grep("^\\S*:$", old_meta_data) + positions <- cbind( + start = meta_cols, + end = c(tail(meta_cols, -1) - 1, length(old_meta_data)) + ) + old_meta_data <- apply( + positions, + 1, + function(i) { + paste(old_meta_data[i["start"]:i["end"]], collapse = "\n") + } ) - raw_data <- raw_data[gsub("(\\S*?):.*", "\\1", meta_data)] + if (missing(sorting)) { + sorting <- grep(".*sort", old_meta_data) + sorting <- gsub("(\\S*?):\n.*", "\\1", old_meta_data)[sorting] + if (!all(sorting %in% colnames(x))) { + stop("new data lacks old sorting variable, use override = TRUE") + } + } + to_sort 
<- colnames(x) %in% sorting + meta_data <- meta_data[c(sorting, colnames(x)[!to_sort])] + meta_data[sorting] <- paste0(meta_data[sorting], "\n sort") + meta_data <- compare_meta(meta_data, old_meta_data) } + # order the variables + raw_data <- raw_data[gsub("(\\S*?):.*", "\\1", meta_data)] + # order the observations + raw_data <- raw_data[do.call(order, raw_data[sorting]), ] write.table( x = raw_data, file = file["raw_file"], append = FALSE, quote = FALSE, sep = "\t", eol = "\n", dec = ".", @@ -71,28 +114,16 @@ write_delim_git <- function(x, file, repo = ".", override = FALSE, force = FALSE } compare_meta <- function(meta_data, old_meta_data) { - meta_cols <- grep("^\\S*:$", old_meta_data) - if (length(meta_cols) != length(meta_data)) { + if (length(old_meta_data) != length(meta_data)) { stop("old data has different number of variables, use override = TRUE") } - old_col_names <- gsub(":", "", old_meta_data[meta_cols]) + old_col_names <- gsub("(\\S*?):.*", "\\1", old_meta_data) col_names <- gsub("(\\S*?):.*", "\\1", meta_data) if (!all(sort(col_names) == sort(old_col_names))) { stop("old data has different variables, use override = TRUE") } - positions <- cbind( - start = meta_cols, - end = c(tail(meta_cols, -1) - 1, length(old_meta_data)) - ) - old_meta_data <- apply( - positions, - 1, - function(i) { - paste(old_meta_data[i["start"]:i["end"]], collapse = "\n") - } - ) if (!all(sort(meta_data) == sort(old_meta_data))) { - stop("old data has different variable types, use override = TRUE") +stop("old data has different variable types or sorting, use override = TRUE") } return(old_meta_data) } diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 313749333..2d983d617 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -4,7 +4,8 @@ \alias{write_delim_git} \title{Write a \code{data.frame} to a git repository} \usage{ -write_delim_git(x, file, repo = ".", override = FALSE, force = FALSE) +write_delim_git(x, file, repo = ".", sorting, override 
= FALSE, + force = FALSE) } \arguments{ \item{x}{the \code{data.frame}} @@ -16,6 +17,10 @@ it is relative to the root of the \code{repo}.} \item{repo}{a path to a repository or a \code{git_repository} object. Default is '.'} +\item{sorting}{a vector of column names defining which columns to use for +sorting \code{x} and in what order to use them. Defaults to +\code{colnames(x)}} + \item{override}{Ignore existing meta data. This is required when new variables are added or variables are deleted. Setting this to TRUE can potentially lead to large diffs. Defaults to FALSE.} From 2f24fcb2f7be97fc5963201270c2b572f6888752 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Wed, 25 Jul 2018 16:09:55 +0200 Subject: [PATCH 22/52] update unit tests Signed-off-by: Thierry Onkelinx --- tests/data_repository.R | 72 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/tests/data_repository.R b/tests/data_repository.R index 9ec2342b1..d9f7763bf 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -45,20 +45,69 @@ x <- data.frame( abc = c(rnorm(25), NA), stringsAsFactors = FALSE ) -tools::assertWarning(wdg <- write_delim_git(x, "test.txt", data_repo)) + +stopifnot(all.equal( + tools::assertWarning( + write_delim_git(x, "test.txt", data_repo) + )[[1]]$message, + "file extensions are stripped" +)) z <- status(data_repo) print(z) stopifnot( all.equal(z$s, list(new = "test.tsv", new = "test.yml")) ) +write_delim_git(x, "test", data_repo) +stopifnot(all.equal(status(data_repo), z)) +stopifnot(all.equal( + tools::assertError( + write_delim_git(x[, 1:3], "test", data_repo) + )[[1]][["message"]], + "new data lacks old sorting variable, use override = TRUE" +)) +y <- x +y$junk <- x$x +stopifnot(all.equal( + tools::assertError( + write_delim_git(y, "test", data_repo) + )[[1]][["message"]], + "old data has different number of variables, use override = TRUE" +)) +y <- x +y$x <- factor(y$x) +stopifnot(all.equal( + tools::assertError( 
+ write_delim_git(y, "test", data_repo) + )[[1]][["message"]], + "old data has different variable types or sorting, use override = TRUE" +)) all.equal( x, read_delim_git("test", data_repo) ) +write_delim_git(x, "test", data_repo, sorting = c("y", "x"), override = TRUE) +x_sorted <- x[do.call(order, x[c("y", "x")]), c("y", "x", "z", "abc")] +rownames(x_sorted) <- NULL +stopifnot(all.equal(x_sorted, read_delim_git("test", data_repo))) +y <- x +y$abc <- NULL +y$xyz <- x$abc +stopifnot(all.equal( + tools::assertError( + write_delim_git(y, "test", data_repo) + )[[1]][["message"]], + "old data has different variables, use override = TRUE" +)) + stopifnot(all.equal( - tools::assertError(read_delim_git(NA, data_repo))[[1]][["message"]], + tools::assertError(read_delim_git("", data_repo))[[1]][["message"]], "raw file and/or meta file missing" )) +stopifnot(all.equal( + tools::assertError(read_delim_git("test", "."))[[1]][["message"]], + "repo is not a 'data_repository'" +)) + write_delim_git(x, "junk/test", data_repo) commit(data_repo, "test") rm_file(data_repo, "junk/test") @@ -78,3 +127,22 @@ stopifnot( ) ) ) + +stopifnot(all.equal( + tools::assertError( + write_delim_git(x, "test", repository(path)) + )[[1]][["message"]], + "repo is not a 'data_repository'" +)) +stopifnot(all.equal( + tools::assertError( + write_delim_git(x, "test", sorting = character(0), data_repo) + )[[1]][["message"]], + "at least one variable is required for sorting" +)) +stopifnot(all.equal( + tools::assertError( + write_delim_git(x, "test", sorting = "junk", data_repo) + )[[1]][["message"]], + "use only variables of 'x' for sorting" +)) From edf9ee785a759c5f373b8be3d1e0589c9a0ae08d Mon Sep 17 00:00:00 2001 From: ThierryO Date: Wed, 25 Jul 2018 17:22:09 +0200 Subject: [PATCH 23/52] rm_file() can delete all .tsv. 
or all .yml files Signed-off-by: Thierry Onkelinx --- R/index.R | 53 +++++++++++++++++++++++++++++++++-------- R/reset.R | 2 +- man/rm_file.Rd | 20 +++++++++++++++- tests/data_repository.R | 14 ++++++++--- 4 files changed, 74 insertions(+), 15 deletions(-) diff --git a/R/index.R b/R/index.R index 910e01ce0..c1f1193fa 100644 --- a/R/index.R +++ b/R/index.R @@ -132,8 +132,25 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' @template repo-param ##' @param path character vector with filenames to remove. The path ##' must be relative to the repository's working folder. Only -##' files known to Git are removed. +##' files known to Git are removed. Works different in case of +##' \code{data_repository}. See details. ##' @return invisible(NULL) +##' @details +##' In case of a \code{data_repository}, there are three options for \code{path} +##' \enumerate{ +##' \item{a vector of file names as used in \code{\link{write_delim_git}}}. +##' This will remove all associated \code{.tsv} and \code{.yml} files +##' \item{\code{".tsv"}} will remove ALL \code{.tsv} files. +##' \item{\code{".yml"}} will remove all \code{.yml} files which have no +##' associated \code{.tsv} file +##' } +##' +##' \code{path = ".tsv"} is useful when updating a \code{data_repository} with a +##' variable number of files. First use \code{rm_file(repo, path = ".tsv")} to +##' remove all \code{.tsv} files. Then use \code{write_delim_git()} the store +##' all the data.frames. End by using \code{rm_file(repo, path = ".yml")}, which +##' will clean any left-over \code{.yml} files. As a result, any data.frame +##' which wasn't rewritten will be deleted. 
##' @export ##' @examples ##' \dontrun{ @@ -160,20 +177,36 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' } rm_file <- function(repo = ".", path = NULL) { if (is.null(path) || !is.character(path)) { - if (!is_data_repo(repo)) { - stop("'path' must be a character vector") - } - path <- list.files( - workdir(repo), - pattern = "\\.(tsv|yml)$", - recursive = TRUE - ) + stop("'path' must be a character vector") } repo <- lookup_repository(repo) if (is_data_repo(repo)) { - path <- clean_data_path(path) + if (length(path) == 1 && path %in% c(".tsv", ".yml")) { + if (path == ".tsv") { + path <- list.files( + workdir(repo), + pattern = "\\.tsv$", + recursive = TRUE + ) + } else { + yml <- list.files( + workdir(repo), + pattern = "\\.yml$", + recursive = TRUE + ) + tsv <- list.files( + workdir(repo), + pattern = "\\.tsv$", + recursive = TRUE + ) + both <- gsub("\\.yml$", "", yml) %in% gsub("\\.tsv$", "", tsv) + path <- yml[!both] + } + } else { + path <- clean_data_path(path) + } } if (length(path)) { diff --git a/R/reset.R b/R/reset.R index 059fe0cfe..9ce605189 100644 --- a/R/reset.R +++ b/R/reset.R @@ -93,7 +93,7 @@ reset <- function(object, reset_type = c("soft", "mixed", "hard"), path = NULL) if (is_empty(object)) { .Call(git2r_index_remove_bypath, object, path) } else { - if (is_data_repo(object)) { + if (is_data_repo(object) && !is.null(path)) { path <- file.path(object$project, path) path <- clean_data_path(path) } diff --git a/man/rm_file.Rd b/man/rm_file.Rd index 13219dd6b..6c93bc048 100644 --- a/man/rm_file.Rd +++ b/man/rm_file.Rd @@ -12,7 +12,8 @@ object. Default is '.'} \item{path}{character vector with filenames to remove. The path must be relative to the repository's working folder. Only -files known to Git are removed.} +files known to Git are removed. Works different in case of +\code{data_repository}. 
See details.} } \value{ invisible(NULL) @@ -20,6 +21,23 @@ invisible(NULL) \description{ Remove files from the working tree and from the index } +\details{ +In case of a \code{data_repository}, there are three options for \code{path} +\enumerate{ + \item{a vector of file names as used in \code{\link{write_delim_git}}}. + This will remove all associated \code{.tsv} and \code{.yml} files + \item{\code{".tsv"}} will remove ALL \code{.tsv} files. + \item{\code{".yml"}} will remove all \code{.yml} files which have no + associated \code{.tsv} file +} + +\code{path = ".tsv"} is useful when updating a \code{data_repository} with a +variable number of files. First use \code{rm_file(repo, path = ".tsv")} to +remove all \code{.tsv} files. Then use \code{write_delim_git()} the store +all the data.frames. End by using \code{rm_file(repo, path = ".yml")}, which +will clean any left-over \code{.yml} files. As a result, any data.frame +which wasn't rewritten will be deleted. +} \examples{ \dontrun{ ## Initialize a repository diff --git a/tests/data_repository.R b/tests/data_repository.R index d9f7763bf..ab62921bc 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -110,14 +110,22 @@ stopifnot(all.equal( write_delim_git(x, "junk/test", data_repo) commit(data_repo, "test") -rm_file(data_repo, "junk/test") +rm_file(data_repo, ".tsv") +stopifnot( + all.equal( + status(data_repo)$s, + list(deleted = "junk/test.tsv", deleted = "test.tsv") + ) +) +write_delim_git(x, "junk/test", data_repo) +rm_file(data_repo, ".yml") stopifnot( all.equal( status(data_repo)$s, - list(deleted = "junk/test.tsv", deleted = "junk/test.yml") + list(deleted = "test.tsv", deleted = "test.yml") ) ) -rm_file(data_repo) +rm_file(data_repo, "junk/test") stopifnot( all.equal( status(data_repo)$s, From 84c96f9f28eec4ea976e33d3030451b2784fb5e1 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Thu, 26 Jul 2018 13:48:29 +0200 Subject: [PATCH 24/52] meta.character() checks for 'NA' values Signed-off-by: 
Thierry Onkelinx --- R/data_repository.R | 16 ++++++++++++++++ man/meta.Rd | 10 ++++++++++ tests/data_repository.R | 11 +++++++++++ 3 files changed, 37 insertions(+) diff --git a/R/data_repository.R b/R/data_repository.R index 07bd62a34..d4459f5b3 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -180,6 +180,15 @@ read_delim_git <- function(file, repo = ".") { ##' Optimise a vector for storage in to a git repository and add meta data ##' @param x the vector +##' @details +##' \itemize{ +##' \item \code{meta.character} checks for the presence of \code{'NA'}. +##' Because \code{\link{write_delim_git}} stores the data unquoted, +##' \code{'NA'} and \code{NA} result in the same value in the file. Hence +##' \code{\link{read_delim_git}} would report \code{'NA'} as \code{NA}. +##' Therefore \code{meta.character} will throw an error when \code{'NA'} is +##' detected. +##' } ##' @export meta <- function(x) { UseMethod("meta") @@ -188,6 +197,13 @@ meta <- function(x) { ##' @export meta.character <- function(x) { attr(x, "meta") <- " class: character" + if (any(is.na(x))) { + stop( +"The string 'NA' cannot be stored because it would be indistinguishable from the +missing value NA. Please replace or remove any 'NA' strings. Consider using a +factor." + ) + } return(x) } diff --git a/man/meta.Rd b/man/meta.Rd index cdb2de3de..dbf90e058 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -12,3 +12,13 @@ meta(x) \description{ Optimise a vector for storage in to a git repository and add meta data } +\details{ +\itemize{ + \item \code{meta.character} checks for the presence of \code{'NA'}. + Because \code{\link{write_delim_git}} stores the data unquoted, + \code{'NA'} and \code{NA} result in the same value in the file. Hence + \code{\link{read_delim_git}} would report \code{'NA'} as \code{NA}. + Therefore \code{meta.character} will throw an error when \code{'NA'} is + detected. 
+} +} diff --git a/tests/data_repository.R b/tests/data_repository.R index ab62921bc..739a5881e 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -98,6 +98,17 @@ stopifnot(all.equal( )[[1]][["message"]], "old data has different variables, use override = TRUE" )) +y <- x +y$x[1] <- NA +stopifnot( + grepl( + "^The string 'NA' cannot be stored", + tools::assertError( + write_delim_git(y, "test", data_repo) + )[[1]]["message"] + ) +) + stopifnot(all.equal( tools::assertError(read_delim_git("", data_repo))[[1]][["message"]], From 15992065632a8f27858114b00d1caac2ef45291f Mon Sep 17 00:00:00 2001 From: ThierryO Date: Thu, 26 Jul 2018 15:29:19 +0200 Subject: [PATCH 25/52] allow the storage of logicals in data repositories Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 13 +++++++++++++ tests/data_repository.R | 12 ++++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 21d0bf78a..4fb2c208e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ S3method(merge,git_repository) S3method(meta,character) S3method(meta,factor) S3method(meta,integer) +S3method(meta,logical) S3method(meta,numeric) S3method(plot,data_repository) S3method(plot,git_repository) diff --git a/R/data_repository.R b/R/data_repository.R index d4459f5b3..160e003bc 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -155,6 +155,7 @@ read_delim_git <- function(file, repo = ".") { ) col_classes <- gsub(" {4}class: (.*)", "\\1", meta_data[meta_cols + 1]) + col_factor <- which(col_classes == "factor") level_rows <- grep("^ {8}- .*$", meta_data) level_value <- gsub("^ {8}- (.*)$", "\\1", meta_data[level_rows]) @@ -175,6 +176,11 @@ read_delim_git <- function(file, repo = ".") { ) } + col_logical <- which(col_classes == "logical") + for (id in col_logical) { + raw_data[[id]] <- as.logical(raw_data[[id]]) + } + return(raw_data) } @@ -230,6 +236,13 @@ meta.factor <- function(x) { return(z) } +##' @export 
+meta.logical <- function(x) { + x <- as.integer(x) + attr(x, "meta") <- " class: logical" + return(x) +} + ##' Check if object is a data repository ##' @param object the object to check ##' @return TRUE is a data \code{git_repository}, else FALSE diff --git a/tests/data_repository.R b/tests/data_repository.R index 739a5881e..7ae1c818f 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -81,10 +81,10 @@ stopifnot(all.equal( )[[1]][["message"]], "old data has different variable types or sorting, use override = TRUE" )) -all.equal( +stopifnot(all.equal( x, read_delim_git("test", data_repo) -) +)) write_delim_git(x, "test", data_repo, sorting = c("y", "x"), override = TRUE) x_sorted <- x[do.call(order, x[c("y", "x")]), c("y", "x", "z", "abc")] rownames(x_sorted) <- NULL @@ -165,3 +165,11 @@ stopifnot(all.equal( )[[1]][["message"]], "use only variables of 'x' for sorting" )) + +y <- x +y$logic <- sample(c(TRUE, FALSE, NA), replace = TRUE, size = nrow(x)) +write_delim_git(y, "logical", data_repo, sorting = c("y", "logic")) +z <- read_delim_git("logical", data_repo) +y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] +rownames(y.sorted) <- NULL +stopifnot(all.equal(y.sorted, z)) From 091f9b0d297191934f6f2b5f5d8d11292866c05a Mon Sep 17 00:00:00 2001 From: ThierryO Date: Thu, 26 Jul 2018 16:02:47 +0200 Subject: [PATCH 26/52] data repositories handle complex data Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 6 ++++++ tests/data_repository.R | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 4fb2c208e..77113cd9c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ S3method(merge,data_repository) S3method(merge,git_branch) S3method(merge,git_repository) S3method(meta,character) +S3method(meta,complex) S3method(meta,factor) S3method(meta,integer) S3method(meta,logical) diff --git a/R/data_repository.R b/R/data_repository.R index 160e003bc..36bc5e727 100644 --- 
a/R/data_repository.R +++ b/R/data_repository.R @@ -243,6 +243,12 @@ meta.logical <- function(x) { return(x) } +##' @export +meta.complex <- function(x) { + attr(x, "meta") <- " class: complex" + return(x) +} + ##' Check if object is a data repository ##' @param object the object to check ##' @return TRUE is a data \code{git_repository}, else FALSE diff --git a/tests/data_repository.R b/tests/data_repository.R index 7ae1c818f..8a443b226 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -167,7 +167,8 @@ stopifnot(all.equal( )) y <- x -y$logic <- sample(c(TRUE, FALSE, NA), replace = TRUE, size = nrow(x)) +y$logic <- sample(c(TRUE, FALSE, NA), replace = TRUE, size = nrow(y)) +y$complex <- complex(real = rnorm(nrow(y)), imaginary = rnorm(nrow(y))) write_delim_git(y, "logical", data_repo, sorting = c("y", "logic")) z <- read_delim_git("logical", data_repo) y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] From b55089807c21a067d2f926494a7baa17d7dd170d Mon Sep 17 00:00:00 2001 From: ThierryO Date: Thu, 26 Jul 2018 16:25:01 +0200 Subject: [PATCH 27/52] data repositories handle POSIXct timestamps Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 13 +++++++++++++ tests/data_repository.R | 5 +++++ 3 files changed, 19 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 77113cd9c..a1d9bc28c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ S3method(merge,character) S3method(merge,data_repository) S3method(merge,git_branch) S3method(merge,git_repository) +S3method(meta,POSIXct) S3method(meta,character) S3method(meta,complex) S3method(meta,factor) diff --git a/R/data_repository.R b/R/data_repository.R index 36bc5e727..f621aa87a 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -181,6 +181,12 @@ read_delim_git <- function(file, repo = ".") { raw_data[[id]] <- as.logical(raw_data[[id]]) } + + col_posix <- which(col_classes == "POSIXct") + for (id in col_posix) { + raw_data[[id]] <- 
as.POSIXct(raw_data[[id]], origin = "1970-01-01") + } + return(raw_data) } @@ -249,6 +255,13 @@ meta.complex <- function(x) { return(x) } +##' @export +meta.POSIXct <- function(x) { + z <- unclass(x) + attr(z, "meta") <- " class: POSIXct\n origin: 1970-01-01\n" + return(z) +} + ##' Check if object is a data repository ##' @param object the object to check ##' @return TRUE is a data \code{git_repository}, else FALSE diff --git a/tests/data_repository.R b/tests/data_repository.R index 8a443b226..0812b8702 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -169,6 +169,11 @@ stopifnot(all.equal( y <- x y$logic <- sample(c(TRUE, FALSE, NA), replace = TRUE, size = nrow(y)) y$complex <- complex(real = rnorm(nrow(y)), imaginary = rnorm(nrow(y))) +y$timestamp <- seq( + as.POSIXct("1900-01-01"), + as.POSIXct("2050-01-01"), + length = 26 +) write_delim_git(y, "logical", data_repo, sorting = c("y", "logic")) z <- read_delim_git("logical", data_repo) y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] From 98514f10134b9af32f86d8fb3b49fffdd43a71cb Mon Sep 17 00:00:00 2001 From: ThierryO Date: Thu, 26 Jul 2018 17:03:21 +0200 Subject: [PATCH 28/52] add draft version of vignette Signed-off-by: Thierry Onkelinx --- .gitignore | 3 +- DESCRIPTION | 3 + vignettes/data-repository.Rmd | 101 ++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 vignettes/data-repository.Rmd diff --git a/.gitignore b/.gitignore index b61efef40..75e54807a 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ openssl.zip /src/zlib/ local320.zip .Rproj.user -windows \ No newline at end of file +windows +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 63010414b..0bb275571 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,6 +17,8 @@ Imports: Depends: R (>= 3.1) Suggests: + knitr, + rmarkdown, getPass Type: Package LazyData: true @@ -67,3 +69,4 @@ Collate: 'tree.R' 'when.R' RoxygenNote: 6.0.1 +VignetteBuilder: knitr diff 
--git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd new file mode 100644 index 000000000..7cdc2a39e --- /dev/null +++ b/vignettes/data-repository.Rmd @@ -0,0 +1,101 @@ +--- +title: "Data repository" +author: "Thierry Onkelinx" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Data repository} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +library(knitr) +opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Rationale + +In some cases we want to place data under version control. Data refers here to the information which is stored in a rectangular format. The columns define variables, while the rows represent observations. There is no information in neither the order of the columns nor the order of the rows. We assume that the data is available a an R `data.frame`. + +Although git can store binary files, its real power and efficiency is with plain text files. R has several functions to read and write `data.frames` as plain text files. Main downside is the loss of meta data, especially in case of `factors`. A `factor` will be written as a `character` and hence lose all information on the levels. This can be important when the levels are a) not sorted alphabethical, b) not all levels have observations or c) the `factor` is ordered. Besides that, storing a `factor` as `character` is not efficient. + +Git stores the version history under the form of diffs: a list of lines which are deleted and a list of lines which are inserted at a specific line number in a file. We have to keep this in mind when storing data as plain text files in Git. The table below indicate what impact changing the information content has on the diff in Git. Note that changing observations requires a small diff, while changing variables results in a massive diff. 
+ +```{r echo = FALSE} +cases <- rbind( + c("remove 1 observation", "remove 1 line"), + c("add 1 observation", "add 1 line"), + c("update 1 observation", "remove 1 line and add 1 line"), + c("remove 1 variable", "remove all lines and add all lines"), + c("add 1 variable", "remove all lines and add all lines") +) +colnames(cases) <- c("Change in data", "Git diff") +kable(cases, caption = "Changes in the information content of the data") +``` + +In the next table we show the effect on the diff when making changes in the data which don't change in the information content in the data. + +```{r echo = FALSE} +cases <- rbind( + c("move 1 observation", "remove 1 line and add 1 line"), + c("move 1 variable", "remove all lines and add all lines") +) +colnames(cases) <- c("Change in data", "Git diff") +kable(cases, caption = "Changes in the data without changing the information content") +``` + +So in order to use Git a a performant and efficient version control system for data, we need to make sure to store the meta data and keep the diffs as small as possible. + +# Tools in `git2r` + +## Data repository + +A 'data repository' refers to a sub directory within the root of an existing git repository. Create a `data_repository` object with `init()` or `repository()` and specify the sub directory through the `project` argument. Most functions of the package handle a `data_repository` object in the same way as they handle a `git_repository` object. The main difference is that the working directory points to the sub directory of the root, rather that the root of the git repository. You can create multiple data repositories within the same git repository. + +```{r} +# Create a directory in tempdir +path <- tempfile(pattern = "git2r-") +dir.create(path) + +library(git2r) +data_repo <- init(path, project = "test") +``` + +## Storing data + +Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the meta data. 
The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The meta data is stored in YAML format with ".yml" format. Therefore any extension given to the `file` will be stripped (with a warning). + +The function will do a lot more preprocessing to the data in order to keep the file and the diff as small as possible. That is the reason why the data is stored headerless and unquoted. + +- `logical` is written as 0 (FALSE), 1 (TRUE) or NA to the data. The class is stored in the meta data. +- `integer` is written as is to the data. The class is stored in the meta data. +- `numeric` is written as is to the data. The class is stored in the meta data. +- `complex` is written as is to the data. The class is stored in the meta data. +- `character` is written as is and unquoted to the data. The class is stored in the meta data. +- `factor` is stored as its indices in the data. The labels of levels are stored in the meta data. +- `POSIXct` is written as an integer to the data. The class and the origin are stored in the meta data. 
+ +```{r} +# Create dummy data +x <- data.frame( + x = LETTERS, + y = factor( + sample(c("a", "b", NA), 26, replace = TRUE), + levels = c("a", "b", "c") + ), + z = c(NA, 1:25), + abc = c(rnorm(25), NA), + stringsAsFactors = FALSE +) +str(x) +``` + +```{r} +write_delim_git(x = x, file = "my_data", repo = data_repo, sorting = c("y", "x")) +status(data_repo) +``` + From a60c24814837503b2ebeb9a91af3e2b606de0273 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Fri, 27 Jul 2018 10:03:22 +0200 Subject: [PATCH 29/52] write_delim_git() yields a warning in case of duplicate observations within the sorting variables Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 5 +++++ tests/data_repository.R | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/R/data_repository.R b/R/data_repository.R index f621aa87a..7b92ad447 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -102,6 +102,11 @@ write_delim_git <- function( # order the variables raw_data <- raw_data[gsub("(\\S*?):.*", "\\1", meta_data)] # order the observations + if (anyDuplicated(raw_data[sorting])) { + warning( +"sorting results in ties. Add extra sorting variables to ensure small diffs." + ) + } raw_data <- raw_data[do.call(order, raw_data[sorting]), ] write.table( x = raw_data, file = file["raw_file"], append = FALSE, diff --git a/tests/data_repository.R b/tests/data_repository.R index 0812b8702..8682f3897 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -85,6 +85,12 @@ stopifnot(all.equal( x, read_delim_git("test", data_repo) )) +stopifnot(all.equal( + tools::assertWarning( + write_delim_git(x, "test", data_repo, sorting = "y", override = TRUE) + )[[1]]$message, +"sorting results in ties. Add extra sorting variables to ensure small diffs." 
+)) write_delim_git(x, "test", data_repo, sorting = c("y", "x"), override = TRUE) x_sorted <- x[do.call(order, x[c("y", "x")]), c("y", "x", "z", "abc")] rownames(x_sorted) <- NULL From c2209991df1d47ed2f45a4dbb91fbc2ee8de1eaf Mon Sep 17 00:00:00 2001 From: ThierryO Date: Fri, 27 Jul 2018 11:18:28 +0200 Subject: [PATCH 30/52] don't return the call with data_repository errors Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index 7b92ad447..5b0a504f4 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -120,15 +120,24 @@ write_delim_git <- function( compare_meta <- function(meta_data, old_meta_data) { if (length(old_meta_data) != length(meta_data)) { - stop("old data has different number of variables, use override = TRUE") + stop( + call. = FALSE, + "old data has different number of variables, use override = TRUE" + ) } old_col_names <- gsub("(\\S*?):.*", "\\1", old_meta_data) col_names <- gsub("(\\S*?):.*", "\\1", meta_data) if (!all(sort(col_names) == sort(old_col_names))) { - stop("old data has different variables, use override = TRUE") + stop( + call. = FALSE, + "old data has different variables, use override = TRUE" + ) } if (!all(sort(meta_data) == sort(old_meta_data))) { -stop("old data has different variable types or sorting, use override = TRUE") + stop( + call. = FALSE, + "old data has different variable types or sorting, use override = TRUE" + ) } return(old_meta_data) } @@ -216,6 +225,7 @@ meta.character <- function(x) { attr(x, "meta") <- " class: character" if (any(is.na(x))) { stop( + call. = FALSE, "The string 'NA' cannot be stored because it would be indistinguishable from the missing value NA. Please replace or remove any 'NA' strings. Consider using a factor." 
From 205d3f3a370411dbda9ed4b5786172875d2ef3ad Mon Sep 17 00:00:00 2001 From: ThierryO Date: Fri, 27 Jul 2018 11:43:50 +0200 Subject: [PATCH 31/52] More work on the vignette Signed-off-by: Thierry Onkelinx --- vignettes/data-repository.Rmd | 109 ++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 5 deletions(-) diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index 7cdc2a39e..f71e3d533 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -17,11 +17,11 @@ opts_chunk$set( ) ``` -# Rationale +## Rationale In some cases we want to place data under version control. Data refers here to the information which is stored in a rectangular format. The columns define variables, while the rows represent observations. There is no information in neither the order of the columns nor the order of the rows. We assume that the data is available a an R `data.frame`. -Although git can store binary files, its real power and efficiency is with plain text files. R has several functions to read and write `data.frames` as plain text files. Main downside is the loss of meta data, especially in case of `factors`. A `factor` will be written as a `character` and hence lose all information on the levels. This can be important when the levels are a) not sorted alphabethical, b) not all levels have observations or c) the `factor` is ordered. Besides that, storing a `factor` as `character` is not efficient. +Although git can store binary files, its real power and efficiency is with plain text files. R has several functions to read and write `data.frames` as plain text files. Main downside is the loss of meta data, especially in case of `factors`. A `factor` will be written as a `character` and hence lose all information on the levels. This can be important when the levels are a) not sorted alphabetical, b) not all levels have observations or c) the `factor` is ordered. 
Besides that, storing a `factor` as `character` is not efficient. Git stores the version history under the form of diffs: a list of lines which are deleted and a list of lines which are inserted at a specific line number in a file. We have to keep this in mind when storing data as plain text files in Git. The table below indicate what impact changing the information content has on the diff in Git. Note that changing observations requires a small diff, while changing variables results in a massive diff. @@ -50,9 +50,9 @@ kable(cases, caption = "Changes in the data without changing the information con So in order to use Git a a performant and efficient version control system for data, we need to make sure to store the meta data and keep the diffs as small as possible. -# Tools in `git2r` +## Tools in `git2r` -## Data repository +### Data repository A 'data repository' refers to a sub directory within the root of an existing git repository. Create a `data_repository` object with `init()` or `repository()` and specify the sub directory through the `project` argument. Most functions of the package handle a `data_repository` object in the same way as they handle a `git_repository` object. The main difference is that the working directory points to the sub directory of the root, rather that the root of the git repository. You can create multiple data repositories within the same git repository. @@ -62,10 +62,18 @@ path <- tempfile(pattern = "git2r-") dir.create(path) library(git2r) +# inititialize a new git repository +repo <- init(path) +# use a data repository in an existing git repository +data_repo <- repository(path, project = "test") +``` + +```{r eval = FALSE} +# inititialize a new git repository and use a data repository within it data_repo <- init(path, project = "test") ``` -## Storing data +### Storing data Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the meta data. 
The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The meta data is stored in YAML format with ".yml" format. Therefore any extension given to the `file` will be stripped (with a warning). @@ -89,13 +97,104 @@ x <- data.frame( ), z = c(NA, 1:25), abc = c(rnorm(25), NA), + xyz = complex(real = rnorm(26), imaginary = rnorm(26)), + def = sample(c(TRUE, FALSE, NA), 26, replace = TRUE), + timestamp = seq( + as.POSIXct("2018-01-01"), + as.POSIXct("2019-01-01"), + length = 26 + ), stringsAsFactors = FALSE ) str(x) ``` +Another important part of the preprocessing is ordering both the variables as the observations. This is determined when the file is created and stored in the meta data. When the file is overwritten, the ordering is based on the existing meta data. Therefore it is important to think carefully about the ordering and make sure that is how you want it to stay **before** you create your first commit. The ordering of the observations is specified through the `sorting` argument. It holds a vector of variable names which are used in the ordering. Make sure that you add enough sorting variables to avoid ties. `sorting` also defines the order of the variables, the order of the other variables is taken from the `data.frame`. + +The data file (".tsv") and the meta data file (".yml") are automatically staged after writing. + ```{r} +status(data_repo) write_delim_git(x = x, file = "my_data", repo = data_repo, sorting = c("y", "x")) status(data_repo) +status(repo) +``` + +Overwriting data which has different meta data than the original data will throw an error because this can potentially lead to large diffs. Differences in meta data occur when: + +- the sorting changes +- variables are added, removed or renamed +- the class of a variable changes +- the levels of a factor changes + +However you can bypass this by using `override = TRUE`. 
This will completely ignore the existing meta data and create new meta data based on the new `data.frame`. + +```{r error=TRUE} +y <- x +y$extra <- x$x +write_delim_git(y, file = "my_data", repo = data_repo, sorting = c("y", "x")) +write_delim_git( + y, file = "my_data", repo = data_repo, sorting = c("y", "x"), + override = TRUE +) +``` + +### Reading data + +Retrieving data is straight forward. Use `read_delim_git` and provide the `file` and the `repo`. The retrieved data is identical to the original data after applying the ordering of variables and observations. + +```{r} +y_stored <- read_delim_git(file = "my_data", repo = data_repo) +str(y_stored) +y_sorted <- y[order(y$y, y$x), colnames(y_stored)] +rownames(y_sorted) <- NULL +stopifnot(all.equal(y_sorted, y_stored)) +``` + +### Committing changes + +Commit the changes using `commit()` + +```{r} +write_delim_git(x, file = "sub/test", repo = data_repo, sorting = "x") +status(data_repo) +commit(data_repo, message = "Initial commit") +status(data_repo) +``` + +### Removing data + +Data objects can be removed with `rm_file`. Provide the data repository to `repo` and the data object name to `path`. `path` can take multiple data objects. + +```{r} +# remove a single data object +rm_file(repo = data_repo, path = "my_data") +status(data_repo) +# undo the remove by resetting to the last commit +reset(commits(data_repo)[[1]], "hard") +status(data_repo) +# remove multiple data objects +rm_file(repo = data_repo, path = c("my_data", "sub/test")) +status(data_repo) +``` + +Note that this will remove both the data and the meta data. In case you want to update the current data, it is better to overwrite the data than first delete it and then write it. Sometime you want to write a bunch of data object to the data repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. + +1. 
use `path = ".tsv"` in `rm_file()` to remove **all** ".tsv" in the data repository +1. use `write_delim_git()` to write all data objects. Since the ".yml" are still present, any existing meta data will be used. +1. use `path = ".yml"` in `rm_file()` to remove **any** ".yml" which have no associated ".tsv" +1. `commit()` the changes + +```{r} +# undo the remove by resetting to the last commit +reset(commits(data_repo)[[1]], "hard") +status(data_repo) +rm_file(data_repo, ".tsv") +status(data_repo) +write_delim_git(x, file = "sub/test", repo = data_repo, sorting = "x") +status(data_repo) +rm_file(data_repo, ".yml") +status(data_repo) +commit(data_repo, "Automated update") ``` From 6da37229020a56b43553486bb6b1b2ba4c66e373 Mon Sep 17 00:00:00 2001 From: ThierryO Date: Fri, 27 Jul 2018 12:10:37 +0200 Subject: [PATCH 32/52] write_delim_git() returns the hashes of the files Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 9 +++++++-- man/write_delim_git.Rd | 3 ++- vignettes/data-repository.Rmd | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index 5b0a504f4..f8e5a0b92 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -29,7 +29,8 @@ ##' variables are added or variables are deleted. Setting this to TRUE can ##' potentially lead to large diffs. Defaults to FALSE. ##' @inheritParams add -##' @return NULL (invisible) +##' @return a named vector with the hashes of the files. The names contains the +##' files with their paths relative to the root of the git_repository. 
##' @export ##' @importFrom utils tail write.table write_delim_git <- function( @@ -115,7 +116,11 @@ write_delim_git <- function( ) add(repo, path = file, force = force) - return(invisible(NULL)) + hashes <- hashfile(file) + names(hashes) <- gsub(paste0("^", workdir(repo), "/"), "", file) + names(hashes) <- file.path(repo$project, names(hashes)) + + return(hashes) } compare_meta <- function(meta_data, old_meta_data) { diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 2d983d617..15b6ce52b 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -28,7 +28,8 @@ potentially lead to large diffs. Defaults to FALSE.} \item{force}{Add ignored files. Default is FALSE.} } \value{ -NULL (invisible) +a named vector with the hashes of the files. The names contains the +files with their paths relative to the root of the git_repository. } \description{ This will create two files. The \code{".tsv"} file contains the raw data. diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index f71e3d533..b26fb3367 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -111,7 +111,7 @@ str(x) Another important part of the preprocessing is ordering both the variables as the observations. This is determined when the file is created and stored in the meta data. When the file is overwritten, the ordering is based on the existing meta data. Therefore it is important to think carefully about the ordering and make sure that is how you want it to stay **before** you create your first commit. The ordering of the observations is specified through the `sorting` argument. It holds a vector of variable names which are used in the ordering. Make sure that you add enough sorting variables to avoid ties. `sorting` also defines the order of the variables, the order of the other variables is taken from the `data.frame`. -The data file (".tsv") and the meta data file (".yml") are automatically staged after writing. 
+The data file (".tsv") and the meta data file (".yml") are automatically staged after writing. `write_delim_git()` returns the hashes of both the data file and the meta data file. The names of the vector contain the path of the files, relative to the root of the _git_ repository. ```{r} status(data_repo) From cbf21f126f75b603ebd205dd76d540cc09923f3a Mon Sep 17 00:00:00 2001 From: ThierryO Date: Fri, 27 Jul 2018 15:52:45 +0200 Subject: [PATCH 33/52] data repositories handles the Date class Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 12 ++++++++++++ tests/data_repository.R | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index a1d9bc28c..98a433f16 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,7 @@ S3method(merge,character) S3method(merge,data_repository) S3method(merge,git_branch) S3method(merge,git_repository) +S3method(meta,Date) S3method(meta,POSIXct) S3method(meta,character) S3method(meta,complex) diff --git a/R/data_repository.R b/R/data_repository.R index f8e5a0b92..e5c71176d 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -206,6 +206,11 @@ read_delim_git <- function(file, repo = ".") { raw_data[[id]] <- as.POSIXct(raw_data[[id]], origin = "1970-01-01") } + col_date <- which(col_classes == "Date") + for (id in col_date) { + raw_data[[id]] <- as.Date(raw_data[[id]], origin = "1970-01-01") + } + return(raw_data) } @@ -282,6 +287,13 @@ meta.POSIXct <- function(x) { return(z) } +##' @export +meta.Date <- function(x) { + z <- unclass(x) + attr(z, "meta") <- " class: Date\n origin: 1970-01-01\n" + return(z) +} + ##' Check if object is a data repository ##' @param object the object to check ##' @return TRUE is a data \code{git_repository}, else FALSE diff --git a/tests/data_repository.R b/tests/data_repository.R index 8682f3897..ac35a0697 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -180,6 +180,11 @@ y$timestamp <- seq( as.POSIXct("2050-01-01"), length = 26 ) 
+y$date <- seq( + as.Date("1970-01-01"), + as.Date("1970-01-26"), + length = 26 +) write_delim_git(y, "logical", data_repo, sorting = c("y", "logic")) z <- read_delim_git("logical", data_repo) y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] From 6a5341d0d3775b0ed9ce97e773700bc2dca0d327 Mon Sep 17 00:00:00 2001 From: florisvdh Date: Tue, 14 Aug 2018 11:50:18 +0200 Subject: [PATCH 34/52] Vignette & rm_file documentation: fix typos & language Signed-off-by: Floris Vanderhaeghe --- R/index.R | 2 +- vignettes/data-repository.Rmd | 42 +++++++++++++++++------------------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/R/index.R b/R/index.R index c1f1193fa..615db7d96 100644 --- a/R/index.R +++ b/R/index.R @@ -147,7 +147,7 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' ##' \code{path = ".tsv"} is useful when updating a \code{data_repository} with a ##' variable number of files. First use \code{rm_file(repo, path = ".tsv")} to -##' remove all \code{.tsv} files. Then use \code{write_delim_git()} the store +##' remove all \code{.tsv} files. Then use \code{write_delim_git()} to store ##' all the data.frames. End by using \code{rm_file(repo, path = ".yml")}, which ##' will clean any left-over \code{.yml} files. As a result, any data.frame ##' which wasn't rewritten will be deleted. diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index b26fb3367..1bc4aba38 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -19,11 +19,11 @@ opts_chunk$set( ## Rationale -In some cases we want to place data under version control. Data refers here to the information which is stored in a rectangular format. The columns define variables, while the rows represent observations. There is no information in neither the order of the columns nor the order of the rows. We assume that the data is available a an R `data.frame`. +In some cases we want to place data under version control. 
Data refers here to the information which is stored in a rectangular format. The columns define variables, while the rows represent observations. There is no information in neither the order of the columns nor the order of the rows. We assume that the data is available as an R `data.frame`. -Although git can store binary files, its real power and efficiency is with plain text files. R has several functions to read and write `data.frames` as plain text files. Main downside is the loss of meta data, especially in case of `factors`. A `factor` will be written as a `character` and hence lose all information on the levels. This can be important when the levels are a) not sorted alphabetical, b) not all levels have observations or c) the `factor` is ordered. Besides that, storing a `factor` as `character` is not efficient. +Although git can store binary files, its real power and efficiency is with plain text files. R has several functions to read and write `data.frames` as plain text files. Main downside is the loss of metadata, especially in case of `factors`. A `factor` will be written as a `character` and hence lose all information on the levels. This can be important when the levels are a) not sorted alphabetically, b) not all levels have observations or c) the `factor` is ordered. Besides that, storing a `factor` as a `character` is not efficient. -Git stores the version history under the form of diffs: a list of lines which are deleted and a list of lines which are inserted at a specific line number in a file. We have to keep this in mind when storing data as plain text files in Git. The table below indicate what impact changing the information content has on the diff in Git. Note that changing observations requires a small diff, while changing variables results in a massive diff. +Git stores the version history under the form of diffs: a list of lines which are deleted and a list of lines which are inserted at a specific line number in a file. 
We have to keep this in mind when storing data as plain text files in Git. The table below indicates the impact of changing the information content on the diff in Git. Note that changing observations requires a small diff, while changing variables results in a massive diff. ```{r echo = FALSE} cases <- rbind( @@ -37,7 +37,7 @@ colnames(cases) <- c("Change in data", "Git diff") kable(cases, caption = "Changes in the information content of the data") ``` -In the next table we show the effect on the diff when making changes in the data which don't change in the information content in the data. +In the next table we show the effect on the diff when making changes in the data which don't change the information content of the data. ```{r echo = FALSE} cases <- rbind( @@ -48,13 +48,13 @@ colnames(cases) <- c("Change in data", "Git diff") kable(cases, caption = "Changes in the data without changing the information content") ``` -So in order to use Git a a performant and efficient version control system for data, we need to make sure to store the meta data and keep the diffs as small as possible. +So in order to use Git as a performant and efficient version control system for data, we need to make sure to store the metadata and keep the diffs as small as possible. ## Tools in `git2r` ### Data repository -A 'data repository' refers to a sub directory within the root of an existing git repository. Create a `data_repository` object with `init()` or `repository()` and specify the sub directory through the `project` argument. Most functions of the package handle a `data_repository` object in the same way as they handle a `git_repository` object. The main difference is that the working directory points to the sub directory of the root, rather that the root of the git repository. You can create multiple data repositories within the same git repository. +A 'data repository' refers to a subdirectory within the root of an existing git repository. 
Create a `data_repository` object with `init()` or `repository()` and specify the subdirectory through the `project` argument. Most functions of the package handle a `data_repository` object in the same way as they handle a `git_repository` object. The main difference is that the working directory points to the subdirectory of the root, rather than the root of the git repository. You can create multiple data repositories within the same git repository. ```{r} # Create a directory in tempdir @@ -75,17 +75,17 @@ data_repo <- init(path, project = "test") ### Storing data -Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the meta data. The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The meta data is stored in YAML format with ".yml" format. Therefore any extension given to the `file` will be stripped (with a warning). +Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the metadata. The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The metadata is stored in YAML format with ".yml" format. Therefore any extension given to the `file` will be stripped (with a warning). The function will do a lot more preprocessing to the data in order to keep the file and the diff as small as possible. That is the reason why the data is stored headerless and unquoted. -- `logical` is written as 0 (FALSE), 1 (TRUE) or NA to the data. The class is stored in the meta data. -- `integer` is written as is to the data. The class is stored in the meta data. -- `numeric` is written as is to the data. The class is stored in the meta data. -- `complex` is written as is to the data. The class is stored in the meta data. -- `character` is written as is and unquoted to the data. The class is stored in the meta data. -- `factor` is stored as its indices in the data. 
The labels of levels are stored in the meta data. -- `POSIXct` is written as an integer to the data. The class and the origin are stored in the meta data. +- `logical` is written as 0 (FALSE), 1 (TRUE) or NA to the data. The class is stored in the metadata. +- `integer` is written as is to the data. The class is stored in the metadata. +- `numeric` is written as is to the data. The class is stored in the metadata. +- `complex` is written as is to the data. The class is stored in the metadata. +- `character` is written as is and unquoted to the data. The class is stored in the metadata. +- `factor` is stored as its indices in the data. The labels of levels are stored in the metadata. +- `POSIXct` is written as an integer to the data. The class and the origin are stored in the metadata. ```{r} # Create dummy data @@ -109,9 +109,9 @@ x <- data.frame( str(x) ``` -Another important part of the preprocessing is ordering both the variables as the observations. This is determined when the file is created and stored in the meta data. When the file is overwritten, the ordering is based on the existing meta data. Therefore it is important to think carefully about the ordering and make sure that is how you want it to stay **before** you create your first commit. The ordering of the observations is specified through the `sorting` argument. It holds a vector of variable names which are used in the ordering. Make sure that you add enough sorting variables to avoid ties. `sorting` also defines the order of the variables, the order of the other variables is taken from the `data.frame`. +Another important part of the preprocessing is ordering both the variables and the observations. This is determined when the file is created and stored in the metadata. When the file is overwritten, the ordering is based on the existing metadata. Therefore it is important to think carefully about the ordering and make sure that it is how you want it to stay **before** you create your first commit. 
The ordering of the observations is specified through the `sorting` argument. It holds a vector of variable names which are used in the ordering. Make sure that you add enough sorting variables to avoid ties. `sorting` also defines the order of the variables. The order of the other variables is taken from the `data.frame`. -The data file (".tsv") and the meta data file (".yml") are automatically staged after writing. `write_delim_git()` returns the hashes of both the data file and the meta data file. The names of the vector contain the path of the files, relative to the root of the _git_ repository. +The data file (".tsv") and the metadata file (".yml") are automatically staged after writing. `write_delim_git()` returns the hashes of both the data file and the metadata file. The names of the vector contain the path of the files, relative to the root of the _git_ repository. ```{r} status(data_repo) @@ -120,14 +120,14 @@ status(data_repo) status(repo) ``` -Overwriting data which has different meta data than the original data will throw an error because this can potentially lead to large diffs. Differences in meta data occur when: +Overwriting data which has different metadata than the original data will throw an error because this can potentially lead to large diffs. Differences in metadata occur when: - the sorting changes - variables are added, removed or renamed - the class of a variable changes - the levels of a factor changes -However you can bypass this by using `override = TRUE`. This will completely ignore the existing meta data and create new meta data based on the new `data.frame`. +However you can bypass this by using `override = TRUE`. This will completely ignore the existing metadata and create new metadata based on the new `data.frame`. ```{r error=TRUE} y <- x @@ -164,7 +164,7 @@ status(data_repo) ### Removing data -Data objects can be removed with `rm_file`. Provide the data repository to `repo` and the data object name to `path`. 
`path` can take multiple data objects. +Data objects can be removed with `rm_file`. Provide the data repository to `repo` and the data object name to `path`.`path` can take multiple data objects. ```{r} # remove a single data object @@ -178,10 +178,10 @@ rm_file(repo = data_repo, path = c("my_data", "sub/test")) status(data_repo) ``` -Note that this will remove both the data and the meta data. In case you want to update the current data, it is better to overwrite the data than first delete it and then write it. Sometime you want to write a bunch of data object to the data repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. +Note that this will remove both the data and the metadata. In case you want to update the current data, it is better to overwrite the data than first delete it and then write it. Sometime you want to write a bunch of data object to the data repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. 1. use `path = ".tsv"` in `rm_file()` to remove **all** ".tsv" in the data repository -1. use `write_delim_git()` to write all data objects. Since the ".yml" are still present, any existing meta data will be used. +1. use `write_delim_git()` to write all data objects. Since the ".yml" are still present, any existing metadata will be used. 1. use `path = ".yml"` in `rm_file()` to remove **any** ".yml" which have no associated ".tsv" 1. 
`commit()` the changes From d20537f219a3d22448d5c298a466e0db98e326fa Mon Sep 17 00:00:00 2001 From: florisvdh Date: Tue, 14 Aug 2018 11:56:18 +0200 Subject: [PATCH 35/52] Vignette: copy over explanation from the rm_file documentation Signed-off-by: Thierry Onkelinx --- vignettes/data-repository.Rmd | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index 1bc4aba38..5018d985f 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -178,7 +178,15 @@ rm_file(repo = data_repo, path = c("my_data", "sub/test")) status(data_repo) ``` -Note that this will remove both the data and the metadata. In case you want to update the current data, it is better to overwrite the data than first delete it and then write it. Sometime you want to write a bunch of data object to the data repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. +Note that this will remove both the data and the metadata. + +In case of a data repository, there are three options for `path`: + +- a vector of file names as used in `write_delim_git}`. This will remove all associated `.tsv` and `.yml` files. +- `".tsv"` will remove ALL `.tsv` files. +- `".yml"` will remove all `.yml` files which have no associated `.tsv` file. + +In case you want to update the current data, it is better to overwrite the data than first delete it and then write it. Sometime you want to write a bunch of data object to the data repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. 1. use `path = ".tsv"` in `rm_file()` to remove **all** ".tsv" in the data repository 1. use `write_delim_git()` to write all data objects. Since the ".yml" are still present, any existing metadata will be used. 
From 04b8ca08b5fa40d19d3fe7ee8ab4e90cc59f0691 Mon Sep 17 00:00:00 2001 From: florisvdh Date: Tue, 14 Aug 2018 14:00:36 +0200 Subject: [PATCH 36/52] Vignette: replace 'format' by 'extension' Signed-off-by: Thierry Onkelinx --- vignettes/data-repository.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index 5018d985f..4c3b01ba0 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -75,7 +75,7 @@ data_repo <- init(path, project = "test") ### Storing data -Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the metadata. The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The metadata is stored in YAML format with ".yml" format. Therefore any extension given to the `file` will be stripped (with a warning). +Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the metadata. The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The metadata is stored in YAML format with ".yml" extension. Therefore any extension given to the `file` will be stripped (with a warning). The function will do a lot more preprocessing to the data in order to keep the file and the diff as small as possible. That is the reason why the data is stored headerless and unquoted. 
From 1ae9f9cf9294e02cd230f4cd7e74ff2ecb5f62af Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Mon, 20 Aug 2018 19:17:24 +0200 Subject: [PATCH 37/52] write_delim_git() and read_delim_git() work with "git_repository" instead of "data_repository" Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 9 ++------- tests/data_repository.R | 15 ++------------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index e5c71176d..a6a9a8f5e 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -39,9 +39,7 @@ write_delim_git <- function( if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") } - if (!is_data_repo(repo)) { - stop("repo is not a 'data_repository'") - } + repo <- lookup_repository(repo) if (!missing(sorting)) { if (length(sorting) == 0) { stop("at least one variable is required for sorting") @@ -118,7 +116,6 @@ write_delim_git <- function( hashes <- hashfile(file) names(hashes) <- gsub(paste0("^", workdir(repo), "/"), "", file) - names(hashes) <- file.path(repo$project, names(hashes)) return(hashes) } @@ -155,9 +152,7 @@ compare_meta <- function(meta_data, old_meta_data) { ##' @export ##' @importFrom utils read.table read_delim_git <- function(file, repo = ".") { - if (!is_data_repo(repo)) { - stop("repo is not a 'data_repository'") - } + repo <- lookup_repository(repo) file <- file.path(workdir(repo), file) file <- clean_data_path(file) diff --git a/tests/data_repository.R b/tests/data_repository.R index ac35a0697..439268ab5 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -23,11 +23,10 @@ sessionInfo() path <- tempfile(pattern = "git2r-") dir.create(path) -## Initialize a data repository -data_repo <- init(path, project = "test") +## Initialize a repository +data_repo <- init(path) stopifnot(inherits(data_repo, "git_repository")) stopifnot(all.equal(data_repo$path, file.path(path, ".git"))) -stopifnot(all.equal(data_repo$project, "test")) config(data_repo, 
user.name = "Alice", user.email = "alice@example.org") stopifnot(all.equal( @@ -120,10 +119,6 @@ stopifnot(all.equal( tools::assertError(read_delim_git("", data_repo))[[1]][["message"]], "raw file and/or meta file missing" )) -stopifnot(all.equal( - tools::assertError(read_delim_git("test", "."))[[1]][["message"]], - "repo is not a 'data_repository'" -)) write_delim_git(x, "junk/test", data_repo) commit(data_repo, "test") @@ -153,12 +148,6 @@ stopifnot( ) ) -stopifnot(all.equal( - tools::assertError( - write_delim_git(x, "test", repository(path)) - )[[1]][["message"]], - "repo is not a 'data_repository'" -)) stopifnot(all.equal( tools::assertError( write_delim_git(x, "test", sorting = character(0), data_repo) From dea6be986390bb5933cb8dd98a9e18b0e1ac5c2c Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Mon, 20 Aug 2018 19:50:35 +0200 Subject: [PATCH 38/52] remove the data_repository class Signed-off-by: Thierry Onkelinx --- NAMESPACE | 5 ----- R/index.R | 55 ++--------------------------------------------- R/merge.R | 6 ------ R/plot.R | 8 ------- R/repository.R | 55 ++++++----------------------------------------- R/reset.R | 4 ---- R/status.R | 15 +------------ man/clone.Rd | 3 --- man/init.Rd | 5 +---- man/repository.Rd | 16 ++------------ man/rm_file.Rd | 20 +---------------- 11 files changed, 14 insertions(+), 178 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 98a433f16..d0152abeb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ S3method("[",git_tree) S3method(as.POSIXct,git_time) S3method(as.character,git_time) -S3method(as.data.frame,data_repository) S3method(as.data.frame,git_commit) S3method(as.data.frame,git_repository) S3method(as.data.frame,git_tree) @@ -21,7 +20,6 @@ S3method(length,git_blob) S3method(length,git_diff) S3method(length,git_tree) S3method(merge,character) -S3method(merge,data_repository) S3method(merge,git_branch) S3method(merge,git_repository) S3method(meta,Date) @@ -32,9 +30,7 @@ S3method(meta,factor) 
S3method(meta,integer) S3method(meta,logical) S3method(meta,numeric) -S3method(plot,data_repository) S3method(plot,git_repository) -S3method(print,data_repository) S3method(print,git_blob) S3method(print,git_branch) S3method(print,git_commit) @@ -61,7 +57,6 @@ S3method(sha,git_reference) S3method(sha,git_reflog_entry) S3method(sha,git_tag) S3method(sha,git_tree) -S3method(summary,data_repository) S3method(summary,git_commit) S3method(summary,git_diff) S3method(summary,git_repository) diff --git a/R/index.R b/R/index.R index 615db7d96..3a299d724 100644 --- a/R/index.R +++ b/R/index.R @@ -118,9 +118,6 @@ add <- function(repo = ".", path = NULL, force = FALSE) ## directory. Substitute common prefix with "" sub(paste0("^", repo_wd), "", np) }, character(1)) - if (is_data_repo(repo)) { - path <- file.path(repo$project, path) - } .Call(git2r_index_add_all, repo, path, isTRUE(force)) @@ -132,25 +129,8 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' @template repo-param ##' @param path character vector with filenames to remove. The path ##' must be relative to the repository's working folder. Only -##' files known to Git are removed. Works different in case of -##' \code{data_repository}. See details. +##' files known to Git are removed. ##' @return invisible(NULL) -##' @details -##' In case of a \code{data_repository}, there are three options for \code{path} -##' \enumerate{ -##' \item{a vector of file names as used in \code{\link{write_delim_git}}}. -##' This will remove all associated \code{.tsv} and \code{.yml} files -##' \item{\code{".tsv"}} will remove ALL \code{.tsv} files. -##' \item{\code{".yml"}} will remove all \code{.yml} files which have no -##' associated \code{.tsv} file -##' } -##' -##' \code{path = ".tsv"} is useful when updating a \code{data_repository} with a -##' variable number of files. First use \code{rm_file(repo, path = ".tsv")} to -##' remove all \code{.tsv} files. Then use \code{write_delim_git()} to store -##' all the data.frames. 
End by using \code{rm_file(repo, path = ".yml")}, which -##' will clean any left-over \code{.yml} files. As a result, any data.frame -##' which wasn't rewritten will be deleted. ##' @export ##' @examples ##' \dontrun{ @@ -176,39 +156,11 @@ add <- function(repo = ".", path = NULL, force = FALSE) ##' status(repo) ##' } rm_file <- function(repo = ".", path = NULL) { - if (is.null(path) || !is.character(path)) { + if (is.null(path) || !is.character(path)) stop("'path' must be a character vector") - } repo <- lookup_repository(repo) - if (is_data_repo(repo)) { - if (length(path) == 1 && path %in% c(".tsv", ".yml")) { - if (path == ".tsv") { - path <- list.files( - workdir(repo), - pattern = "\\.tsv$", - recursive = TRUE - ) - } else { - yml <- list.files( - workdir(repo), - pattern = "\\.yml$", - recursive = TRUE - ) - tsv <- list.files( - workdir(repo), - pattern = "\\.tsv$", - recursive = TRUE - ) - both <- gsub("\\.yml$", "", yml) %in% gsub("\\.tsv$", "", tsv) - path <- yml[!both] - } - } else { - path <- clean_data_path(path) - } - } - if (length(path)) { wd <- workdir(repo) @@ -246,9 +198,6 @@ rm_file <- function(repo = ".", path = NULL) { .Call(git2r_index_remove_bypath, repo, x) }) } - if (is_data_repo(repo)) { - add(repo = repo, path = path) - } invisible(NULL) } diff --git a/R/merge.R b/R/merge.R index c43a02c9f..1d7975ef0 100644 --- a/R/merge.R +++ b/R/merge.R @@ -96,12 +96,6 @@ merge.git_repository <- function(x, y = NULL, commit_on_success = TRUE, merger = merger, fail = fail) } ##' @export -merge.data_repository <- function(x, y = NULL, commit_on_success = TRUE, - merger = NULL, fail = FALSE, ...) -{ - NextMethod() -} -##' @export ##' @rdname merge merge.character <- function(x = ".", y = NULL, commit_on_success = TRUE, merger = NULL, fail = FALSE, ...) 
diff --git a/R/plot.R b/R/plot.R index 312ac4a43..e9c207aec 100644 --- a/R/plot.R +++ b/R/plot.R @@ -65,11 +65,3 @@ plot.git_repository <- function(x, mp <- barplot(df$n, xlab = xlab, ylab = ylab, main = main, ...) axis(1, at = mp, labels = seq(min(df$when), max(df$when), breaks)) } -##' @export -plot.data_repository <- function(x, - breaks = c("month", "year", "quarter", "week", "day"), - main = NULL, - ...) -{ - NextMethod() -} diff --git a/R/repository.R b/R/repository.R index f5f2269d7..ef5e73929 100644 --- a/R/repository.R +++ b/R/repository.R @@ -79,32 +79,17 @@ as.data.frame.git_repository <- function(x, ...) { do.call("rbind", lapply(commits(x), as.data.frame)) } -##' @export -as.data.frame.data_repository <- function(x, ...) { - NextMethod() -} ##' Open a repository ##' ##' @param path A path to an existing local git repository. ##' @param discover Discover repository from path. Default is TRUE. -##' @param project The name of of project. Refers to a local path in case of a -##' data repository. Defaults to \code{NULL}, indicating a standard repository. -##' @return Either a \code{git_repository} object with entries: +##' @return A \code{git_repository} object with entries: ##' \describe{ ##' \item{path}{ ##' Path to a git repository ##' } ##' } -##' or a \code{data_repository} object with entries: -##' \describe{ -##' \item{path}{ -##' Path to a git repository -##' } -##' \item{project}{ -##' The local path to the project starting from the root of the repository -##' } -##' } ##' @export ##' @examples ##' \dontrun{ @@ -165,7 +150,7 @@ as.data.frame.data_repository <- function(x, ...) 
{ ##' ## List all tags in repository ##' tags(repo) ##' } -repository <- function(path = ".", discover = TRUE, project = NULL) { +repository <- function(path = ".", discover = TRUE) { if (isTRUE(discover)) { path <- discover_repository(path) if (is.null(path)) @@ -179,21 +164,7 @@ repository <- function(path = ".", discover = TRUE, project = NULL) { if (!isTRUE(.Call(git2r_repository_can_open, path))) stop("Unable to open repository at 'path'") - if (is.null(project)) { - return(structure(list(path = path), class = "git_repository")) - } - - stopifnot(is.character(project)) - stopifnot(length(project) == 1) - - local_path <- file.path(path, project) - if (!dir.exists(local_path)) { - dir.create(local_path, recursive = TRUE) - } - structure( - list(path = path, project = project), - class = c("data_repository", "git_repository") - ) + structure(list(path = path), class = "git_repository") } ##' Init a repository @@ -221,12 +192,12 @@ repository <- function(path = ".", discover = TRUE, project = NULL) { ##' repo_bare <- init(path_bare, bare = TRUE) ##' is_bare(repo_bare) ##' } -init <- function(path = ".", bare = FALSE, project = NULL) { +init <- function(path = ".", bare = FALSE) { path <- normalizePath(path, winslash = "/", mustWork = TRUE) if (!file.info(path)$isdir) stop("'path' is not a directory") .Call(git2r_repository_init, path, bare) - repository(path, project = project) + repository(path) } ##' Clone a remote repository @@ -298,7 +269,7 @@ clone <- function(url = NULL, { .Call(git2r_clone, url, local_path, bare, branch, checkout, credentials, progress) - repository(local_path, project = project) + repository(local_path) } ##' Get HEAD for a repository @@ -617,10 +588,6 @@ print.git_repository <- function(x, ...) { h$summary)) } } -##' @export -print.data_repository <- function(x, ...) { - NextMethod() -} ##' Summary of repository ##' @@ -706,10 +673,6 @@ summary.git_repository <- function(object, ...) 
{ invisible(NULL) } -##' @export -summary.data_repository <- function(object, ...) { - NextMethod() -} ## Strip trailing slash or backslash, unless it's the current drive ## root (/) or a Windows drive, for example, 'c:\'. @@ -739,11 +702,7 @@ strip_trailing_slash <- function(path) { ##' } workdir <- function(repo = ".") { path <- .Call(git2r_repository_workdir, lookup_repository(repo)) - path <- strip_trailing_slash(path) - if (!inherits(repo, "git_repository") || is.null(repo$project)) { - return(path) - } - strip_trailing_slash(file.path(path, repo$project)) + strip_trailing_slash(path) } ##' Find path to repository for any file diff --git a/R/reset.R b/R/reset.R index 9ce605189..d60842312 100644 --- a/R/reset.R +++ b/R/reset.R @@ -93,10 +93,6 @@ reset <- function(object, reset_type = c("soft", "mixed", "hard"), path = NULL) if (is_empty(object)) { .Call(git2r_index_remove_bypath, object, path) } else { - if (is_data_repo(object) && !is.null(path)) { - path <- file.path(object$project, path) - path <- clean_data_path(path) - } .Call(git2r_reset_default, object, path) } } diff --git a/R/status.R b/R/status.R index e18253cd7..2127c509d 100644 --- a/R/status.R +++ b/R/status.R @@ -77,17 +77,9 @@ status <- function(repo = ".", ignored = FALSE, all_untracked = FALSE) { - s <- structure(.Call(git2r_status_list, lookup_repository(repo), staged, + structure(.Call(git2r_status_list, lookup_repository(repo), staged, unstaged, untracked, all_untracked, ignored), class = "git_status") - if (!is_data_repo(repo)) { - return(s) - } - rgx <- paste0("^", repo$project, "/") - s <- lapply(s, function(x){lapply(x, gsub, pattern = rgx, replacement = "")}) - class(s) <- "git_status" - attr(s, "project") <- repo$project - return(s) } ##' @export @@ -106,11 +98,6 @@ print.git_status <- function(x, ...) 
invisible(NULL) } - project <- attr(x, "project", exact = TRUE) - if (!is.null(project)) { - cat("Project folder:", project, "\n\n") - } - if (max(sapply(x, length)) == 0L) cat("working directory clean\n") diff --git a/man/clone.Rd b/man/clone.Rd index c6544d3d8..d4acd5ea6 100644 --- a/man/clone.Rd +++ b/man/clone.Rd @@ -24,9 +24,6 @@ is TRUE.} access. Default is NULL. To use and query an ssh-agent for the ssh key credentials, let this parameter be NULL (the default).} -\item{project}{The name of of project. Refers to a local path in case of a -data repository. Defaults to \code{NULL}, indicating a standard repository.} - \item{progress}{Show progress. Default is TRUE.} } \value{ diff --git a/man/init.Rd b/man/init.Rd index ff8d24bd6..0d10d4a80 100644 --- a/man/init.Rd +++ b/man/init.Rd @@ -4,7 +4,7 @@ \alias{init} \title{Init a repository} \usage{ -init(path = ".", bare = FALSE, project = NULL) +init(path = ".", bare = FALSE) } \arguments{ \item{path}{A path to where to init a git repository} @@ -13,9 +13,6 @@ init(path = ".", bare = FALSE, project = NULL) is created at the pointed path. If FALSE, provided path will be considered as the working directory into which the .git directory will be created.} - -\item{project}{The name of of project. Refers to a local path in case of a -data repository. Defaults to \code{NULL}, indicating a standard repository.} } \value{ A \code{git_repository} object diff --git a/man/repository.Rd b/man/repository.Rd index 6a81a5cde..6fc2e4d14 100644 --- a/man/repository.Rd +++ b/man/repository.Rd @@ -4,32 +4,20 @@ \alias{repository} \title{Open a repository} \usage{ -repository(path = ".", discover = TRUE, project = NULL) +repository(path = ".", discover = TRUE) } \arguments{ \item{path}{A path to an existing local git repository.} \item{discover}{Discover repository from path. Default is TRUE.} - -\item{project}{The name of of project. Refers to a local path in case of a -data repository. 
Defaults to \code{NULL}, indicating a standard repository.} } \value{ -Either a \code{git_repository} object with entries: +A \code{git_repository} object with entries: \describe{ \item{path}{ Path to a git repository } } -or a \code{data_repository} object with entries: -\describe{ - \item{path}{ - Path to a git repository - } - \item{project}{ - The local path to the project starting from the root of the repository - } -} } \description{ Open a repository diff --git a/man/rm_file.Rd b/man/rm_file.Rd index 6c93bc048..13219dd6b 100644 --- a/man/rm_file.Rd +++ b/man/rm_file.Rd @@ -12,8 +12,7 @@ object. Default is '.'} \item{path}{character vector with filenames to remove. The path must be relative to the repository's working folder. Only -files known to Git are removed. Works different in case of -\code{data_repository}. See details.} +files known to Git are removed.} } \value{ invisible(NULL) @@ -21,23 +20,6 @@ invisible(NULL) \description{ Remove files from the working tree and from the index } -\details{ -In case of a \code{data_repository}, there are three options for \code{path} -\enumerate{ - \item{a vector of file names as used in \code{\link{write_delim_git}}}. - This will remove all associated \code{.tsv} and \code{.yml} files - \item{\code{".tsv"}} will remove ALL \code{.tsv} files. - \item{\code{".yml"}} will remove all \code{.yml} files which have no - associated \code{.tsv} file -} - -\code{path = ".tsv"} is useful when updating a \code{data_repository} with a -variable number of files. First use \code{rm_file(repo, path = ".tsv")} to -remove all \code{.tsv} files. Then use \code{write_delim_git()} the store -all the data.frames. End by using \code{rm_file(repo, path = ".yml")}, which -will clean any left-over \code{.yml} files. As a result, any data.frame -which wasn't rewritten will be deleted. 
-} \examples{ \dontrun{ ## Initialize a repository From 5281e40ce98a51b15041e3234973124743d528dc Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Mon, 20 Aug 2018 20:19:08 +0200 Subject: [PATCH 39/52] add rm_data() Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 47 +++++++++++++++++++++++++++++++++++++++++ R/repository.R | 1 - man/clone.Rd | 2 +- man/rm_data.Rd | 24 +++++++++++++++++++++ tests/data_repository.R | 12 +++++------ 6 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 man/rm_data.Rd diff --git a/NAMESPACE b/NAMESPACE index d0152abeb..17981e591 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -147,6 +147,7 @@ export(repository) export(repository_head) export(reset) export(revparse_single) +export(rm_data) export(rm_file) export(sha) export(ssh_path) diff --git a/R/data_repository.R b/R/data_repository.R index a6a9a8f5e..e99f057d4 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -315,3 +315,50 @@ clean_data_path <- function(path) { path <- normalizePath(unique(path), mustWork = FALSE) c(raw_file = paste0(path, ".tsv"), meta_file = paste0(path, ".yml")) } + +##' Remove data files +##' Remove all tsv and/or yml files within the path +##' @template repo-param +##' @param path the directory in which to clean all the data files +##' @param type which file type should be removed +##' @param recursive remove files in subdirectories too +##' @export +rm_data <- function( + repo = ".", path = NULL, type = c("tsv", "yml", "both"), recursive = TRUE +) { + repo <- lookup_repository(repo) + if (is.null(path) || !is.character(path)) + stop("'path' must be a character vector") + if (length(path) != 1) + stop("'path' must be a single value") + type <- match.arg(type) + + local_path <- file.path(workdir(repo), path) + if (type == "tsv") { + to_do <- list.files( + path = local_path, + pattern = "\\.tsv$", + recursive = recursive + ) + } else if (type == "both") { + to_do <- list.files( + path = local_path, + pattern = 
"\\.(tsv|yml)$", + recursive = recursive + ) + } else { + to_do <- list.files( + path = local_path, + pattern = "\\.yml$", + recursive = recursive + ) + keep <- list.files( + path = local_path, + pattern = "\\.tsv$", + recursive = recursive + ) + keep <- gsub("\\.tsv$", ".yml", keep) + to_do <- to_do[!to_do %in% keep] + } + rm_file(repo = repo, path = file.path(path, to_do)) +} diff --git a/R/repository.R b/R/repository.R index ef5e73929..a43b1f7a3 100644 --- a/R/repository.R +++ b/R/repository.R @@ -264,7 +264,6 @@ clone <- function(url = NULL, branch = NULL, checkout = TRUE, credentials = NULL, - project = NULL, progress = TRUE) { .Call(git2r_clone, url, local_path, bare, diff --git a/man/clone.Rd b/man/clone.Rd index d4acd5ea6..97afb89a6 100644 --- a/man/clone.Rd +++ b/man/clone.Rd @@ -5,7 +5,7 @@ \title{Clone a remote repository} \usage{ clone(url = NULL, local_path = NULL, bare = FALSE, branch = NULL, - checkout = TRUE, credentials = NULL, project = NULL, progress = TRUE) + checkout = TRUE, credentials = NULL, progress = TRUE) } \arguments{ \item{url}{The remote repository to clone} diff --git a/man/rm_data.Rd b/man/rm_data.Rd new file mode 100644 index 000000000..5d04b4ea7 --- /dev/null +++ b/man/rm_data.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_repository.R +\name{rm_data} +\alias{rm_data} +\title{Remove data files +Remove all tsv and/or yml files within the path} +\usage{ +rm_data(repo = ".", path = NULL, type = c("tsv", "yml", "both"), + recursive = TRUE) +} +\arguments{ +\item{repo}{a path to a repository or a \code{git_repository} +object. 
Default is '.'} + +\item{path}{the directory in which to clean all the data files} + +\item{type}{which file type should be removed} + +\item{recursive}{remove files in subdirectories too} +} +\description{ +Remove data files +Remove all tsv and/or yml files within the path +} diff --git a/tests/data_repository.R b/tests/data_repository.R index 439268ab5..1851b1868 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -122,25 +122,25 @@ stopifnot(all.equal( write_delim_git(x, "junk/test", data_repo) commit(data_repo, "test") -rm_file(data_repo, ".tsv") +rm_data(data_repo, ".", "tsv") stopifnot( all.equal( - status(data_repo)$s, + status(data_repo)$unstaged, list(deleted = "junk/test.tsv", deleted = "test.tsv") ) ) write_delim_git(x, "junk/test", data_repo) -rm_file(data_repo, ".yml") +rm_data(data_repo, ".", "yml") stopifnot( all.equal( - status(data_repo)$s, + status(data_repo)$unstaged, list(deleted = "test.tsv", deleted = "test.yml") ) ) -rm_file(data_repo, "junk/test") +rm_data(data_repo, ".", "both") stopifnot( all.equal( - status(data_repo)$s, + status(data_repo)$unstaged, list( deleted = "junk/test.tsv", deleted = "junk/test.yml", deleted = "test.tsv", deleted = "test.yml" From 42bddcd02347e7ad7ffde3bb11426a75e4b1578e Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Mon, 20 Aug 2018 20:57:49 +0200 Subject: [PATCH 40/52] update vignette --- vignettes/data-repository.Rmd | 119 +++++++++++++++------------------- 1 file changed, 51 insertions(+), 68 deletions(-) diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index 4c3b01ba0..04c764fbf 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -25,36 +25,30 @@ Although git can store binary files, its real power and efficiency is with plain Git stores the version history under the form of diffs: a list of lines which are deleted and a list of lines which are inserted at a specific line number in a file. 
We have to keep this in mind when storing data as plain text files in Git. The table below indicates the impact of changing the information content on the diff in Git. Note that changing observations requires a small diff, while changing variables results in a massive diff. -```{r echo = FALSE} -cases <- rbind( - c("remove 1 observation", "remove 1 line"), - c("add 1 observation", "add 1 line"), - c("update 1 observation", "remove 1 line and add 1 line"), - c("remove 1 variable", "remove all lines and add all lines"), - c("add 1 variable", "remove all lines and add all lines") -) -colnames(cases) <- c("Change in data", "Git diff") -kable(cases, caption = "Changes in the information content of the data") -``` + +| Change in data | Git diff | +|:----------------------|:----------| +| remove 1 observation | remove 1 line | +| add 1 observation | add 1 line | +| update 1 observation | remove 1 line and add 1 line | +| remove 1 variable | remove all lines and add all lines | +| add 1 variable | remove all lines and add all lines | +Changes in the information content of the data: In the next table we show the effect on the diff when making changes in the data which don't change the information content of the data. -```{r echo = FALSE} -cases <- rbind( - c("move 1 observation", "remove 1 line and add 1 line"), - c("move 1 variable", "remove all lines and add all lines") -) -colnames(cases) <- c("Change in data", "Git diff") -kable(cases, caption = "Changes in the data without changing the information content") -``` -So in order to use Git as a performant and efficient version control system for data, we need to make sure to store the metadata and keep the diffs as small as possible. 
+|Change in data |Git diff | +|:------------------|:----------------------------------| +|move 1 observation |remove 1 line and add 1 line | +|move 1 variable |remove all lines and add all lines | +Changes in the data without changing the information content -## Tools in `git2r` +So in order to use Git as a performant and efficient version control system for data, we need to make sure to store the metadata and keep the diffs as small as possible. In the next section, the way git2r handles this will be explained. -### Data repository +## Tools in `git2r` -A 'data repository' refers to a subdirectory within the root of an existing git repository. Create a `data_repository` object with `init()` or `repository()` and specify the subdirectory through the `project` argument. Most functions of the package handle a `data_repository` object in the same way as they handle a `git_repository` object. The main difference is that the working directory points to the subdirectory of the root, rather than the root of the git repository. You can create multiple data repositories within the same git repository. +### Connect to a repository ```{r} # Create a directory in tempdir @@ -64,18 +58,16 @@ dir.create(path) library(git2r) # inititialize a new git repository repo <- init(path) -# use a data repository in an existing git repository -data_repo <- repository(path, project = "test") ``` ```{r eval = FALSE} -# inititialize a new git repository and use a data repository within it -data_repo <- init(path, project = "test") +# connect to an existing repository +repo <- repository(path) ``` ### Storing data -Use `write_delim_git()` to store a `data.frame` into the data repository. The function will separate the data and the metadata. The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The metadata is stored in YAML format with ".yml" extension. Therefore any extension given to the `file` will be stripped (with a warning). 
+Use `write_delim_git()` to store a `data.frame` into the repository. The function will separate the data and the metadata. The data is stored as a headerless, unquoted tab delimited file with ".tsv" extension and UTF-8 encoding. The metadata is stored in YAML format with ".yml" extension. Therefore any extension given to the `file` will be stripped (with a warning). The function will do a lot more preprocessing to the data in order to keep the file and the diff as small as possible. That is the reason why the data is stored headerless and unquoted. @@ -114,9 +106,8 @@ Another important part of the preprocessing is ordering both the variables and t The data file (".tsv") and the metadata file (".yml") are automatically staged after writing. `write_delim_git()` returns the hashes of both the data file and the metadata file. The names of the vector contain the path of the files, relative to the root of the _git_ repository. ```{r} -status(data_repo) -write_delim_git(x = x, file = "my_data", repo = data_repo, sorting = c("y", "x")) -status(data_repo) +status(repo) +write_delim_git(x = x, file = "my_data", repo = repo, sorting = c("y", "x")) status(repo) ``` @@ -132,9 +123,9 @@ However you can bypass this by using `override = TRUE`. This will completely ign ```{r error=TRUE} y <- x y$extra <- x$x -write_delim_git(y, file = "my_data", repo = data_repo, sorting = c("y", "x")) +write_delim_git(y, file = "my_data", repo = repo, sorting = c("y", "x")) write_delim_git( - y, file = "my_data", repo = data_repo, sorting = c("y", "x"), + y, file = "my_data", repo = repo, sorting = c("y", "x"), override = TRUE ) ``` @@ -144,7 +135,7 @@ write_delim_git( Retrieving data is straight forward. Use `read_delim_git` and provide the `file` and the `repo`. The retrieved data is identical to the original data after applying the ordering of variables and observations. 
```{r} -y_stored <- read_delim_git(file = "my_data", repo = data_repo) +y_stored <- read_delim_git(file = "my_data", repo = repo) str(y_stored) y_sorted <- y[order(y$y, y$x), colnames(y_stored)] rownames(y_sorted) <- NULL @@ -156,53 +147,45 @@ stopifnot(all.equal(y_sorted, y_stored)) Commit the changes using `commit()` ```{r} -write_delim_git(x, file = "sub/test", repo = data_repo, sorting = "x") -status(data_repo) -commit(data_repo, message = "Initial commit") -status(data_repo) +write_delim_git(x, file = "sub/test", repo = repo, sorting = "x") +status(repo) +commit(repo, message = "Initial commit") +status(repo) ``` ### Removing data -Data objects can be removed with `rm_file`. Provide the data repository to `repo` and the data object name to `path`.`path` can take multiple data objects. +Data objects are files, so you remove them with `rm_file`. ```{r} -# remove a single data object -rm_file(repo = data_repo, path = "my_data") -status(data_repo) +# remove a single file +rm_file(repo = repo, path = "my_data.tsv") +rm_file(repo = repo, path = "my_data.yml") +status(repo) # undo the remove by resetting to the last commit -reset(commits(data_repo)[[1]], "hard") -status(data_repo) -# remove multiple data objects -rm_file(repo = data_repo, path = c("my_data", "sub/test")) -status(data_repo) +reset(commits(repo)[[1]], "hard") +status(repo) +# remove all data objects in a given path +rm_data(repo = repo, path = ".", type = "both") +status(repo) ``` -Note that this will remove both the data and the metadata. - -In case of a data repository, there are three options for `path`: - -- a vector of file names as used in `write_delim_git}`. This will remove all associated `.tsv` and `.yml` files. -- `".tsv"` will remove ALL `.tsv` files. -- `".yml"` will remove all `.yml` files which have no associated `.tsv` file. - -In case you want to update the current data, it is better to overwrite the data than first delete it and then write it. 
Sometime you want to write a bunch of data object to the data repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. +Sometimes you want to write a bunch of data objects to the repository and you cannot guarantee that all data objects will remain after the update. The workflow below is intended for such case. -1. use `path = ".tsv"` in `rm_file()` to remove **all** ".tsv" in the data repository +1. use `rm_data()` with `type = "tsv"` to remove **all** ".tsv" in a specific path of the repository 1. use `write_delim_git()` to write all data objects. Since the ".yml" are still present, any existing metadata will be used. -1. use `path = ".yml"` in `rm_file()` to remove **any** ".yml" which have no associated ".tsv" +1. use `type = "yml"` in `rm_data()` to remove **any** ".yml" which have no associated ".tsv" 1. `commit()` the changes ```{r} # undo the remove by resetting to the last commit -reset(commits(data_repo)[[1]], "hard") -status(data_repo) -rm_file(data_repo, ".tsv") -status(data_repo) -write_delim_git(x, file = "sub/test", repo = data_repo, sorting = "x") -status(data_repo) -rm_file(data_repo, ".yml") -status(data_repo) -commit(data_repo, "Automated update") +reset(commits(repo)[[1]], "hard") +status(repo) +rm_data(repo, path = ".", type = "tsv") +status(repo) +write_delim_git(x, file = "sub/test", repo = repo, sorting = "x") +status(repo) +rm_data(repo, path = ".", type = "yml") +status(repo) +commit(repo, "Automated update", all = TRUE) ``` - From cbc982946751703468f9dfeff87320dc655cf314 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 21 Aug 2018 14:53:15 +0200 Subject: [PATCH 41/52] write_delim_git() gains a stage argument Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 - R/data_repository.R | 25 +++++++++++-------------- man/is_data_repo.Rd | 20 -------------------- man/rm_data.Rd | 4 +++- man/write_delim_git.Rd | 4 +++- tests/data_repository.R | 3 ++- 
vignettes/data-repository.Rmd | 3 +++ 7 files changed, 22 insertions(+), 38 deletions(-) delete mode 100644 man/is_data_repo.Rd diff --git a/NAMESPACE b/NAMESPACE index 17981e591..653c89e6a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,7 +107,6 @@ export(is_binary) export(is_blob) export(is_branch) export(is_commit) -export(is_data_repo) export(is_detached) export(is_empty) export(is_head) diff --git a/R/data_repository.R b/R/data_repository.R index e99f057d4..d75669691 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -28,13 +28,14 @@ ##' @param override Ignore existing meta data. This is required when new ##' variables are added or variables are deleted. Setting this to TRUE can ##' potentially lead to large diffs. Defaults to FALSE. +##' @param stage immediatly stage the changes ##' @inheritParams add ##' @return a named vector with the hashes of the files. The names contains the ##' files with their paths relative to the root of the git_repository. ##' @export ##' @importFrom utils tail write.table write_delim_git <- function( - x, file, repo = ".", sorting, override = FALSE, force = FALSE + x, file, repo = ".", sorting, override = FALSE, stage = FALSE, force = FALSE ) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") @@ -112,7 +113,9 @@ write_delim_git <- function( quote = FALSE, sep = "\t", eol = "\n", dec = ".", row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" ) - add(repo, path = file, force = force) + if (stage) { + add(repo, path = file, force = force) + } hashes <- hashfile(file) names(hashes) <- gsub(paste0("^", workdir(repo), "/"), "", file) @@ -289,17 +292,6 @@ meta.Date <- function(x) { return(z) } -##' Check if object is a data repository -##' @param object the object to check -##' @return TRUE is a data \code{git_repository}, else FALSE -##' @seealso repo init -##' @export -is_data_repo <- function(object) { - inherits(object, "git_repository") && - inherits(object, "data_repository") && - 
!is.null(object$project) -} - ##' Clean the data path ##' Strips any file extension from the path and adds the ".tsv" and ".yml" file extensions ##' @param path the paths @@ -322,9 +314,11 @@ clean_data_path <- function(path) { ##' @param path the directory in which to clean all the data files ##' @param type which file type should be removed ##' @param recursive remove files in subdirectories too +##' @inheritParams write_delim_git ##' @export rm_data <- function( - repo = ".", path = NULL, type = c("tsv", "yml", "both"), recursive = TRUE + repo = ".", path = NULL, type = c("tsv", "yml", "both"), recursive = TRUE, + stage = FALSE ) { repo <- lookup_repository(repo) if (is.null(path) || !is.character(path)) @@ -361,4 +355,7 @@ rm_data <- function( to_do <- to_do[!to_do %in% keep] } rm_file(repo = repo, path = file.path(path, to_do)) + if (stage) { + add(repo, path = to_do) + } } diff --git a/man/is_data_repo.Rd b/man/is_data_repo.Rd deleted file mode 100644 index 1a1e1d6fb..000000000 --- a/man/is_data_repo.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_repository.R -\name{is_data_repo} -\alias{is_data_repo} -\title{Check if object is a data repository} -\usage{ -is_data_repo(object) -} -\arguments{ -\item{object}{the object to check} -} -\value{ -TRUE is a data \code{git_repository}, else FALSE -} -\description{ -Check if object is a data repository -} -\seealso{ -repo init -} diff --git a/man/rm_data.Rd b/man/rm_data.Rd index 5d04b4ea7..4d6f07a6e 100644 --- a/man/rm_data.Rd +++ b/man/rm_data.Rd @@ -6,7 +6,7 @@ Remove all tsv and/or yml files within the path} \usage{ rm_data(repo = ".", path = NULL, type = c("tsv", "yml", "both"), - recursive = TRUE) + recursive = TRUE, stage = FALSE) } \arguments{ \item{repo}{a path to a repository or a \code{git_repository} @@ -17,6 +17,8 @@ object. 
Default is '.'} \item{type}{which file type should be removed} \item{recursive}{remove files in subdirectories too} + +\item{stage}{immediatly stage the changes} } \description{ Remove data files diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 15b6ce52b..2f9899621 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -5,7 +5,7 @@ \title{Write a \code{data.frame} to a git repository} \usage{ write_delim_git(x, file, repo = ".", sorting, override = FALSE, - force = FALSE) + stage = FALSE, force = FALSE) } \arguments{ \item{x}{the \code{data.frame}} @@ -25,6 +25,8 @@ sorting \code{x} and in what order to use them. Defaults to variables are added or variables are deleted. Setting this to TRUE can potentially lead to large diffs. Defaults to FALSE.} +\item{stage}{immediatly stage the changes} + \item{force}{Add ignored files. Default is FALSE.} } \value{ diff --git a/tests/data_repository.R b/tests/data_repository.R index 1851b1868..cbcb7019b 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -54,7 +54,7 @@ stopifnot(all.equal( z <- status(data_repo) print(z) stopifnot( - all.equal(z$s, list(new = "test.tsv", new = "test.yml")) + all.equal(z$untracked, list(untracked = "test.tsv", untracked = "test.yml")) ) write_delim_git(x, "test", data_repo) stopifnot(all.equal(status(data_repo), z)) @@ -121,6 +121,7 @@ stopifnot(all.equal( )) write_delim_git(x, "junk/test", data_repo) +add(data_repo, path = ".") commit(data_repo, "test") rm_data(data_repo, ".", "tsv") stopifnot( diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index 04c764fbf..eaa7b8af6 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -149,6 +149,8 @@ Commit the changes using `commit()` ```{r} write_delim_git(x, file = "sub/test", repo = repo, sorting = "x") status(repo) +add(repo, ".") +status(repo) commit(repo, message = "Initial commit") status(repo) ``` @@ -188,4 +190,5 @@ status(repo) rm_data(repo, 
path = ".", type = "yml") status(repo) commit(repo, "Automated update", all = TRUE) +status(repo) ``` From f476e182415ca6eba820f46ca0c1ae6622a8c2a5 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 21 Aug 2018 17:40:11 +0200 Subject: [PATCH 42/52] write_delim_git() gains a optimize argument Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 87 +++++++++++++++++++++++++++++++----------- man/meta.Rd | 8 ++-- man/write_delim_git.Rd | 4 +- 3 files changed, 73 insertions(+), 26 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index d75669691..4203d4f20 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -30,12 +30,14 @@ ##' potentially lead to large diffs. Defaults to FALSE. ##' @param stage immediatly stage the changes ##' @inheritParams add +##' @inheritParams meta ##' @return a named vector with the hashes of the files. The names contains the ##' files with their paths relative to the root of the git_repository. ##' @export ##' @importFrom utils tail write.table write_delim_git <- function( - x, file, repo = ".", sorting, override = FALSE, stage = FALSE, force = FALSE + x, file, repo = ".", sorting, override = FALSE, stage = FALSE, + optimize = TRUE, force = FALSE ) { if (!inherits(x, "data.frame")) { stop("x is not a 'data.frame'") @@ -58,7 +60,10 @@ write_delim_git <- function( if (!dir.exists(dirname(file["raw_file"]))) { dir.create(dirname(file["raw_file"]), recursive = TRUE) } - raw_data <- as.data.frame(lapply(x, meta), stringsAsFactors = FALSE) + raw_data <- as.data.frame( + lapply(x, meta, optimize = optimize), + stringsAsFactors = FALSE + ) meta_data <- paste( colnames(x), vapply(raw_data, attr, "", which = "meta"), @@ -72,13 +77,29 @@ write_delim_git <- function( to_sort <- colnames(x) %in% sorting meta_data <- meta_data[c(sorting, colnames(x)[!to_sort])] meta_data[sorting] <- paste0(meta_data[sorting], "\n sort") - writeLines(meta_data, file["meta_file"]) + if (optimize) { + store_meta_data <- c(meta_data, 
"optimized") + } else { + store_meta_data <- c(meta_data, "verbose") + } + writeLines(store_meta_data, file["meta_file"]) } else { old_meta_data <- readLines(file["meta_file"]) + if (tail(old_meta_data, 1) == "verbose") { + if (optimize) { + stop("old data was stored verbose") + } + } else if (tail(old_meta_data, 1) == "optimized") { + if (!optimize) { + stop("old data was stored optimized") + } + } else { + stop("error in existing metadata") + } meta_cols <- grep("^\\S*:$", old_meta_data) positions <- cbind( start = meta_cols, - end = c(tail(meta_cols, -1) - 1, length(old_meta_data)) + end = c(tail(meta_cols, -1) - 1, length(old_meta_data) - 1) ) old_meta_data <- apply( positions, @@ -110,8 +131,8 @@ write_delim_git <- function( raw_data <- raw_data[do.call(order, raw_data[sorting]), ] write.table( x = raw_data, file = file["raw_file"], append = FALSE, - quote = FALSE, sep = "\t", eol = "\n", dec = ".", - row.names = FALSE, col.names = FALSE, fileEncoding = "UTF-8" + quote = !optimize, sep = "\t", eol = "\n", dec = ".", + row.names = FALSE, col.names = !optimize, fileEncoding = "UTF-8" ) if (stage) { add(repo, path = file, force = force) @@ -165,9 +186,16 @@ read_delim_git <- function(file, repo = ".") { meta_data <- readLines(file["meta_file"]) meta_cols <- grep("^\\S*:$", meta_data) col_names <- gsub(":", "", meta_data[meta_cols]) + if (tail(meta_data, 1) == "optimized") { + optimize <- TRUE + } else if (tail(meta_data, 1) == "verbose") { + optimize <- FALSE + } else { + stop("error in metadata") + } raw_data <- read.table( - file = file["raw_file"], header = FALSE, - sep = "\t", quote = "", dec = ".", + file = file["raw_file"], header = !optimize, + sep = "\t", quote = ifelse(optimize, "", "\"'"), dec = ".", as.is = TRUE, col.names = col_names ) @@ -212,8 +240,9 @@ read_delim_git <- function(file, repo = ".") { return(raw_data) } -##' Optimise a vector for storage in to a git repository and add meta data +##' optimize a vector for storage in to a git repository 
and add meta data ##' @param x the vector +##' @param optimize recode the data to get smaller text files. Defaults to TRUE ##' @details ##' \itemize{ ##' \item \code{meta.character} checks for the presence of \code{'NA'}. @@ -224,12 +253,12 @@ read_delim_git <- function(file, repo = ".") { ##' detected. ##' } ##' @export -meta <- function(x) { +meta <- function(x, optimize = TRUE) { UseMethod("meta") } ##' @export -meta.character <- function(x) { +meta.character <- function(x, optimize = TRUE) { attr(x, "meta") <- " class: character" if (any(is.na(x))) { stop( @@ -243,20 +272,24 @@ factor." } ##' @export -meta.integer <- function(x) { +meta.integer <- function(x, optimize = TRUE) { attr(x, "meta") <- " class: integer" return(x) } ##' @export -meta.numeric <- function(x) { +meta.numeric <- function(x, optimize = TRUE) { attr(x, "meta") <- " class: numeric" return(x) } ##' @export -meta.factor <- function(x) { - z <- as.integer(x) +meta.factor <- function(x, optimize = TRUE) { + if (optimize) { + z <- as.integer(x) + } else { + z <- x + } attr(z, "meta") <- paste( " class: factor\n levels:", paste(" -", levels(x), collapse = "\n"), @@ -266,28 +299,38 @@ meta.factor <- function(x) { } ##' @export -meta.logical <- function(x) { - x <- as.integer(x) +meta.logical <- function(x, optimize = TRUE) { + if (optimize) { + x <- as.integer(x) + } attr(x, "meta") <- " class: logical" return(x) } ##' @export -meta.complex <- function(x) { +meta.complex <- function(x, optimize = TRUE) { attr(x, "meta") <- " class: complex" return(x) } ##' @export -meta.POSIXct <- function(x) { - z <- unclass(x) +meta.POSIXct <- function(x, optimize = TRUE) { + if (optimize) { + z <- unclass(x) + } else { + z <- x + } attr(z, "meta") <- " class: POSIXct\n origin: 1970-01-01\n" return(z) } ##' @export -meta.Date <- function(x) { - z <- unclass(x) +meta.Date <- function(x, optimize = TRUE) { + if (optimize) { + z <- unclass(x) + } else { + z <- x + } attr(z, "meta") <- " class: Date\n origin: 
1970-01-01\n" return(z) } diff --git a/man/meta.Rd b/man/meta.Rd index dbf90e058..3ddc11c2f 100644 --- a/man/meta.Rd +++ b/man/meta.Rd @@ -2,15 +2,17 @@ % Please edit documentation in R/data_repository.R \name{meta} \alias{meta} -\title{Optimise a vector for storage in to a git repository and add meta data} +\title{optimize a vector for storage in to a git repository and add meta data} \usage{ -meta(x) +meta(x, optimize = TRUE) } \arguments{ \item{x}{the vector} + +\item{optimize}{recode the data to get smaller text files. Defaults to TRUE} } \description{ -Optimise a vector for storage in to a git repository and add meta data +optimize a vector for storage in to a git repository and add meta data } \details{ \itemize{ diff --git a/man/write_delim_git.Rd b/man/write_delim_git.Rd index 2f9899621..1991d5039 100644 --- a/man/write_delim_git.Rd +++ b/man/write_delim_git.Rd @@ -5,7 +5,7 @@ \title{Write a \code{data.frame} to a git repository} \usage{ write_delim_git(x, file, repo = ".", sorting, override = FALSE, - stage = FALSE, force = FALSE) + stage = FALSE, optimize = TRUE, force = FALSE) } \arguments{ \item{x}{the \code{data.frame}} @@ -27,6 +27,8 @@ potentially lead to large diffs. Defaults to FALSE.} \item{stage}{immediatly stage the changes} +\item{optimize}{recode the data to get smaller text files. Defaults to TRUE} + \item{force}{Add ignored files. 
Default is FALSE.} } \value{ From d78f9f18dd018a4f2b6748a323dcc2307decda36 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Tue, 21 Aug 2018 17:59:36 +0200 Subject: [PATCH 43/52] revert unneeded changes Signed-off-by: Thierry Onkelinx --- R/merge.R | 1 + R/repository.R | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R/merge.R b/R/merge.R index 1d7975ef0..548df6c48 100644 --- a/R/merge.R +++ b/R/merge.R @@ -95,6 +95,7 @@ merge.git_repository <- function(x, y = NULL, commit_on_success = TRUE, merge.git_branch(b, commit_on_success = commit_on_success, merger = merger, fail = fail) } + ##' @export ##' @rdname merge merge.character <- function(x = ".", y = NULL, commit_on_success = TRUE, diff --git a/R/repository.R b/R/repository.R index a43b1f7a3..d8e954bfe 100644 --- a/R/repository.R +++ b/R/repository.R @@ -174,7 +174,6 @@ repository <- function(path = ".", discover = TRUE) { ##' is created at the pointed path. If FALSE, provided path will ##' be considered as the working directory into which the .git ##' directory will be created. -##' @inheritParams repository ##' @return A \code{git_repository} object ##' @export ##' @seealso \link{repository} @@ -213,7 +212,6 @@ init <- function(path = ".", bare = FALSE) { ##' access. Default is NULL. To use and query an ssh-agent for the ##' ssh key credentials, let this parameter be NULL (the default). ##' @param progress Show progress. Default is TRUE. -##' @inheritParams repository ##' @return A \code{git_repository} object. 
##' @seealso \link{repository}, \code{\link{cred_user_pass}}, ##' \code{\link{cred_ssh_key}} From ee881ca5856b8c1fb7b857628190d54e02659358 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 22 Aug 2018 10:39:54 +0200 Subject: [PATCH 44/52] Add more unit tests Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 22 +++++++++----- tests/data_repository.R | 64 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index 4203d4f20..4f8deb7e5 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -213,14 +213,22 @@ read_delim_git <- function(file, repo = ".") { list(character(0)) ) names(col_factor_level) <- col_names[col_factor] - for (id in names(col_factor_level)) { - raw_data[[id]] <- factor( - raw_data[[id]], - levels = seq_along(col_factor_level[[id]]), - labels = col_factor_level[[id]] - ) + if (optimize) { + for (id in names(col_factor_level)) { + raw_data[[id]] <- factor( + raw_data[[id]], + levels = seq_along(col_factor_level[[id]]), + labels = col_factor_level[[id]] + ) + } + } else { + for (id in names(col_factor_level)) { + raw_data[[id]] <- factor( + raw_data[[id]], + levels = col_factor_level[[id]] + ) + } } - col_logical <- which(col_classes == "logical") for (id in col_logical) { raw_data[[id]] <- as.logical(raw_data[[id]]) diff --git a/tests/data_repository.R b/tests/data_repository.R index cbcb7019b..26c88a575 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -47,14 +47,14 @@ x <- data.frame( stopifnot(all.equal( tools::assertWarning( - write_delim_git(x, "test.txt", data_repo) + write_delim_git(x, "test.txt", data_repo, stage = TRUE) )[[1]]$message, "file extensions are stripped" )) z <- status(data_repo) print(z) stopifnot( - all.equal(z$untracked, list(untracked = "test.tsv", untracked = "test.yml")) + all.equal(z$staged, list(new = "test.tsv", new = "test.yml")) ) write_delim_git(x, "test", data_repo) 
stopifnot(all.equal(status(data_repo), z)) @@ -138,13 +138,18 @@ stopifnot( list(deleted = "test.tsv", deleted = "test.yml") ) ) -rm_data(data_repo, ".", "both") +rm_data(data_repo, ".", "both", stage = TRUE) stopifnot( all.equal( status(data_repo)$unstaged, + list(deleted = "test.tsv", deleted = "test.yml") + ) +) +stopifnot( + all.equal( + status(data_repo)$staged, list( - deleted = "junk/test.tsv", deleted = "junk/test.yml", - deleted = "test.tsv", deleted = "test.yml" + deleted = "junk/test.tsv", deleted = "junk/test.yml" ) ) ) @@ -180,3 +185,52 @@ z <- read_delim_git("logical", data_repo) y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] rownames(y.sorted) <- NULL stopifnot(all.equal(y.sorted, z)) + +stopifnot(all.equal( + tools::assertError( + write_delim_git( + y, "logical", data_repo, sorting = c("y", "logic"), optimize = FALSE + ) + )[[1]][["message"]], + "old data was stored optimized" +)) + +write_delim_git(y, "verbose", data_repo, optimize = FALSE) +z <- read_delim_git("verbose", data_repo) +stopifnot(all.equal(y, z)) + +stopifnot(all.equal( + tools::assertError( + write_delim_git(y, "verbose", data_repo, optimize = TRUE) + )[[1]][["message"]], + "old data was stored verbose" +)) +yml <- file.path(path, "verbose.yml") +meta <- head(readLines(yml), -1) +writeLines(text = meta, con = yml) +stopifnot(all.equal( + tools::assertError( + read_delim_git("verbose", data_repo) + )[[1]][["message"]], + "error in metadata" +)) +stopifnot(all.equal( + tools::assertError( + write_delim_git(y, "verbose", data_repo, optimize = FALSE) + )[[1]][["message"]], + "error in existing metadata" +)) + +stopifnot(all.equal( + tools::assertError( + rm_data(path) + )[[1]][["message"]], + "'path' must be a character vector" +)) +stopifnot(all.equal( + tools::assertError( + rm_data(path, c(".", "junk")) + )[[1]][["message"]], + "'path' must be a single value" +)) + From 3fe6a6fa90ada1fadce6402de7132588db1f05f5 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 
22 Aug 2018 11:34:51 +0200 Subject: [PATCH 45/52] More work on the vignette Signed-off-by: Thierry Onkelinx --- vignettes/data-repository.Rmd | 45 +++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index eaa7b8af6..b3c9211e6 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -33,7 +33,7 @@ Git stores the version history under the form of diffs: a list of lines which ar | update 1 observation | remove 1 line and add 1 line | | remove 1 variable | remove all lines and add all lines | | add 1 variable | remove all lines and add all lines | -Changes in the information content of the data: +Table: Changes in the information content of the data In the next table we show the effect on the diff when making changes in the data which don't change the information content of the data. @@ -42,7 +42,7 @@ In the next table we show the effect on the diff when making changes in the data |:------------------|:----------------------------------| |move 1 observation |remove 1 line and add 1 line | |move 1 variable |remove all lines and add all lines | -Changes in the data without changing the information content +Table: Changes in the data without changing the information content So in order to use Git as a performant and efficient version control system for data, we need to make sure to store the metadata and keep the diffs as small as possible. In the next section, the way git2r handles this will be explained. @@ -78,6 +78,7 @@ The function will do a lot more preprocessing to the data in order to keep the f - `character` is written as is and unquoted to the data. The class is stored in the metadata. - `factor` is stored as its indices in the data. The labels of levels are stored in the metadata. - `POSIXct` is written as an integer to the data. The class and the origin are stored in the metadata. +- `Date` is written as an integer to the data. 
The class and the origin are stored in the metadata. ```{r} # Create dummy data @@ -192,3 +193,43 @@ status(repo) commit(repo, "Automated update", all = TRUE) status(repo) ``` + +### Verbose data storage + +`write_delim_git()` will store the data by default in an optimize way in the repository. The downside of this is that the stored data is less human-readable. + +```{r} +# the first 10 lines of the raw data file in optimized format +opt <- readLines(file.path(path, "sub/test.tsv")) +cat(head(opt, 10), sep = "\n") +``` + +Setting `optimize = FALSE` will store the data in a human-readable format. The main differences are: + +1. the raw data file will have a header row with the column names +1. factor, POSIXct and Date are stored as characters instead of integer +1. all character columns are quoted + +```{r} +# the first 10 lines of the raw data file in verbose format +write_delim_git(x, file = "verb", repo = repo, sorting = "x", optimize = FALSE) +verb <- readLines(file.path(path, "verb.tsv")) +cat(head(verb, 10), sep = "\n") +``` + + +Verbose storage of the raw data will require more disk space. The table below indicates the number of bytes required to store the data types which can be optimized. In case of factor and character this will highly depend on the length of the text. For sake of simplicity we used three examples: 1) _short_: 1 character per level and < 10 levels; 2) _medium_: 10 characters per level and < 100 levels; 3) _long_: 50 characters per level and < 1000 levels. Note that factors are more efficient than characters, unless the character column contains only unique values. 
+ +| data type | verbose | optimized | ratio | +| ---------------- | -------:| ---------:| -----:| +| POSIXct | 19 | 16 | 0.842 | +| Date | 10 | 5 | 0.500 | +| Logical | 4 | 1 | 0.250 | +| short factor | 3 | 1 | 0.333 | +| medium factor | 12 | 2 | 0.167 | +| long factor | 52 | 3 | 0.058 | +| short character | 3 | 1 | 0.333 | +| medium character | 12 | 10 | 0.833 | +| long character | 52 | 50 | 0.962 | +Table: Comparison of the number of bytes required to store a single value in verbose or optimized format + From d26aaf9ace42d095aa1295bcbb59adb1928a7d91 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 22 Aug 2018 12:01:38 +0200 Subject: [PATCH 46/52] bugfixes Signed-off-by: Thierry Onkelinx --- tests/data_repository.R | 2 -- vignettes/data-repository.Rmd | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/data_repository.R b/tests/data_repository.R index 26c88a575..8da2b7549 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -25,8 +25,6 @@ dir.create(path) ## Initialize a repository data_repo <- init(path) -stopifnot(inherits(data_repo, "git_repository")) -stopifnot(all.equal(data_repo$path, file.path(path, ".git"))) config(data_repo, user.name = "Alice", user.email = "alice@example.org") stopifnot(all.equal( diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index b3c9211e6..0d2d7dd4b 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -58,6 +58,7 @@ dir.create(path) library(git2r) # inititialize a new git repository repo <- init(path) +config(repo, user.name = "Alice", user.email = "alice@example.org") ``` ```{r eval = FALSE} From 24dc997b42eed23b7279b3a64d5fe10a4823bb60 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 22 Aug 2018 12:23:47 +0200 Subject: [PATCH 47/52] use file.exists() instead of dir.exists() dir.exists() is not available in R < 3.2.0 Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/R/data_repository.R b/R/data_repository.R index 4f8deb7e5..b2a04c902 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -57,7 +57,7 @@ write_delim_git <- function( file <- file.path(workdir(repo), file) file <- clean_data_path(file) - if (!dir.exists(dirname(file["raw_file"]))) { + if (!file.exists(dirname(file["raw_file"]))) { dir.create(dirname(file["raw_file"]), recursive = TRUE) } raw_data <- as.data.frame( From 51c4d7ea87ab65a48b0a56d592f5cb627b485ade Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Fri, 24 Aug 2018 09:50:08 +0200 Subject: [PATCH 48/52] fix typos Signed-off-by: Thierry Onkelinx --- vignettes/data-repository.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/data-repository.Rmd b/vignettes/data-repository.Rmd index 0d2d7dd4b..3456bf69b 100644 --- a/vignettes/data-repository.Rmd +++ b/vignettes/data-repository.Rmd @@ -134,7 +134,7 @@ write_delim_git( ### Reading data -Retrieving data is straight forward. Use `read_delim_git` and provide the `file` and the `repo`. The retrieved data is identical to the original data after applying the ordering of variables and observations. +Retrieving data is straightforward. Use `read_delim_git` and provide the `file` and the `repo`. The retrieved data is identical to the original data after applying the ordering of variables and observations. ```{r} y_stored <- read_delim_git(file = "my_data", repo = repo) @@ -197,7 +197,7 @@ status(repo) ### Verbose data storage -`write_delim_git()` will store the data by default in an optimize way in the repository. The downside of this is that the stored data is less human-readable. +`write_delim_git()` will store the data by default in an optimized way in the repository. The downside of this is that the stored data is less human-readable. 
```{r} # the first 10 lines of the raw data file in optimized format From fa8cddb8c4f0f2e9c6c3f1072bc90f20f255aa5f Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 29 Aug 2018 12:07:58 +0200 Subject: [PATCH 49/52] add recent_commit() Signed-off-by: Thierry Onkelinx --- NAMESPACE | 1 + R/data_repository.R | 31 +++++++++++++++++++ man/recent_commit.Rd | 21 +++++++++++++ tests/data_repository.R | 66 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 man/recent_commit.Rd diff --git a/NAMESPACE b/NAMESPACE index 653c89e6a..6eb16bfa6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -133,6 +133,7 @@ export(pull) export(punch_card) export(push) export(read_delim_git) +export(recent_commit) export(references) export(reflog) export(remote_add) diff --git a/R/data_repository.R b/R/data_repository.R index b2a04c902..75bb677b6 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -410,3 +410,34 @@ rm_data <- function( add(repo, path = to_do) } } + +##' Recent file change +##' Retrieve the most recent commit in which a file or data object was changed. +##' @template repo-param +##' @param path the path to the file or the data object. File extensions are silently ignored in case of a data object +##' @param data refers path to a file (FALSE) or a data object (TRUE). Defaults to FALSE +##' @export +##' @return A data.frame with commit, author and timestamp. 
Will contain multiple rows when multiple commits are made within the same second +recent_commit <- function(repo, path = NULL, data = FALSE) { + repo <- lookup_repository(repo) + if (is.null(path) || !is.character(path)) + stop("'path' must be a character vector") + if (length(path) != 1) + stop("'path' must be a single value") + if (data) { + path <- clean_data_path(path) + } + name <- basename(path) + path <- unique(dirname(path)) + if (path == ".") { + path <- "" + } + blobs <- odb_blobs(repo) + blobs <- blobs[blobs$path == path & blobs$name %in% name, ] + blobs <- blobs[blobs$when == max(blobs$when), c("commit", "author", "when")] + blobs <- unique(blobs) + if (nrow(blobs) > 1) { + warning("Multiple commits within the same second") + } + blobs +} diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd new file mode 100644 index 000000000..13b44de69 --- /dev/null +++ b/man/recent_commit.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_repository.R +\name{recent_commit} +\alias{recent_commit} +\title{Recent file change +Retrieve the most commit in which a file or data object was changed.} +\usage{ +recent_commit(repo, path = NULL, data = FALSE) +} +\arguments{ +\item{repo}{a path to a repository or a \code{git_repository} +object. Default is '.'} + +\item{path}{the path to the file or the data object. File extensions are silently ignored in case of a data object} + +\item{data}{refers path to a file (FALSE) or a data object (TRUE). Defaults to FALSE} +} +\description{ +Recent file change +Retrieve the most commit in which a file or data object was changed. 
+} diff --git a/tests/data_repository.R b/tests/data_repository.R index 8da2b7549..a0f0b6826 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -23,6 +23,10 @@ sessionInfo() path <- tempfile(pattern = "git2r-") dir.create(path) +# currently odb_blobs() can't handle subsecond commits +# when TRUE Sys.sleep(1) is added before each commit +subsecond <- TRUE + ## Initialize a repository data_repo <- init(path) config(data_repo, user.name = "Alice", user.email = "alice@example.org") @@ -50,12 +54,15 @@ stopifnot(all.equal( "file extensions are stripped" )) z <- status(data_repo) -print(z) stopifnot( all.equal(z$staged, list(new = "test.tsv", new = "test.yml")) ) write_delim_git(x, "test", data_repo) stopifnot(all.equal(status(data_repo), z)) +add(data_repo, path = ".") +if (subsecond) Sys.sleep(1) +commit_1 <- commit(data_repo, "initial commit") + stopifnot(all.equal( tools::assertError( write_delim_git(x[, 1:3], "test", data_repo) @@ -117,10 +124,15 @@ stopifnot(all.equal( tools::assertError(read_delim_git("", data_repo))[[1]][["message"]], "raw file and/or meta file missing" )) +add(data_repo, path = ".") +if (subsecond) Sys.sleep(1) +commit_2 <- commit(data_repo, "test") write_delim_git(x, "junk/test", data_repo) add(data_repo, path = ".") -commit(data_repo, "test") +if (subsecond) Sys.sleep(1) +commit_3 <- commit(data_repo, "test") + rm_data(data_repo, ".", "tsv") stopifnot( all.equal( @@ -164,6 +176,8 @@ stopifnot(all.equal( )[[1]][["message"]], "use only variables of 'x' for sorting" )) +if (subsecond) Sys.sleep(1) +commit_4 <- commit(data_repo, "more") y <- x y$logic <- sample(c(TRUE, FALSE, NA), replace = TRUE, size = nrow(y)) @@ -183,6 +197,9 @@ z <- read_delim_git("logical", data_repo) y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] rownames(y.sorted) <- NULL stopifnot(all.equal(y.sorted, z)) +add(data_repo, path = ".") +if (subsecond) Sys.sleep(1) +commit_5 <- commit(data_repo, "logical") stopifnot(all.equal( 
tools::assertError( @@ -203,9 +220,15 @@ stopifnot(all.equal( )[[1]][["message"]], "old data was stored verbose" )) +add(data_repo, path = ".") +if (subsecond) Sys.sleep(1) +commit_6 <- commit(data_repo, "verbose") + yml <- file.path(path, "verbose.yml") meta <- head(readLines(yml), -1) writeLines(text = meta, con = yml) +add(data_repo, path = ".") +commit_7 <- commit(data_repo, "fast") stopifnot(all.equal( tools::assertError( read_delim_git("verbose", data_repo) @@ -218,7 +241,6 @@ stopifnot(all.equal( )[[1]][["message"]], "error in existing metadata" )) - stopifnot(all.equal( tools::assertError( rm_data(path) @@ -232,3 +254,41 @@ stopifnot(all.equal( "'path' must be a single value" )) +com <- recent_commit(data_repo, "test.tsv") +stopifnot(inherits(com, "data.frame")) +stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) +stopifnot(all.equal(com$commit, commit_2$sha)) + +com <- recent_commit(data_repo, "test", data = TRUE) +stopifnot(inherits(com, "data.frame")) +stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) +stopifnot(all.equal(com$commit, commit_2$sha)) + +com <- recent_commit(data_repo, "junk/test", data = TRUE) +stopifnot(all.equal(com$commit, commit_3$sha)) +com <- recent_commit(data_repo, "junk/test.tsv") +stopifnot(all.equal(com$commit, commit_3$sha)) +com <- recent_commit(data_repo, "junk/test.yml") +stopifnot(all.equal(com$commit, commit_3$sha)) + +stopifnot(all.equal( + tools::assertError( + recent_commit(data_repo, TRUE) + )[[1]][["message"]], + "'path' must be a character vector" +)) +stopifnot(all.equal( + tools::assertError( + recent_commit(data_repo, c("junk", "test")) + )[[1]][["message"]], + "'path' must be a single value" +)) + +stopifnot(all.equal( + tools::assertWarning( + com <- recent_commit(data_repo, "verbose.yml") + )[[1]][["message"]], + "Multiple commits within the same second" +)) +stopifnot(nrow(com) == 2) +stopifnot(all(com$commit %in% c(commit_6$sha, commit_7$sha))) From 
1ddbb149dc1104d39b904ded66f8937451b50753 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 29 Aug 2018 14:02:14 +0200 Subject: [PATCH 50/52] try to debug unit test on recent_commit() Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 1 + man/recent_commit.Rd | 7 +++++-- tests/data_repository.R | 19 +++++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index 75bb677b6..a07e9f7ac 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -429,6 +429,7 @@ recent_commit <- function(repo, path = NULL, data = FALSE) { } name <- basename(path) path <- unique(dirname(path)) +message("path: ", path) if (path == ".") { path <- "" } diff --git a/man/recent_commit.Rd b/man/recent_commit.Rd index 13b44de69..ecafb61da 100644 --- a/man/recent_commit.Rd +++ b/man/recent_commit.Rd @@ -3,7 +3,7 @@ \name{recent_commit} \alias{recent_commit} \title{Recent file change -Retrieve the most commit in which a file or data object was changed.} +Retrieve the most recent commit in which a file or data object was changed.} \usage{ recent_commit(repo, path = NULL, data = FALSE) } @@ -15,7 +15,10 @@ object. Default is '.'} \item{data}{refers path to a file (FALSE) or a data object (TRUE). Defaults to FALSE} } +\value{ +A data.frame with commit, author and timestamp. Will contain multiple rows when multiple commits are made within the same second +} \description{ Recent file change -Retrieve the most commit in which a file or data object was changed. +Retrieve the most recent commit in which a file or data object was changed. 
} diff --git a/tests/data_repository.R b/tests/data_repository.R index a0f0b6826..d325f2701 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -24,7 +24,7 @@ path <- tempfile(pattern = "git2r-") dir.create(path) # currently odb_blobs() can't handle subsecond commits -# when TRUE Sys.sleep(1) is added before each commit +# when TRUE Sys.sleep(1.1) is added before each commit subsecond <- TRUE ## Initialize a repository @@ -60,7 +60,7 @@ stopifnot( write_delim_git(x, "test", data_repo) stopifnot(all.equal(status(data_repo), z)) add(data_repo, path = ".") -if (subsecond) Sys.sleep(1) +if (subsecond) Sys.sleep(1.1) commit_1 <- commit(data_repo, "initial commit") stopifnot(all.equal( @@ -125,12 +125,12 @@ stopifnot(all.equal( "raw file and/or meta file missing" )) add(data_repo, path = ".") -if (subsecond) Sys.sleep(1) +if (subsecond) Sys.sleep(1.1) commit_2 <- commit(data_repo, "test") write_delim_git(x, "junk/test", data_repo) add(data_repo, path = ".") -if (subsecond) Sys.sleep(1) +if (subsecond) Sys.sleep(1.1) commit_3 <- commit(data_repo, "test") rm_data(data_repo, ".", "tsv") @@ -176,7 +176,7 @@ stopifnot(all.equal( )[[1]][["message"]], "use only variables of 'x' for sorting" )) -if (subsecond) Sys.sleep(1) +if (subsecond) Sys.sleep(1.1) commit_4 <- commit(data_repo, "more") y <- x @@ -198,7 +198,7 @@ y.sorted <- y[do.call(order, y[c("y", "logic")]), colnames(z)] rownames(y.sorted) <- NULL stopifnot(all.equal(y.sorted, z)) add(data_repo, path = ".") -if (subsecond) Sys.sleep(1) +if (subsecond) Sys.sleep(1.1) commit_5 <- commit(data_repo, "logical") stopifnot(all.equal( @@ -221,7 +221,7 @@ stopifnot(all.equal( "old data was stored verbose" )) add(data_repo, path = ".") -if (subsecond) Sys.sleep(1) +if (subsecond) Sys.sleep(1.1) commit_6 <- commit(data_repo, "verbose") yml <- file.path(path, "verbose.yml") @@ -259,10 +259,13 @@ stopifnot(inherits(com, "data.frame")) stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) 
stopifnot(all.equal(com$commit, commit_2$sha)) +bl <- odb_blobs(data_repo) +bl[grepl("test", bl$name) & bl$path == "", -1] +commit_2$sha com <- recent_commit(data_repo, "test", data = TRUE) +stopifnot(all.equal(com$commit, commit_2$sha)) stopifnot(inherits(com, "data.frame")) stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) -stopifnot(all.equal(com$commit, commit_2$sha)) com <- recent_commit(data_repo, "junk/test", data = TRUE) stopifnot(all.equal(com$commit, commit_3$sha)) From b42717502786bb699b75bff6f90d85cdd4499b1b Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 29 Aug 2018 14:47:54 +0200 Subject: [PATCH 51/52] use forward slashes when normalising path Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 3 ++- tests/data_repository.R | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index a07e9f7ac..f21a02f1b 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -355,7 +355,7 @@ clean_data_path <- function(path) { if (any(not_root)) { path[not_root] <- file.path(dir_name[not_root], path[not_root]) } - path <- normalizePath(unique(path), mustWork = FALSE) + path <- normalizePath(unique(path), winslash = "/", mustWork = FALSE) c(raw_file = paste0(path, ".tsv"), meta_file = paste0(path, ".yml")) } @@ -426,6 +426,7 @@ recent_commit <- function(repo, path = NULL, data = FALSE) { stop("'path' must be a single value") if (data) { path <- clean_data_path(path) +message("path: ", path) } name <- basename(path) path <- unique(dirname(path)) diff --git a/tests/data_repository.R b/tests/data_repository.R index d325f2701..523d1e679 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -260,7 +260,7 @@ stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) stopifnot(all.equal(com$commit, commit_2$sha)) bl <- odb_blobs(data_repo) -bl[grepl("test", bl$name) & bl$path == "", -1] +bl[grepl("test", bl$name) & bl$path == "", c(2:3, 5, 7)] commit_2$sha 
com <- recent_commit(data_repo, "test", data = TRUE) stopifnot(all.equal(com$commit, commit_2$sha)) From b748146b2e7fab6a8c71a1bee0f8f24d034bf055 Mon Sep 17 00:00:00 2001 From: Thierry Onkelinx Date: Wed, 29 Aug 2018 15:16:37 +0200 Subject: [PATCH 52/52] clean_data_path() gains a normalize argument Signed-off-by: Thierry Onkelinx --- R/data_repository.R | 11 ++++++----- tests/data_repository.R | 5 +---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/R/data_repository.R b/R/data_repository.R index f21a02f1b..9b1dc0279 100644 --- a/R/data_repository.R +++ b/R/data_repository.R @@ -346,16 +346,19 @@ meta.Date <- function(x, optimize = TRUE) { ##' Clean the data path ##' Strips any file extension from the path and adds the ".tsv" and ".yml" file extensions ##' @param path the paths +##' @param normalize normalize the path? Defaults to TRUE ##' @return a named vector with "raw_file" and "meta_file", refering to the ".tsv" and ".yml" files ##' @noRd -clean_data_path <- function(path) { +clean_data_path <- function(path, normalize = TRUE) { dir_name <- dirname(path) not_root <- dir_name != "." 
path <- gsub("\\..*$", "", basename(path)) if (any(not_root)) { path[not_root] <- file.path(dir_name[not_root], path[not_root]) } - path <- normalizePath(unique(path), winslash = "/", mustWork = FALSE) + if (isTRUE(normalize)) { + path <- normalizePath(unique(path), winslash = "/", mustWork = FALSE) + } c(raw_file = paste0(path, ".tsv"), meta_file = paste0(path, ".yml")) } @@ -425,12 +428,10 @@ recent_commit <- function(repo, path = NULL, data = FALSE) { if (length(path) != 1) stop("'path' must be a single value") if (data) { - path <- clean_data_path(path) -message("path: ", path) + path <- clean_data_path(path, normalize = FALSE) } name <- basename(path) path <- unique(dirname(path)) -message("path: ", path) if (path == ".") { path <- "" } diff --git a/tests/data_repository.R b/tests/data_repository.R index 523d1e679..48d76bf5f 100644 --- a/tests/data_repository.R +++ b/tests/data_repository.R @@ -259,13 +259,10 @@ stopifnot(inherits(com, "data.frame")) stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) stopifnot(all.equal(com$commit, commit_2$sha)) -bl <- odb_blobs(data_repo) -bl[grepl("test", bl$name) & bl$path == "", c(2:3, 5, 7)] -commit_2$sha com <- recent_commit(data_repo, "test", data = TRUE) -stopifnot(all.equal(com$commit, commit_2$sha)) stopifnot(inherits(com, "data.frame")) stopifnot(all.equal(colnames(com), c("commit", "author", "when"))) +stopifnot(all.equal(com$commit, commit_2$sha)) com <- recent_commit(data_repo, "junk/test", data = TRUE) stopifnot(all.equal(com$commit, commit_3$sha))