Title: | Construct Reproducible Analytic Data Sets as R Packages |
---|---|
Description: | A framework to help construct R data packages in a reproducible manner. Potentially time consuming processing of raw data sets into analysis ready data sets is done in a reproducible manner and decoupled from the usual 'R CMD build' process so that data sets can be processed into R objects in the data package and the data package can then be shared, built, and installed by others without the need to repeat computationally costly data processing. The package maintains data provenance by turning the data processing scripts into package vignettes, as well as enforcing documentation and version checking of included data objects. Data packages can be version controlled on 'GitHub', and used to share data for manuscripts, collaboration and reproducible research. |
Authors: | Greg Finak [aut, cph] (Original author and creator of DataPackageR), Paul Obrecht [ctb], Ellis Hughes [ctb] , Jimmy Fulp [ctb], Marie Vendettuoli [ctb] , Dave Slager [ctb, cre] , Jason Taylor [ctb], Kara Woo [rev] (Kara reviewed the package for rOpenSci, see <https://github.com/ropensci/onboarding/issues/230>), William Landau [rev] (William reviewed the package for rOpenSci, see <https://github.com/ropensci/onboarding/issues/230>) |
Maintainer: | Dave Slager <[email protected]> |
License: | MIT + file LICENSE |
Version: | 0.16.1 |
Built: | 2025-01-15 06:12:14 UTC |
Source: | https://github.com/ropensci/DataPackageR |
Assert that a data version in a data package matches an expectation.
assert_data_version( data_package_name = NULL, version_string = NULL, acceptable = "equal", ... )
assert_data_version( data_package_name = NULL, version_string = NULL, acceptable = "equal", ... )
data_package_name |
|
version_string |
|
acceptable |
|
... |
additional arguments passed to data_version (such as lib.loc) |
Tests the DataVersion string in data_package_name against version_string, testing the major, minor and revision portions.
Tests "data_package_name version equal version_string" or "data_package_name version equal_or_greater version_string".
invisible logical
TRUE if success, otherwise stop on mismatch.
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f, "foo.Rmd") con <- file(f) writeLines("```{r}\n vec = 1:10 \n```\n",con = con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name = pname, path=tempdir(), force = TRUE, r_object_names = "vec", code_files = f) package_build(file.path(tempdir(),pname), install = FALSE) pkgload::load_all(file.path(tempdir(),pname)) assert_data_version(data_package_name = pname,version_string = "0.1.0",acceptable = "equal") }
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f, "foo.Rmd") con <- file(f) writeLines("```{r}\n vec = 1:10 \n```\n",con = con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name = pname, path=tempdir(), force = TRUE, r_object_names = "vec", code_files = f) package_build(file.path(tempdir(),pname), install = FALSE) pkgload::load_all(file.path(tempdir(),pname)) assert_data_version(data_package_name = pname,version_string = "0.1.0",acceptable = "equal") }
Constructs a datapackager.yml configuration object from a vector of file names and a vector of object names (all quoted).
Can be written to disk via yml_write.
render_root is set to a randomly generated named subdirectory of tempdir().
construct_yml_config(code = NULL, data = NULL, render_root = NULL)
construct_yml_config(code = NULL, data = NULL, render_root = NULL)
code |
A vector of filenames |
data |
A vector of quoted object names |
render_root |
The root directory where the package data processing code will be rendered.
Defaults to a randomly generated named subdirectory of tempdir(). |
a datapackager.yml configuration represented as an R object
conf <- construct_yml_config(code = c('file1.rmd','file2.rmd'), data=c('object1','object2')) tmp <- normalizePath(tempdir(), winslash = "/") yml_write(conf,path=tmp)
conf <- construct_yml_config(code = c('file1.rmd','file2.rmd'), data=c('object1','object2')) tmp <- normalizePath(tempdir(), winslash = "/") yml_write(conf,path=tmp)
Retrieves the DataVersion of a package if available
data_version(pkg, lib.loc = NULL)
data_version(pkg, lib.loc = NULL)
pkg |
|
lib.loc |
|
Object of class 'package_version' and 'numeric_version' specifying the DataVersion of the package
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n vec = 1:10 \n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name = pname, path=tempdir(), force = TRUE, r_object_names = "vec", code_files = f) package_build(file.path(tempdir(),pname), install = FALSE) pkgload::load_all(file.path(tempdir(),pname)) data_version(pname) }
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n vec = 1:10 \n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name = pname, path=tempdir(), force = TRUE, r_object_names = "vec", code_files = f) package_build(file.path(tempdir(),pname), install = FALSE) pkgload::load_all(file.path(tempdir(),pname)) data_version(pname) }
Creates a package skeleton directory structure for use with DataPackageR. Adds the DataVersion string to DESCRIPTION, creates the DATADIGEST file, and the data-raw directory. Updates the Read-and-delete-me file to reflect the additional necessary steps.
datapackage_skeleton( name = NULL, path = ".", force = FALSE, code_files = character(), r_object_names = character(), raw_data_dir = character(), dependencies = character() )
datapackage_skeleton( name = NULL, path = ".", force = FALSE, code_files = character(), r_object_names = character(), raw_data_dir = character(), dependencies = character() )
name |
|
path |
A |
force |
|
code_files |
Optional |
r_object_names |
|
raw_data_dir |
|
dependencies |
|
No return value, called for side effects
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n tbl = data.frame(1:10) \n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name = pname, path = tempdir(), force = TRUE, r_object_names = "tbl", code_files = f) }
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n tbl = data.frame(1:10) \n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name = pname, path = tempdir(), force = TRUE, r_object_names = "tbl", code_files = f) }
Read an object created in a previously run processing script.
datapackager_object_read(name)
datapackager_object_read(name)
name |
|
This function is only accessible within an R or Rmd file processed by DataPackageR.
It searches for an environment named ENVS within the current environment, that holds the object with the given name. Such an environment is constructed and populated with objects specified in the yaml objects property and passed along to subsequent R and Rmd files as DataPackageR processes them in order.
An R object.
if(rmarkdown::pandoc_available()){ ENVS <- new.env() # ENVS would be in the environment # where the data processing is run. It is # handled automatically by the package. assign("find_me", 100, ENVS) #This is done automatically by DataPackageR find_me <- datapackager_object_read("find_me") # This would appear in an Rmd processed by # DataPackageR to access the object named "find_me" created # by a previous script. "find_me" would also need to # appear in the objects property of datapackager.yml }
if(rmarkdown::pandoc_available()){ ENVS <- new.env() # ENVS would be in the environment # where the data processing is run. It is # handled automatically by the package. assign("find_me", 100, ENVS) #This is done automatically by DataPackageR find_me <- datapackager_object_read("find_me") # This would appear in an Rmd processed by # DataPackageR to access the object named "find_me" created # by a previous script. "find_me" would also need to # appear in the objects property of datapackager.yml }
User-configurable options consulted by DataPackageR, which provide a mechanism for setting default behaviors for various functions.
If the built-in defaults don't suit you, set one or more of these options.
Typically, this is done in the .Rprofile startup file, which you can open for editing with usethis::edit_r_profile() - this will set the specified options for all future R sessions.
The following setting is recommended to avoid being prompted for a NEWS update on each package build:
options(DataPackageR_interact = FALSE)
- DataPackageR_interact: Upon package load, this defaults to the value of interactive(), unless the option has been previously set (e.g., in .Rprofile). TRUE prompts the user interactively for a NEWS update on package_build(). See the example above and the rOpenSci blog post for more details on how to set this to FALSE, which will never prompt the user for a NEWS update. FALSE is also the setting used for DataPackageR internal package tests.
- DataPackageR_verbose: Default upon package load is TRUE. FALSE suppresses all console output and is currently only used for automated unit tests of the DataPackageR package.
- DataPackageR_packagebuilding: Default upon package load is FALSE. This option is used internally for package operations and changing it is not recommended.
These functions are defunct and no longer supported. Calling them will result in an error.
When possible, alternatives are suggested.
datapackage.skeleton(...) dataVersion(...) keepDataObjects(...)
datapackage.skeleton(...) dataVersion(...) keepDataObjects(...)
... |
All arguments are now ignored. |
Defunct function. No return value.
Build documentation for a data package using DataPackageR.
document(path = ".", install = FALSE, ...)
document(path = ".", install = FALSE, ...)
path |
|
install |
|
... |
additional arguments to |
Called for side effects. Returns TRUE on successful exit.
# A simple Rmd file that creates one data object # named "tbl". if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n tbl = data.frame(1:10) \n```\n",con=con) close(con) # construct a data package skeleton named "MyDataPackage" and pass # in the Rmd file name with full path, and the name of the object(s) it # creates. pname <- basename(tempfile()) datapackage_skeleton(name=pname, path=tempdir(), force = TRUE, r_object_names = "tbl", code_files = f) # call package_build to run the "foo.Rmd" processing and # build a data package. package_build(file.path(tempdir(), pname), install = FALSE) document(path = file.path(tempdir(), pname), install = FALSE) }
# A simple Rmd file that creates one data object # named "tbl". if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n tbl = data.frame(1:10) \n```\n",con=con) close(con) # construct a data package skeleton named "MyDataPackage" and pass # in the Rmd file name with full path, and the name of the object(s) it # creates. pname <- basename(tempfile()) datapackage_skeleton(name=pname, path=tempdir(), force = TRUE, r_object_names = "tbl", code_files = f) # call package_build to run the "foo.Rmd" processing and # build a data package. package_build(file.path(tempdir(), pname), install = FALSE) document(path = file.path(tempdir(), pname), install = FALSE) }
Combines the preprocessing, documentation, and build steps into one.
package_build( packageName = NULL, vignettes = FALSE, log = INFO, deps = TRUE, install = FALSE, ... )
package_build( packageName = NULL, vignettes = FALSE, log = INFO, deps = TRUE, install = FALSE, ... )
packageName |
|
vignettes |
|
log |
log level |
deps |
|
install |
|
... |
additional arguments passed to |
Note that if package_build returns an error when rendering an .Rmd internally, but that same .Rmd can be run successfully manually using rmarkdown::render, then the following code facilitates debugging.
Set options(error = function(){ sink(); recover()}) before running package_build.
This will enable examination of the active function calls at the time of the error, with output printed to the console rather than knitr's default sink.
After debugging, evaluate options(error = NULL) to revert to default error handling.
See section "22.5.3 RMarkdown" at https://adv-r.hadley.nz/debugging.html for more details.
Character vector. File path of the built package.
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n tbl = data.frame(1:10) \n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name=pname, path=tempdir(), force = TRUE, r_object_names = "tbl", code_files = f) package_build(file.path(tempdir(),pname), install = FALSE) }
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n tbl = data.frame(1:10) \n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name=pname, path=tempdir(), force = TRUE, r_object_names = "tbl", code_files = f) package_build(file.path(tempdir(),pname), install = FALSE) }
Get DataPackageR data path
project_data_path(file = NULL)
project_data_path(file = NULL)
file |
|
Returns the path to the data package data subdirectory, or constructs a path to a file in the data subdirectory from the file argument.
character
if(rmarkdown::pandoc_available()){ project_data_path( file = "data.rda" ) }
if(rmarkdown::pandoc_available()){ project_data_path( file = "data.rda" ) }
Get DataPackageR extdata path
project_extdata_path(file = NULL)
project_extdata_path(file = NULL)
file |
|
Returns the path to the data package extdata subdirectory, or constructs a path to a file in the extdata subdirectory from the file argument.
character
if(rmarkdown::pandoc_available()){ project_extdata_path(file = "mydata.csv") }
if(rmarkdown::pandoc_available()){ project_extdata_path(file = "mydata.csv") }
Get DataPackageR Project Root Path
project_path(file = NULL)
project_path(file = NULL)
file |
|
Returns the path to the data package project root, or constructs a path to a file in the project root from the file argument.
character
if(rmarkdown::pandoc_available()){ project_path( file = "DESCRIPTION" ) }
if(rmarkdown::pandoc_available()){ project_path( file = "DESCRIPTION" ) }
The data object will be added to the yml configuration file.
use_data_object(object_name = NULL)
use_data_object(object_name = NULL)
object_name |
Name of the data object. Should be created by a processing script in data-raw. |
invisibly returns TRUE for success.
if(rmarkdown::pandoc_available()){ myfile <- tempfile() file <- system.file("extdata", "tests", "extra.Rmd", package = "DataPackageR") datapackage_skeleton( name = "datatest", path = tempdir(), code_files = file, force = TRUE, r_object_names = "data") use_data_object(object_name = "newobject") }
if(rmarkdown::pandoc_available()){ myfile <- tempfile() file <- system.file("extdata", "tests", "extra.Rmd", package = "DataPackageR") datapackage_skeleton( name = "datatest", path = tempdir(), code_files = file, force = TRUE, r_object_names = "data") use_data_object(object_name = "newobject") }
Ignore specific files by git and R build.
use_ignore(file = NULL, path = NULL)
use_ignore(file = NULL, path = NULL)
file |
|
path |
|
invisibly returns 0.
datapackage_skeleton(name="test",path = tempdir()) use_ignore("foo", ".")
datapackage_skeleton(name="test",path = tempdir()) use_ignore("foo", ".")
The Rmd or R file or directory specified by file will be moved into the data-raw directory. It will also be added to the yml configuration file.
Any existing file by that name will be overwritten when overwrite is set to TRUE.
use_processing_script( file = NULL, title = NULL, author = NULL, overwrite = FALSE )
use_processing_script( file = NULL, title = NULL, author = NULL, overwrite = FALSE )
file |
|
title |
|
author |
|
overwrite |
|
invisibly returns TRUE for success. Stops on failure.
if(rmarkdown::pandoc_available()){ myfile <- tempfile() file <- system.file("extdata", "tests", "extra.Rmd", package = "DataPackageR") datapackage_skeleton( name = "datatest", path = tempdir(), code_files = file, force = TRUE, r_object_names = "data") use_processing_script(file = "newScript.Rmd", title = "Processing a new dataset", author = "Y.N. Here.") }
if(rmarkdown::pandoc_available()){ myfile <- tempfile() file <- system.file("extdata", "tests", "extra.Rmd", package = "DataPackageR") datapackage_skeleton( name = "datatest", path = tempdir(), code_files = file, force = TRUE, r_object_names = "data") use_processing_script(file = "newScript.Rmd", title = "Processing a new dataset", author = "Y.N. Here.") }
The file or directory specified by path will be moved into the inst/extdata directory.
use_raw_dataset(path = NULL, ignore = FALSE)
use_raw_dataset(path = NULL, ignore = FALSE)
path |
|
ignore |
|
invisibly returns TRUE for success. Stops on failure.
if(rmarkdown::pandoc_available()){ myfile <- tempfile() file <- system.file("extdata", "tests", "extra.Rmd", package = "DataPackageR") raw_data <- system.file("extdata", "tests", "raw_data", package = "DataPackageR") datapackage_skeleton( name = "datatest", path = tempdir(), code_files = file, force = TRUE, r_object_names = "data") use_raw_dataset(raw_data) }
if(rmarkdown::pandoc_available()){ myfile <- tempfile() file <- system.file("extdata", "tests", "extra.Rmd", package = "DataPackageR") raw_data <- system.file("extdata", "tests", "raw_data", package = "DataPackageR") datapackage_skeleton( name = "datatest", path = tempdir(), code_files = file, force = TRUE, r_object_names = "data") use_raw_dataset(raw_data) }
Edit a yaml configuration file via an API.
yml_find(path) yml_add_files(config, filenames) yml_disable_compile(config, filenames) yml_enable_compile(config, filenames) yml_add_objects(config, objects) yml_list_objects(config) yml_list_files(config) yml_remove_objects(config, objects) yml_remove_files(config, filenames) yml_write(config, path = NULL)
yml_find(path) yml_add_files(config, filenames) yml_disable_compile(config, filenames) yml_enable_compile(config, filenames) yml_add_objects(config, objects) yml_list_objects(config) yml_list_files(config) yml_remove_objects(config, objects) yml_remove_files(config, filenames) yml_write(config, path = NULL)
path |
Path to the data package source or path to write config file (for yml_write). |
config |
an R representation of the datapackager.yml config, returned by yml_find, or a path to the package root. |
filenames |
A vector of filenames. |
objects |
A vector of R object names. |
Add, remove files and objects, enable or disable parsing of specific files, list objects or files in a yaml config, or write a config back to a package.
A yaml configuration structured as an R nested list.
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n vec = 1:10\n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name=pname, path = tempdir(), force = TRUE, r_object_names = "vec", code_files = f) yml <- yml_find(file.path(tempdir(),pname)) yml <- yml_add_files(yml,"foo.Rmd") yml_list_files(yml) yml <- yml_disable_compile(yml,"foo.Rmd") yml <- yml_enable_compile(yml,"foo.Rmd") yml <- yml_add_objects(yml,"data1") yml_list_objects(yml) yml <- yml_remove_objects(yml,"data1") yml <- yml_remove_files(yml,"foo.Rmd") }
if(rmarkdown::pandoc_available()){ f <- tempdir() f <- file.path(f,"foo.Rmd") con <- file(f) writeLines("```{r}\n vec = 1:10\n```\n",con=con) close(con) pname <- basename(tempfile()) datapackage_skeleton(name=pname, path = tempdir(), force = TRUE, r_object_names = "vec", code_files = f) yml <- yml_find(file.path(tempdir(),pname)) yml <- yml_add_files(yml,"foo.Rmd") yml_list_files(yml) yml <- yml_disable_compile(yml,"foo.Rmd") yml <- yml_enable_compile(yml,"foo.Rmd") yml <- yml_add_objects(yml,"data1") yml_list_objects(yml) yml <- yml_remove_objects(yml,"data1") yml <- yml_remove_files(yml,"foo.Rmd") }