targets and grid objects

Objective

targets is a powerful workflow management tool for reproducibility. chopin's grid partitioning parallelizes repeated tasks across unit grids by applying patterns. This vignette demonstrates how to use targets and chopin together.

Installation

Although targets is not listed in the DESCRIPTION file, the targets package must be installed to run the code in this vignette.

# Verify that the targets package is installed before running the rest of
# this vignette.
rlang::check_installed("targets")

Example

par_pad_grid() and par_pad_balanced() have an argument return_wkt that returns the grid partition as well-known text (WKT) character strings. Plain character vectors can be exported to parallel workers regardless of the parallel backend (e.g., future::multisession or mirai::daemons); such backends cannot pass objects that hold external pointers (externalptr) to C++ data. Using WKT character objects, we can easily convert them to sf or terra objects inside a function running on a parallel worker and use them in the targets workflow with the standard branching/pattern interface such as map(), cross(), and others.

The example below will generate a grid partition of the North Carolina state and demonstrate how to use the grid partition in the targets workflow.

Random points in NC

  • For demonstration of par_pad_grid(), we use moderately clustered point locations generated inside the counties of North Carolina.
library(chopin)
library(sf)
library(spatstat.random)

# Turn off s2 in sf for this session.
sf::sf_use_s2(FALSE)
# Fix the RNG seed so the sampled points are reproducible.
set.seed(202404)
# North Carolina polygons shipped with sf, transformed to EPSG:5070.
ncpoly <- system.file("shape/nc.shp", package = "sf")
ncsf <- sf::read_sf(ncpoly)
ncsf <- sf::st_transform(ncsf, "EPSG:5070")
plot(sf::st_geometry(ncsf))

# Sample clustered points inside the polygons; type = "Thomas" together with
# mu, scale and kappa selects the point process used by st_sample().
ncpoints <-
  sf::st_sample(
    x = ncsf,
    type = "Thomas",
    mu = 20,
    scale = 1e4,
    kappa = 1.25e-9
  )
ncpoints <- sf::st_as_sf(ncpoints)
ncpoints <- sf::st_set_crs(ncpoints, "EPSG:5070")
# Unique point identifier used later by calc_something().
# seq_len() stays empty for zero rows, whereas seq(1, 0) would yield c(1, 0).
ncpoints$pid <- sprintf("PID-%05d", seq_len(nrow(ncpoints)))
plot(sf::st_geometry(ncpoints))

Grid partition of NC

# Partition the extent of ncpoints into nx = 6 by ny = 3 cells.
# return_wkt = FALSE keeps the cells as sf objects; the printed results below
# show an `original` set and a `padded` set enlarged by the padding value.
ncgrid_sf <-
  par_pad_grid(
    input = ncpoints,
    mode = "grid",
    nx = 6L,
    ny = 3L,
    padding = 1e4L,
    return_wkt = FALSE
  )

ncgrid_sf$original
## Simple feature collection with 18 features and 1 field
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 1057207 ymin: 1355827 xmax: 1830521 ymax: 1676488
## Projected CRS: NAD83 / Conus Albers
## First 10 features:
##                          geometry CGRIDID
## 1  POLYGON ((1057207 1355827, ...       1
## 2  POLYGON ((1186092 1355827, ...       2
## 3  POLYGON ((1314978 1355827, ...       3
## 4  POLYGON ((1443864 1355827, ...       4
## 5  POLYGON ((1572750 1355827, ...       5
## 6  POLYGON ((1701635 1355827, ...       6
## 7  POLYGON ((1057207 1462714, ...       7
## 8  POLYGON ((1186092 1462714, ...       8
## 9  POLYGON ((1314978 1462714, ...       9
## 10 POLYGON ((1443864 1462714, ...      10
ncgrid_sf$padded
## Simple feature collection with 18 features and 1 field
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: 1047207 ymin: 1345827 xmax: 1840521 ymax: 1686488
## Projected CRS: NAD83 / Conus Albers
## First 10 features:
##    CGRIDID                       geometry
## 1        1 POLYGON ((1047207 1345827, ...
## 2        2 POLYGON ((1176092 1345827, ...
## 3        3 POLYGON ((1304978 1345827, ...
## 4        4 POLYGON ((1433864 1345827, ...
## 5        5 POLYGON ((1562750 1345827, ...
## 6        6 POLYGON ((1691635 1345827, ...
## 7        7 POLYGON ((1047207 1452714, ...
## 8        8 POLYGON ((1176092 1452714, ...
## 9        9 POLYGON ((1304978 1452714, ...
## 10      10 POLYGON ((1433864 1452714, ...

Since sf objects are exportable to the parallel workers, we can also consider these as a part of the targets workflow.

# Same partition as ncgrid_sf, but return_wkt = TRUE returns the `original`
# and `padded` cells as WKT character strings (see the printed results below).
ncgrid_wkt <-
  par_pad_grid(
    input = ncpoints,
    mode = "grid",
    nx = 6L,
    ny = 3L,
    padding = 1e4L,
    return_wkt = TRUE
  )

ncgrid_wkt$original
##  [1] "POLYGON ((1057207 1355827, 1186092 1355827, 1186092 1462714, 1057207 1462714, 1057207 1355827))"
##  [2] "POLYGON ((1186092 1355827, 1314978 1355827, 1314978 1462714, 1186092 1462714, 1186092 1355827))"
##  [3] "POLYGON ((1314978 1355827, 1443864 1355827, 1443864 1462714, 1314978 1462714, 1314978 1355827))"
##  [4] "POLYGON ((1443864 1355827, 1572750 1355827, 1572750 1462714, 1443864 1462714, 1443864 1355827))"
##  [5] "POLYGON ((1572750 1355827, 1701635 1355827, 1701635 1462714, 1572750 1462714, 1572750 1355827))"
##  [6] "POLYGON ((1701635 1355827, 1830521 1355827, 1830521 1462714, 1701635 1462714, 1701635 1355827))"
##  [7] "POLYGON ((1057207 1462714, 1186092 1462714, 1186092 1569601, 1057207 1569601, 1057207 1462714))"
##  [8] "POLYGON ((1186092 1462714, 1314978 1462714, 1314978 1569601, 1186092 1569601, 1186092 1462714))"
##  [9] "POLYGON ((1314978 1462714, 1443864 1462714, 1443864 1569601, 1314978 1569601, 1314978 1462714))"
## [10] "POLYGON ((1443864 1462714, 1572750 1462714, 1572750 1569601, 1443864 1569601, 1443864 1462714))"
## [11] "POLYGON ((1572750 1462714, 1701635 1462714, 1701635 1569601, 1572750 1569601, 1572750 1462714))"
## [12] "POLYGON ((1701635 1462714, 1830521 1462714, 1830521 1569601, 1701635 1569601, 1701635 1462714))"
## [13] "POLYGON ((1057207 1569601, 1186092 1569601, 1186092 1676488, 1057207 1676488, 1057207 1569601))"
## [14] "POLYGON ((1186092 1569601, 1314978 1569601, 1314978 1676488, 1186092 1676488, 1186092 1569601))"
## [15] "POLYGON ((1314978 1569601, 1443864 1569601, 1443864 1676488, 1314978 1676488, 1314978 1569601))"
## [16] "POLYGON ((1443864 1569601, 1572750 1569601, 1572750 1676488, 1443864 1676488, 1443864 1569601))"
## [17] "POLYGON ((1572750 1569601, 1701635 1569601, 1701635 1676488, 1572750 1676488, 1572750 1569601))"
## [18] "POLYGON ((1701635 1569601, 1830521 1569601, 1830521 1676488, 1701635 1676488, 1701635 1569601))"
ncgrid_wkt$padded
##  [1] "POLYGON ((1047207 1345827, 1047207 1472714, 1196092 1472714, 1196092 1345827, 1047207 1345827))"
##  [2] "POLYGON ((1176092 1345827, 1176092 1472714, 1324978 1472714, 1324978 1345827, 1176092 1345827))"
##  [3] "POLYGON ((1304978 1345827, 1304978 1472714, 1453864 1472714, 1453864 1345827, 1304978 1345827))"
##  [4] "POLYGON ((1433864 1345827, 1433864 1472714, 1582750 1472714, 1582750 1345827, 1433864 1345827))"
##  [5] "POLYGON ((1562750 1345827, 1562750 1472714, 1711635 1472714, 1711635 1345827, 1562750 1345827))"
##  [6] "POLYGON ((1691635 1345827, 1691635 1472714, 1840521 1472714, 1840521 1345827, 1691635 1345827))"
##  [7] "POLYGON ((1047207 1452714, 1047207 1579601, 1196092 1579601, 1196092 1452714, 1047207 1452714))"
##  [8] "POLYGON ((1176092 1452714, 1176092 1579601, 1324978 1579601, 1324978 1452714, 1176092 1452714))"
##  [9] "POLYGON ((1304978 1452714, 1304978 1579601, 1453864 1579601, 1453864 1452714, 1304978 1452714))"
## [10] "POLYGON ((1433864 1452714, 1433864 1579601, 1582750 1579601, 1582750 1452714, 1433864 1452714))"
## [11] "POLYGON ((1562750 1452714, 1562750 1579601, 1711635 1579601, 1711635 1452714, 1562750 1452714))"
## [12] "POLYGON ((1691635 1452714, 1691635 1579601, 1840521 1579601, 1840521 1452714, 1691635 1452714))"
## [13] "POLYGON ((1047207 1559601, 1047207 1686488, 1196092 1686488, 1196092 1559601, 1047207 1559601))"
## [14] "POLYGON ((1176092 1559601, 1176092 1686488, 1324978 1686488, 1324978 1559601, 1176092 1559601))"
## [15] "POLYGON ((1304978 1559601, 1304978 1686488, 1453864 1686488, 1453864 1559601, 1304978 1559601))"
## [16] "POLYGON ((1433864 1559601, 1433864 1686488, 1582750 1686488, 1582750 1559601, 1433864 1559601))"
## [17] "POLYGON ((1562750 1559601, 1562750 1686488, 1711635 1686488, 1711635 1559601, 1562750 1559601))"
## [18] "POLYGON ((1691635 1559601, 1691635 1686488, 1840521 1686488, 1840521 1559601, 1691635 1559601))"

Targets workflow

Assume that we design a function calc_something() that calculates something from the grid partition. We can use the grid partition as an input to the function. In an sf-centered workflow, we can use sf functions to interact with the exported grid partition objects. Let’s consider a binary spatial operation where x and y are involved: x is a dataset at which the variable is calculated, whereas y is a raster file path from which we extract the values. Please note that SpatRaster objects cannot be exported to parallel workers as they are, so we read the raster inside each parallel worker. To branch out across the grid partition, the function for the unit grid should subset x to narrow the calculation scope down to each grid. Therefore, a synopsis of the function should look like this:

# Outline of a per-grid worker function (no implementation yet).
#   x         dataset at which the variable is calculated
#   y         raster file path from which values are extracted
#   unit_grid one unpadded grid cell (sf object or WKT string)
#   pad_grid  the matching padded grid cell (sf object or WKT string)
#   ...       additional arguments for the actual calculation
# The numbered comments list the steps a concrete implementation follows.
calc_something <- function(x, y, unit_grid, pad_grid, ...) {
  # 0. restore unit_grid and pad_grid to sf objects if they are in WKT format
  # 1-1. make x subset using intersect logic between x and unit_grid
  # 1-2. read y subset using intersect logic between y and pad_grid
  # 2. make buffer of x
  # 3. do actual calculation (use ... wisely to pass additional arguments)
  # 4. return the result
}

Passing map(unit_grid, pad_grid) to the pattern argument of tar_target() will do it for you.

# Per-grid worker for grid cells supplied as sf objects.
#   x         sf object holding the features to summarize; needs a `pid` column
#   y         path to the raster file
#   unit_grid unpadded grid cell used to select features of x
#   pad_grid  padded grid cell used to window the raster
#   ...       accepted but not used in this example
# Returns the data frame produced by exactextractr::exact_extract().
calc_something <- function(x, y, unit_grid, pad_grid, ...) {
  # Select the features of x that fall in the unpadded cell.
  pts_in_cell <- x[unit_grid, ]

  # Open the raster restricted to the bounding box of the padded cell.
  pad_bbox <- sf::st_bbox(pad_grid)
  ras_window <- terra::rast(y, win = terra::ext(pad_bbox))

  # Buffer the selected features by 10 km.
  buffer_dist <- units::set_units(10, "km")
  pts_buffered <- sf::st_buffer(pts_in_cell, buffer_dist)

  # Mean raster value per buffer, keeping pid as the identifier column.
  # The extraction result is the function's return value.
  exactextractr::exact_extract(
    ras_window,
    pts_buffered,
    fun = "mean",
    append_cols = "pid",
    force_df = TRUE,
    progress = FALSE
  )
}

An sf object inherits from the data.frame class. To align this object with targets branching, it is clearer to convert it into a list so that targets can pattern across the grid partition. par_split_list() in chopin does this for you.

# Convert the sf-based grid partition into a list that targets can pattern
# across.
ncgrid_sflist <-
  par_split_list(ncgrid_sf)

When the WKT format is used, the function must first restore the grid partition to sf geometries. It should be modified as follows:

# Per-grid worker that also accepts grid cells given as WKT strings.
#   x         sf object holding the features to summarize; needs a `pid`
#             column and must be in the CRS the grid was generated in,
#             because WKT text carries no CRS of its own
#   y         path to the raster file
#   unit_grid unpadded grid cell (WKT string or sf geometry)
#   pad_grid  padded grid cell (WKT string or sf geometry)
#   ...       accepted but not used in this example
# Returns the data frame produced by exactextractr::exact_extract().
calc_something <- function(x, y, unit_grid, pad_grid, ...) {
  # 0. restore unit_grid and pad_grid to sf geometries if they are in WKT
  #    format. The CRS is taken from x so that the spatial subset below
  #    compares geometries in the same CRS.
  grid_crs <- sf::st_crs(x)
  if (is.character(unit_grid)) {
    unit_grid <- sf::st_as_sfc(unit_grid, crs = grid_crs)
  }
  if (is.character(pad_grid)) {
    pad_grid <- sf::st_as_sfc(pad_grid, crs = grid_crs)
  }
  # 1-1. make x subset using intersect logic between x and unit_grid
  x <- x[unit_grid, ]
  # 1-2. read y subset using intersect logic between y and pad_grid
  yext <- terra::ext(sf::st_bbox(pad_grid))
  yras <- terra::rast(y, win = yext)
  # 2. make buffer of x
  xbuffer <- sf::st_buffer(x, units::set_units(10, "km"))
  # 3. do actual calculation (use ... wisely to pass additional arguments)
  xycalc <- exactextractr::exact_extract(
    yras,
    xbuffer,
    fun = "mean",
    force_df = TRUE,
    append_cols = "pid", # assume that pid is a unique identifier
    progress = FALSE
  )
  # 4. return the result
  xycalc
}
# Convert the WKT-based grid partition into a list that targets can pattern
# across.
ncgrid_wktlist <-
  par_split_list(ncgrid_wkt)

tar_target can use this list object with our function calc_something to branch out. A workable example of tar_target with a proper _targets.R file is as follows:

# Pipeline definition for _targets.R.
# Assumes library(targets) has been called, chopin is available to the
# pipeline, and calc_something() is defined or sourced earlier in the file.
list(
  # Point locations to process.
  tar_target(
    name = points,
    command = sf::st_read("path_to_points.format")
  ),
  # Raster declared as a file target; the command is the file path.
  tar_target(
    name = raster,
    command = "path_to_raster.format",
    format = "file"
  ),
  # Grid partition with `original` and `padded` sf objects.
  # `points` is supplied once, by name; passing it positionally as well would
  # bind the extra copy to the next formal argument.
  tar_target(
    name = chopingrid,
    command = par_pad_grid(
      input = points,
      nx = 6L,
      ny = 3L,
      padding = 1e4L,
      return_wkt = FALSE
    )
  ),
  # One element per grid cell: list(original cell, padded cell).
  # Iterate over row indices only; chopingrid itself must not be iterated,
  # otherwise each call would receive a single element of it.
  tar_target(
    name = chopingrid_split,
    command = lapply(
      seq_len(nrow(chopingrid$original)),
      function(row) {
        list(chopingrid$original[row, ], chopingrid$padded[row, ])
      }
    ),
    iteration = "list"
  ),
  # Branch over the cells; inside a branch chopingrid_split is one
  # list(original cell, padded cell).
  tar_target(
    name = result,
    command =
      calc_something(
        points, raster,
        chopingrid_split[[1]], chopingrid_split[[2]]
      ),
    pattern = map(chopingrid_split),
    iteration = "list"
  )
)

The target result will be a list of data.frames that contain the calculation results.