cmap {disk.frame}    R Documentation

Apply the same function to all chunks

Description

Apply the same function to all chunks

'cimap.disk.frame' accepts a two-argument function, where the first argument is a data.frame (the chunk) and the second is the chunk ID

'lazy' is a convenience function that applies '.f' to every chunk lazily; it is equivalent to 'cmap(..., lazy = TRUE)'

'delayed' is an alias for 'lazy'; the name is consistent with the naming used in Dask and Dagger.jl

Usage

cmap(.x, .f, ...)

## S3 method for class 'disk.frame'
cmap(
  .x,
  .f,
  ...,
  outdir = NULL,
  keep = NULL,
  chunks = nchunks(.x),
  compress = 50,
  lazy = TRUE,
  overwrite = FALSE,
  vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()),
  .progress = TRUE
)

cmap_dfr(.x, .f, ..., .id = NULL)

## S3 method for class 'disk.frame'
cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)

cimap(.x, .f, ...)

## S3 method for class 'disk.frame'
cimap(
  .x,
  .f,
  outdir = NULL,
  keep = NULL,
  chunks = nchunks(.x),
  compress = 50,
  lazy = TRUE,
  overwrite = FALSE,
  ...
)

cimap_dfr(.x, .f, ..., .id = NULL)

## S3 method for class 'disk.frame'
cimap_dfr(
  .x,
  .f,
  ...,
  .id = NULL,
  use.names = fill,
  fill = FALSE,
  idcol = NULL
)

lazy(.x, .f, ...)

## S3 method for class 'disk.frame'
lazy(.x, .f, ...)

delayed(.x, .f, ...)

chunk_lapply(...)

map(.x, .f, ...)

## S3 method for class 'disk.frame'
map(...)

## Default S3 method:
map(.x, .f, ...)

imap_dfr(.x, .f, ..., .id = NULL)

## S3 method for class 'disk.frame'
imap_dfr(...)

## Default S3 method:
imap_dfr(.x, .f, ..., .id = NULL)

imap(.x, .f, ...)

## Default S3 method:
imap(.x, .f, ...)

## S3 method for class 'disk.frame'
map_dfr(...)

## Default S3 method:
map_dfr(.x, .f, ..., .id = NULL)

Arguments

.x

a disk.frame

.f

a function to apply to each of the chunks

...

for compatibility with 'purrr::map'

outdir

the output directory

keep

the columns to keep from the input

chunks

The number of chunks to output

compress

the fst compression level, from 0 to 100

lazy

if TRUE, '.f' is applied lazily (evaluation is deferred, e.g. until the result is collected); if FALSE, it is applied eagerly

overwrite

if TRUE removes any existing chunks in the data

vars_and_pkgs

variables and packages to send to a background session. These are typically detected automatically.

.progress

A logical indicating whether to print a progress bar for multiprocess, multisession, and multicore plans (as in furrr).

.id

not used

use.names

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

fill

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

idcol

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

Examples

cars.df = as.disk.frame(cars)

# return the first row of each chunk lazily
# (note: chunk[1,] selects the first ROW; chunk[,1] would select the first column)
cars2 = cmap(cars.df, function(chunk) {
  chunk[1,]
})

collect(cars2)

# same as above but using a purrr-style formula
cars2 = cmap(cars.df, ~.x[1,])

collect(cars2)

# return the first row of each chunk eagerly as list
cmap(cars.df, ~.x[1,], lazy = FALSE)

# return the first row of each chunk eagerly as data.table/data.frame by row-binding
cmap_dfr(cars.df, ~.x[1,])

# lazy and delayed are just aliases for cmap(..., lazy = TRUE)
collect(lazy(cars.df, ~.x[1,]))
collect(delayed(cars.df, ~.x[1,]))

# clean up cars.df
delete(cars.df)
cars.df = as.disk.frame(cars)

# .x is the chunk and .y is the ID as an integer

# lazy = TRUE support is not available at the moment
cimap(cars.df, ~.x[, id := .y], lazy = FALSE)

cimap_dfr(cars.df, ~.x[, id := .y])

# clean up cars.df
delete(cars.df)

[Package disk.frame version 0.3.7 Index]