Work with the Palmer Penguins Dataset and TileDB-R

tutorials

arrays

reads

writes

With the TileDB-R API, you can create a TileDB array from a data.frame object by using the fromDataFrame function. This tutorial shows how to do this with the Palmer Penguins dataset.

This tutorial demonstrates how to work with the Palmer Penguins dataset in the TileDB-R API. The tutorial focuses on creating a sparse array, since sparse arrays support string dimensions. To explore how to use fromDataFrame() with dense arrays, visit the fromDataFrame() tutorial.

During this tutorial, you’ll install the palmerpenguins package, which provides the penguins dataset: data about penguin species in the Palmer Archipelago, Antarctica.

First, import the necessary libraries and set the array URI (that is, its path, which in this tutorial will be on local storage).

# Install the palmerpenguins package if it is not already installed.
# requireNamespace() only checks availability; it does not attach the package.
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  install.packages("palmerpenguins")
}

# Import necessary libraries
library(tiledb)
library(palmerpenguins)

# Choose a temporary local path for the array and display it
array_uri <- tempfile(pattern = "palmer_penguins_r")
print(array_uri)

Preview the data in the dataset:

# Print the penguins tibble; only the first 10 of its 344 rows are displayed
print(penguins)

# A tibble: 344 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           NA            NA                  NA          NA
 5 Adelie  Torgersen           36.7          19.3               193        3450
 6 Adelie  Torgersen           39.3          20.6               190        3650
 7 Adelie  Torgersen           38.9          17.8               181        3625
 8 Adelie  Torgersen           39.2          19.6               195        4675
 9 Adelie  Torgersen           34.1          18.1               193        3475
10 Adelie  Torgersen           42            20.2               190        4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>

You can take the penguins dataframe and use TileDB’s fromDataFrame() function to create a TileDB array.

# Build the array schema and write the data with a single call.
# The first two columns, `species` and `island`, become the dimensions.
fromDataFrame(penguins, array_uri, col_index = c("species", "island"))

Review the schema of the array you just created:

# Open the array so that reads return a data.frame, then display its schema
arr <- tiledb_array(uri = array_uri, return_as = "data.frame")
schema(arr)

tiledb_array_schema(
    domain=tiledb_domain(c(
        tiledb_dim(name="species", domain=c(NULL,NULL), tile=NULL, type="ASCII", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_dim(name="island", domain=c(NULL,NULL), tile=NULL, type="ASCII", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
    )),
    attrs=c(
        tiledb_attr(name="bill_length_mm", type="FLOAT64", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_attr(name="bill_depth_mm", type="FLOAT64", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_attr(name="flipper_length_mm", type="INT32", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_attr(name="body_mass_g", type="INT32", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_attr(name="sex", type="INT32", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))), dictionary=c("female","male")),
        tiledb_attr(name="year", type="INT32", ncells=1, nullable=FALSE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
    ),
    cell_order="COL_MAJOR", tile_order="COL_MAJOR", capacity=10000, sparse=TRUE, allows_dups=TRUE,
    coords_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    offsets_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    validity_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("RLE"),"COMPRESSION_LEVEL",-1)))
)

To confirm TileDB wrote the data correctly, you can pull the first few rows of the array with the head() function:

# Read the full array back, then show only its first six rows
penguins_from_array <- arr[]
head(penguins_from_array)

A data.frame: 6 × 8
	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex	year
	<chr>	<chr>	<dbl>	<dbl>	<int>	<int>	<fct>	<int>
1	Adelie	Biscoe	37.6	17.0	185	3600	female	2008
2	Adelie	Biscoe	42.7	18.3	196	4075	male	2009
3	Adelie	Biscoe	41.1	18.2	192	4050	male	2008
4	Adelie	Biscoe	37.7	18.7	180	3600	male	2007
5	Adelie	Biscoe	38.8	17.2	180	3800	male	2007
6	Adelie	Biscoe	35.3	18.9	187	3800	female	2007

Finally, clean up by deleting the array.

# Delete the array from local storage if it was created.
# recursive = TRUE removes the path together with everything beneath it.
array_exists <- file.exists(array_uri)
if (array_exists) {
  unlink(array_uri, recursive = TRUE)
}