This tutorial demonstrates how to work with the Palmer Penguins dataset using the TileDB-R API. It focuses on creating a sparse array, because sparse arrays support string dimensions. To learn how to use `fromDataFrame()` with dense arrays, see the `fromDataFrame()` tutorial.
In this tutorial, you'll install the `palmerpenguins` package, which provides the `penguins` dataset containing data about penguin species in the Palmer Archipelago, Antarctica.
First, import the necessary libraries and set the array URI (that is, its path, which in this tutorial will be on local storage).
# Make sure the palmerpenguins data package is available; install it on demand
penguins_installed <- requireNamespace("palmerpenguins", quietly = TRUE)
if (!penguins_installed) {
  install.packages("palmerpenguins")
}

# Attach the packages used throughout the tutorial
library(tiledb)
library(palmerpenguins)

# Choose a fresh temporary path on local storage for the array, then show it
array_uri <- tempfile("palmer_penguins_r")
array_uri
Preview the dataset by printing the `penguins` tibble:
# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
You can pass the `penguins` data frame to TileDB's `fromDataFrame()` function to create a TileDB array.
# Build the array schema and write the penguins data in a single call.
# The first two columns (`species` and `island`) become the array dimensions;
# every remaining column is stored as an attribute.
fromDataFrame(
  obj = penguins,
  uri = array_uri,
  col_index = 1:2
)
Review the schema of the array you just created:
# Open the array so that reads come back as a data.frame,
# then print its schema to verify the dimensions and attributes
arr <- tiledb_array(
  uri = array_uri,
  return_as = "data.frame"
)
schema(arr)
tiledb_array_schema(
domain=tiledb_domain(c(
tiledb_dim(name="species", domain=c(NULL,NULL), tile=NULL, type="ASCII", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
tiledb_dim(name="island", domain=c(NULL,NULL), tile=NULL, type="ASCII", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
)),
attrs=c(
tiledb_attr(name="bill_length_mm", type="FLOAT64", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
tiledb_attr(name="bill_depth_mm", type="FLOAT64", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
tiledb_attr(name="flipper_length_mm", type="INT32", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
tiledb_attr(name="body_mass_g", type="INT32", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
tiledb_attr(name="sex", type="INT32", ncells=1, nullable=TRUE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))), dictionary=c("female","male")),
tiledb_attr(name="year", type="INT32", ncells=1, nullable=FALSE, filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
),
cell_order="COL_MAJOR", tile_order="COL_MAJOR", capacity=10000, sparse=TRUE, allows_dups=TRUE,
coords_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
offsets_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
validity_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("RLE"),"COMPRESSION_LEVEL",-1)))
)
To confirm TileDB wrote the data correctly, read the array back and show its first few rows with the `head()` function, for example `head(arr[])`:
A data.frame: 6 × 8
  species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g    sex  year
    <chr>  <chr>          <dbl>         <dbl>             <int>       <int>  <fct> <int>
1  Adelie Biscoe           37.6          17.0               185        3600 female  2008
2  Adelie Biscoe           42.7          18.3               196        4075   male  2009
3  Adelie Biscoe           41.1          18.2               192        4050   male  2008
4  Adelie Biscoe           37.7          18.7               180        3600   male  2007
5  Adelie Biscoe           38.8          17.2               180        3800   male  2007
6  Adelie Biscoe           35.3          18.9               187        3800 female  2007
Finally, clean up by deleting the array.
# Delete the array from local storage. A TileDB array is a directory,
# so the removal has to be recursive.
if (file.exists(array_uri)) {
  unlink(array_uri, recursive = TRUE)
}