Data Layout

Learn how the tile order and cell order affect the data layout on storage.

How to run this tutorial

You can run this tutorial in two ways:

Locally on your machine.
On TileDB Cloud.

However, since TileDB Cloud has a free tier, we strongly recommend that you sign up and run everything there, as it requires no installation or deployment.

This tutorial describes how to set the tile and cell order for your arrays and how it affects querying arrays using the global order. You’ll create four dense arrays, each with a different combination of tile and cell order. You’ll also create two sparse arrays: one with row-major tile and cell order, and another with Hilbert cell order.

For more information about data layout, visit the Key Concepts: Data Layout doc.

First, import the necessary libraries, set the array URIs (that is, their paths, which in this tutorial will be on local storage), and delete any previously created arrays with the same names.

Python
R

# Import necessary libraries
import os.path
import shutil

import numpy as np
import tiledb

# Each array lives in its own directory under the user's home directory
array_uri_dense_cr_tr = os.path.expanduser("~/data_layout_dense_cr_tr_python")
array_uri_dense_cr_tc = os.path.expanduser("~/data_layout_dense_cr_tc_python")
array_uri_dense_cc_tr = os.path.expanduser("~/data_layout_dense_cc_tr_python")
array_uri_dense_cc_tc = os.path.expanduser("~/data_layout_dense_cc_tc_python")
array_uri_sparse_cr_tr = os.path.expanduser("~/data_layout_sparse_cr_tr_python")
array_uri_sparse_hilbert = os.path.expanduser("~/data_layout_sparse_hilbert_python")

# Start from a clean slate: remove anything left over from an earlier run
for stale_uri in (
    array_uri_dense_cr_tr,
    array_uri_dense_cr_tc,
    array_uri_dense_cc_tr,
    array_uri_dense_cc_tc,
    array_uri_sparse_cr_tr,
    array_uri_sparse_hilbert,
):
    if os.path.exists(stale_uri):
        shutil.rmtree(stale_uri)

library(tiledb)

# Each array lives in its own directory under the user's home directory
array_uri_dense_cr_tr <- path.expand("~/data_layout_dense_cr_tr_r")
array_uri_dense_cr_tc <- path.expand("~/data_layout_dense_cr_tc_r")
array_uri_dense_cc_tr <- path.expand("~/data_layout_dense_cc_tr_r")
array_uri_dense_cc_tc <- path.expand("~/data_layout_dense_cc_tc_r")
array_uri_sparse_cr_tr <- path.expand("~/data_layout_sparse_cr_tr_r")
array_uri_sparse_hilbert <- path.expand("~/data_layout_sparse_hilbert_r")

# Start from a clean slate: remove anything left over from an earlier run
for (stale_uri in c(
  array_uri_dense_cr_tr,
  array_uri_dense_cr_tc,
  array_uri_dense_cc_tr,
  array_uri_dense_cc_tc,
  array_uri_sparse_cr_tr,
  array_uri_sparse_hilbert
)) {
  if (file.exists(stale_uri)) {
    unlink(stale_uri, recursive = TRUE)
  }
}

Next, create the underlying schema objects, including dimensions, domains, and an attribute:

Python
R

# Dense arrays: two int32 dimensions over [1, 4] with a tile extent of 2,
# combined into one domain
d1_dense = tiledb.Dim(name="d1", dtype=np.int32, domain=(1, 4), tile=2)
d2_dense = tiledb.Dim(name="d2", dtype=np.int32, domain=(1, 4), tile=2)
dom_dense = tiledb.Domain(d1_dense, d2_dense)

# Sparse arrays: same shape of dimensions, but over [0, 3]
d1_sparse = tiledb.Dim(name="d1", dtype=np.int32, domain=(0, 3), tile=2)
d2_sparse = tiledb.Dim(name="d2", dtype=np.int32, domain=(0, 3), tile=2)
dom_sparse = tiledb.Domain(d1_sparse, d2_sparse)

# One int32 attribute, shared by every schema in this tutorial
a = tiledb.Attr(name="a", dtype=np.int32)

# Dense arrays: two INT32 dimensions over [1, 4] with a tile extent of 2,
# combined into one domain
d1_dense <- tiledb_dim("d1", c(1L, 4L), 2L, "INT32")
d2_dense <- tiledb_dim("d2", c(1L, 4L), 2L, "INT32")
dom_dense <- tiledb_domain(dims = c(d1_dense, d2_dense))

# Sparse arrays: same shape of dimensions, but over [0, 3]
d1_sparse <- tiledb_dim("d1", c(0L, 3L), 2L, "INT32")
d2_sparse <- tiledb_dim("d2", c(0L, 3L), 2L, "INT32")
dom_sparse <- tiledb_domain(dims = c(d1_sparse, d2_sparse))

# One INT32 attribute, shared by every schema in this tutorial
a <- tiledb_attr("a", type = "INT32")

Once you have created the necessary schema objects, create the schema for each array:

Python
R

# "row-major" is the default for both cell order and tile order, so the first
# dense schema leaves them out. The explicit equivalent would be:
# sch_dense_cr_tr = tiledb.ArraySchema(
#     domain=dom_dense, attrs=[a], cell_order="row-major", tile_order="row-major", sparse=False
# )
sch_dense_cr_tr = tiledb.ArraySchema(sparse=False, domain=dom_dense, attrs=[a])

# Remaining dense schemas: one per other cell order / tile order combination
sch_dense_cr_tc = tiledb.ArraySchema(
    sparse=False,
    cell_order="row-major",
    tile_order="col-major",
    domain=dom_dense,
    attrs=[a],
)
sch_dense_cc_tr = tiledb.ArraySchema(
    sparse=False,
    cell_order="col-major",
    tile_order="row-major",
    domain=dom_dense,
    attrs=[a],
)
sch_dense_cc_tc = tiledb.ArraySchema(
    sparse=False,
    cell_order="col-major",
    tile_order="col-major",
    domain=dom_dense,
    attrs=[a],
)

# Sparse schemas: default (row-major) orders, and Hilbert cell order
sch_sparse_cr_tr = tiledb.ArraySchema(sparse=True, domain=dom_sparse, attrs=[a])
sch_sparse_hilbert = tiledb.ArraySchema(
    sparse=True,
    cell_order="hilbert",
    domain=dom_sparse,
    attrs=[a],
)

# Dense schemas with the cell order and tile order spelled out
sch_dense_cr_tr <- tiledb_array_schema(
  dom_dense, a,
  sparse = FALSE, cell_order = "ROW_MAJOR", tile_order = "ROW_MAJOR"
)
sch_dense_cr_tc <- tiledb_array_schema(
  dom_dense, a,
  sparse = FALSE, cell_order = "ROW_MAJOR", tile_order = "COL_MAJOR"
)
sch_dense_cc_tr <- tiledb_array_schema(
  dom_dense, a,
  sparse = FALSE, cell_order = "COL_MAJOR", tile_order = "ROW_MAJOR"
)

# "COL_MAJOR" is the default for both cell order and tile order, so this
# schema leaves them out. The explicit equivalent would be:
# sch_dense_cc_tc <- tiledb_array_schema(dom_dense, a, cell_order = "COL_MAJOR", tile_order = "COL_MAJOR", sparse = FALSE)
sch_dense_cc_tc <- tiledb_array_schema(dom_dense, a, sparse = FALSE)

# Sparse schemas: row-major orders, and Hilbert cell order
sch_sparse_cr_tr <- tiledb_array_schema(
  dom_sparse, a,
  sparse = TRUE, cell_order = "ROW_MAJOR", tile_order = "ROW_MAJOR"
)
sch_sparse_hilbert <- tiledb_array_schema(
  dom_sparse, a,
  sparse = TRUE, cell_order = "HILBERT"
)

Next, create the arrays with their corresponding schemas:

Python
R

# Create every array on disk by pairing each URI with its schema
for target_uri, target_schema in (
    (array_uri_dense_cr_tr, sch_dense_cr_tr),
    (array_uri_dense_cr_tc, sch_dense_cr_tc),
    (array_uri_dense_cc_tr, sch_dense_cc_tr),
    (array_uri_dense_cc_tc, sch_dense_cc_tc),
    (array_uri_sparse_cr_tr, sch_sparse_cr_tr),
    (array_uri_sparse_hilbert, sch_sparse_hilbert),
):
    tiledb.Array.create(target_uri, target_schema)

# Create each array at its URI with the matching schema.
# The arr_* names bound here are reassigned later, when the arrays are
# opened for writing.
arr_dense_cr_tr <- tiledb_array_create(array_uri_dense_cr_tr, sch_dense_cr_tr)
arr_dense_cr_tc <- tiledb_array_create(array_uri_dense_cr_tc, sch_dense_cr_tc)
arr_dense_cc_tr <- tiledb_array_create(array_uri_dense_cc_tr, sch_dense_cc_tr)
arr_dense_cc_tc <- tiledb_array_create(array_uri_dense_cc_tc, sch_dense_cc_tc)
arr_sparse_cr_tr <- tiledb_array_create(array_uri_sparse_cr_tr, sch_sparse_cr_tr)
arr_sparse_hilbert <- tiledb_array_create(array_uri_sparse_hilbert, sch_sparse_hilbert)

Now prepare some data for each array.

Python
R

# Dense data: the values 1..16 laid out row by row in a 4x4 int32 matrix
dense_data = np.arange(1, 17, dtype=np.int32).reshape(4, 4)

# Sparse data: six cells listed as (d1, d2, a) triples, then split into one
# int32 array per column
sparse_d1_data, sparse_d2_data, sparse_a_data = (
    np.array(column, dtype=np.int32)
    for column in zip(
        (2, 0, 4),
        (0, 1, 1),
        (3, 1, 6),
        (2, 2, 5),
        (0, 3, 2),
        (1, 3, 3),
    )
)

# Dense data: the values 1..16 filled row by row into a 4x4 integer matrix
dense_data <- matrix(1:16, nrow = 4, ncol = 4, byrow = TRUE)

# Sparse data: coordinates along d1 and d2, and the attribute value of each cell
sparse_d1_data <- as.integer(c(2, 0, 3, 2, 0, 1))
sparse_d2_data <- as.integer(c(0, 1, 1, 2, 3, 3))
sparse_a_data <- as.integer(c(4, 1, 6, 5, 2, 3))

Write the corresponding data to each array.

Python
R

# Every dense array receives the same 4x4 matrix across its whole domain
for dense_uri in (
    array_uri_dense_cr_tr,
    array_uri_dense_cr_tc,
    array_uri_dense_cc_tr,
    array_uri_dense_cc_tc,
):
    with tiledb.open(dense_uri, "w") as A:
        A[:] = dense_data

# Both sparse arrays receive the same six cells, addressed by coordinates
for sparse_uri in (array_uri_sparse_cr_tr, array_uri_sparse_hilbert):
    with tiledb.open(sparse_uri, "w") as A:
        A[sparse_d1_data, sparse_d2_data] = sparse_a_data

# Write data to the arrays.
# Each array follows the same three steps: open it in WRITE mode, assign the
# data, then close it. The closed handle stays bound to its arr_* name, which
# the read step further below passes to global_order_query().

# Dense arrays: the whole matrix is assigned at once
arr_dense_cr_tr <- tiledb_array(uri = array_uri_dense_cr_tr, query_type = "WRITE", return_as = "data.frame")
arr_dense_cr_tr[] <- dense_data
arr_dense_cr_tr <- tiledb_array_close(arr_dense_cr_tr)

arr_dense_cr_tc <- tiledb_array(uri = array_uri_dense_cr_tc, query_type = "WRITE", return_as = "data.frame")
arr_dense_cr_tc[] <- dense_data
arr_dense_cr_tc <- tiledb_array_close(arr_dense_cr_tc)

arr_dense_cc_tr <- tiledb_array(uri = array_uri_dense_cc_tr, query_type = "WRITE", return_as = "data.frame")
arr_dense_cc_tr[] <- dense_data
arr_dense_cc_tr <- tiledb_array_close(arr_dense_cc_tr)

arr_dense_cc_tc <- tiledb_array(uri = array_uri_dense_cc_tc, query_type = "WRITE", return_as = "data.frame")
arr_dense_cc_tc[] <- dense_data
arr_dense_cc_tc <- tiledb_array_close(arr_dense_cc_tc)

# Sparse arrays: the d1 and d2 coordinate vectors index the cells that
# receive the attribute values
arr_sparse_cr_tr <- tiledb_array(uri = array_uri_sparse_cr_tr, query_type = "WRITE", return_as = "data.frame")
arr_sparse_cr_tr[sparse_d1_data, sparse_d2_data] <- sparse_a_data
arr_sparse_cr_tr <- tiledb_array_close(arr_sparse_cr_tr)

arr_sparse_hilbert <- tiledb_array(uri = array_uri_sparse_hilbert, query_type = "WRITE", return_as = "data.frame")
arr_sparse_hilbert[sparse_d1_data, sparse_d2_data] <- sparse_a_data
arr_sparse_hilbert <- tiledb_array_close(arr_sparse_hilbert)

Now read the data from each array.

Python
R

# Read attribute "a" back from every array in global order ("G"), printing a
# heading before each result

# Dense arrays
for heading, dense_uri in (
    ("Dense array - row-major cell order and tile order:", array_uri_dense_cr_tr),
    (
        "Dense array - row-major cell order and col-major tile order:",
        array_uri_dense_cr_tc,
    ),
    (
        "Dense array - col-major cell order and row-major tile order:",
        array_uri_dense_cc_tr,
    ),
    ("Dense array - col-major cell order tile order:", array_uri_dense_cc_tc),
):
    with tiledb.open(dense_uri, "r") as A:
        print(heading)
        print(A.query(attrs=["a"], dims=["d1", "d2"], order="G")[:]["a"])

# Sparse arrays
for heading, sparse_uri in (
    ("Sparse array - row-major cell order and tile order:", array_uri_sparse_cr_tr),
    ("Sparse array - Hilbert order:", array_uri_sparse_hilbert),
):
    with tiledb.open(sparse_uri, "r") as A:
        print(heading)
        print(A.query(order="G")[:]["a"])

Dense array - row-major cell order and tile order:
[ 1  2  5  6  3  4  7  8  9 10 13 14 11 12 15 16]
Dense array - row-major cell order and col-major tile order:
[ 1  2  5  6  9 10 13 14  3  4  7  8 11 12 15 16]
Dense array - col-major cell order and row-major tile order:
[ 1  5  2  6  3  7  4  8  9 13 10 14 11 15 12 16]
Dense array - col-major cell order tile order:
[ 1  5  2  6  9 13 10 14  3  7  4  8 11 15 12 16]
Sparse array - row-major cell order and tile order:
[1 2 3 4 6 5]
Sparse array - Hilbert order:
[1 2 3 5 6 4]

# Read data from each array
global_order_query <- function(arr, subarray, max_cells = 16L) {
  # Run a READ query over `subarray` with the GLOBAL layout and print the
  # values of attribute "a" in the order the query returns them.
  #
  # arr:       array object to query
  # subarray:  ranges to read, passed unchanged to tiledb_query_set_subarray()
  # max_cells: capacity of the result buffers; the default of 16 matches the
  #            4x4 domains used in this tutorial
  qry <- tiledb_query(arr, "READ")
  qry <- tiledb_query_set_subarray(qry, subarray)

  # One result buffer per dimension plus one for the attribute
  dim1 <- integer(max_cells)
  dim2 <- integer(max_cells)
  values <- integer(max_cells)
  tiledb_query_set_buffer(qry, "d1", dim1)
  tiledb_query_set_buffer(qry, "d2", dim2)
  tiledb_query_set_buffer(qry, "a", values)

  tiledb_query_set_layout(qry, layout = "GLOBAL")
  tiledb_query_submit(qry)
  tiledb_query_finalize(qry)

  # Only the first n buffer slots hold results. seq_len(n) is empty when
  # n is 0, whereas 1:n would become c(1, 0) and print a spurious row.
  n <- tiledb_query_result_buffer_elements(qry, "a")
  print(data.frame(rows = values)[seq_len(n), ])
}

# Ranges covering the full [1, 4] domain of the dense arrays and the full
# [0, 3] domain of the sparse arrays
dense_subarray <- c(1L, 4L, 1L, 4L)
sparse_subarray <- c(0L, 3L, 0L, 3L)

# Query the four dense arrays, then the two sparse ones
for (dense_arr in list(arr_dense_cr_tr, arr_dense_cr_tc, arr_dense_cc_tr, arr_dense_cc_tc)) {
  global_order_query(dense_arr, dense_subarray)
}
for (sparse_arr in list(arr_sparse_cr_tr, arr_sparse_hilbert)) {
  global_order_query(sparse_arr, sparse_subarray)
}

 [1]  1  2  5  6  3  4  7  8  9 10 13 14 11 12 15 16
 [1]  1  2  5  6  9 10 13 14  3  4  7  8 11 12 15 16
 [1]  1  5  2  6  3  7  4  8  9 13 10 14 11 15 12 16
 [1]  1  5  2  6  9 13 10 14  3  7  4  8 11 15 12 16
[1] 1 2 3 4 6 5
[1] 1 2 3 5 6 4

Finally, clean up by deleting the array directories.

Python
R

# Remove every array directory that is present on disk
for leftover_uri in (
    array_uri_dense_cr_tr,
    array_uri_dense_cr_tc,
    array_uri_dense_cc_tr,
    array_uri_dense_cc_tc,
    array_uri_sparse_cr_tr,
    array_uri_sparse_hilbert,
):
    if os.path.exists(leftover_uri):
        shutil.rmtree(leftover_uri)

# Remove every array directory that is present on disk
for (leftover_uri in c(
  array_uri_dense_cr_tr,
  array_uri_dense_cr_tc,
  array_uri_dense_cc_tr,
  array_uri_dense_cc_tc,
  array_uri_sparse_cr_tr,
  array_uri_sparse_hilbert
)) {
  if (file.exists(leftover_uri)) {
    unlink(leftover_uri, recursive = TRUE)
  }
}