Run Array Schema Evolution

arrays

tutorials

python

schema evolution

As your data and business requirements evolve, so should your array schema. TileDB allows for versioned updates to your array schema.

How to run this tutorial

You can run this tutorial in two ways:

Locally on your machine.
On TileDB Cloud.

Because TileDB Cloud has a free tier, we strongly recommend that you sign up and run everything there, as that requires no installation or deployment.

This tutorial describes the array schema evolution functionality in TileDB. For more details, visit the Key Concepts: Schema Evolution section.

First, import the necessary libraries, set the array URI (i.e., its path, which in this tutorial will be on local storage), and delete any previously created arrays with the same name.

Python
R

# Import necessary libraries
import os.path
import shutil

import numpy as np
import tiledb

# Set array URI: a path named "schema_evolution" under the user's home directory
array_uri = os.path.expanduser("~/schema_evolution")

# Delete array if it already exists, so the tutorial starts from a clean state
if os.path.exists(array_uri):
    shutil.rmtree(array_uri)

# Import necessary libraries
library(tiledb)

# Set array URI: a path named "schema_evolution_r" under the user's home directory
array_uri <- path.expand("~/schema_evolution_r")

# Delete array if it already exists, so the tutorial starts from a clean state
if (file.exists(array_uri)) {
  unlink(array_uri, recursive = TRUE)
}

Next, create an array by specifying its schema. This example uses a dense array, but the functionality described here applies to sparse arrays as well. The array initially contains two attributes.

Python
R

# Create the two dimensions, each with int32 coordinates in [1, 4] and a tile extent of 2
d1 = tiledb.Dim(name="d1", domain=(1, 4), tile=2, dtype=np.int32)
d2 = tiledb.Dim(name="d2", domain=(1, 4), tile=2, dtype=np.int32)

# Create a domain using the two dimensions (a 4x4 grid of cells)
dom = tiledb.Domain(d1, d2)

# Create two attributes: `a1` stores int32 values and `a2` stores float32 values
a1 = tiledb.Attr(name="a1", dtype=np.int32)
a2 = tiledb.Attr(name="a2", dtype=np.float32)

# Create the array schema, setting `sparse=False` to indicate a dense array.
sch = tiledb.ArraySchema(domain=dom, sparse=False, attrs=[a1, a2])

# Create the array on disk (it will initially be empty)
tiledb.Array.create(array_uri, sch)

# Create the two dimensions, each with INT32 coordinates in [1, 4] and a tile extent of 2
d1 <- tiledb_dim("d1", c(1L, 4L), 2L, "INT32")
d2 <- tiledb_dim("d2", c(1L, 4L), 2L, "INT32")

# Create a domain using the two dimensions (a 4x4 grid of cells)
dom <- tiledb_domain(dims = c(d1, d2))

# Create two attributes: `a1` stores INT32 values and `a2` stores FLOAT64 values
a1 <- tiledb_attr("a1", type = "INT32")
a2 <- tiledb_attr("a2", type = "FLOAT64")

# Create the array schema, setting `sparse = FALSE` to indicate a dense array
sch <- tiledb_array_schema(dom, c(a1, a2), sparse = FALSE)

# Create the array on disk (it will initially be empty)
arr <- tiledb_array_create(array_uri, sch)

Populate the TileDB array using 2-dimensional input arrays, one for each attribute.

Python
R

# Prepare some data in NumPy arrays: one 4x4 array per attribute, with dtypes
# matching the attribute definitions (int32 for `a1`, float32 for `a2`)
a1_data = np.array(
    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], dtype=np.int32
)
a2_data = np.array(
    [
        [1.1, 2.2, 3.3, 4.4],
        [5.5, 6.6, 7.7, 8.8],
        [9.9, 10.10, 11.11, 12.12],
        [13.13, 14.14, 15.15, 16.16],
    ],
    dtype=np.float32,
)

# Write data to the array: open it in write mode and assign to the full slice
# a dictionary that maps each attribute name to its data
with tiledb.open(array_uri, "w") as A:
    A[:] = {"a1": a1_data, "a2": a2_data}

# Prepare some data in two arrays, one for each attribute
# `a1_data` is transposed with t(), so the values 1 to 4 run along the first row
a1_data <- t(array(1:16, dim = c(4, 4)))

# Note: array() fills values column by column and, unlike `a1_data`, this array
# is not transposed, so the values 1.1 to 4.4 fill the first column (d2 = 1),
# as the printed array contents show
a2_data <- array(
  c(
    1.1, 2.2, 3.3, 4.4,
    5.5, 6.6, 7.7, 8.8,
    9.9, 10.10, 11.11, 12.12,
    13.13, 14.14, 15.15, 16.16
  ),
  dim = c(4L, 4L)
)

# Open the array for writing and write data to the array
# (`return_as = "data.frame"` selects the format of results read through `arr`)
arr <- tiledb_array(
  uri = array_uri,
  query_type = "WRITE",
  return_as = "data.frame"
)

# Assign a named list that maps each attribute name to its data
arr[] <- list(
  a1 = a1_data,
  a2 = a2_data
)

# Close the array
arr <- tiledb_array_close(arr)

The array schema and contents at this point are as follows.

Python
R

# Open the array in read mode, then print its schema and its full contents
with tiledb.open(array_uri, "r") as A:
    print(A.schema)
    print(A[:])

ArraySchema(
  domain=Domain(*[
    Dim(name='d1', domain=(1, 4), tile=2, dtype='int32', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='d2', domain=(1, 4), tile=2, dtype='int32', filters=FilterList([ZstdFilter(level=-1), ])),
  ]),
  attrs=[
    Attr(name='a1', dtype='int32', var=False, nullable=False, enum_label=None),
    Attr(name='a2', dtype='float32', var=False, nullable=False, enum_label=None),
  ],
  cell_order='row-major',
  tile_order='row-major',
  sparse=False,
)

OrderedDict({'a1': array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16]], dtype=int32), 'a2': array([[ 1.1 ,  2.2 ,  3.3 ,  4.4 ],
       [ 5.5 ,  6.6 ,  7.7 ,  8.8 ],
       [ 9.9 , 10.1 , 11.11, 12.12],
       [13.13, 14.14, 15.15, 16.16]], dtype=float32)})

# Reopen the array (it was closed after the write), then print its schema
# and its full contents
arr <- tiledb_array_open(arr)
print(schema(arr))
print(arr[])

tiledb_array_schema(
    domain=tiledb_domain(c(
        tiledb_dim(name="d1", domain=c(1L,4L), tile=2L, type="INT32", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_dim(name="d2", domain=c(1L,4L), tile=2L, type="INT32", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
    )),
    attrs=c(
        tiledb_attr(name="a1", type="INT32", ncells=1, nullable=FALSE),
        tiledb_attr(name="a2", type="FLOAT64", ncells=1, nullable=FALSE)
    ),
    cell_order="COL_MAJOR", tile_order="COL_MAJOR", capacity=10000, sparse=FALSE, allows_dups=FALSE,
    coords_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    offsets_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    validity_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("RLE"),"COMPRESSION_LEVEL",-1)))
)
   d1 d2 a1    a2
1   1  1  1  1.10
2   2  1  5  2.20
3   3  1  9  3.30
4   4  1 13  4.40
5   1  2  2  5.50
6   2  2  6  6.60
7   3  2 10  7.70
8   4  2 14  8.80
9   1  3  3  9.90
10  2  3  7 10.10
11  3  3 11 11.11
12  4  3 15 12.12
13  1  4  4 13.13
14  2  4  8 14.14
15  3  4 12 15.15
16  4  4 16 16.16

Drop attribute a1 from the array.

Python
R

# Create a schema evolution object, register the removal of attribute `a1`,
# and apply the change to the array at `array_uri`
se = tiledb.ArraySchemaEvolution()
se.drop_attribute("a1")
se.array_evolve(array_uri)

# Create a schema evolution object, register the removal of attribute `a1`,
# and apply the change to the array at `array_uri`
se <- tiledb_array_schema_evolution()
tiledb_array_schema_evolution_drop_attribute(se, "a1")
tiledb_array_schema_evolution_array_evolve(se, array_uri)

The array schema and contents after this change are as follows.

Python
R

# Reopen the array in read mode and print the evolved schema and the contents
with tiledb.open(array_uri, "r") as A:
    print(A.schema)
    print(A[:])

ArraySchema(
  domain=Domain(*[
    Dim(name='d1', domain=(1, 4), tile=2, dtype='int32', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='d2', domain=(1, 4), tile=2, dtype='int32', filters=FilterList([ZstdFilter(level=-1), ])),
  ]),
  attrs=[
    Attr(name='a2', dtype='float32', var=False, nullable=False, enum_label=None),
  ],
  cell_order='row-major',
  tile_order='row-major',
  sparse=False,
)

OrderedDict({'a2': array([[ 1.1 ,  2.2 ,  3.3 ,  4.4 ],
       [ 5.5 ,  6.6 ,  7.7 ,  8.8 ],
       [ 9.9 , 10.1 , 11.11, 12.12],
       [13.13, 14.14, 15.15, 16.16]], dtype=float32)})

# Print the schema and contents again, using the `arr` object opened earlier
print(schema(arr))
print(arr[])

tiledb_array_schema(
    domain=tiledb_domain(c(
        tiledb_dim(name="d1", domain=c(1L,4L), tile=2L, type="INT32", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_dim(name="d2", domain=c(1L,4L), tile=2L, type="INT32", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
    )),
    attrs=c(
        tiledb_attr(name="a2", type="FLOAT64", ncells=1, nullable=FALSE)
    ),
    cell_order="COL_MAJOR", tile_order="COL_MAJOR", capacity=10000, sparse=FALSE, allows_dups=FALSE,
    coords_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    offsets_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    validity_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("RLE"),"COMPRESSION_LEVEL",-1)))
)
   d1 d2    a2
1   1  1  1.10
2   2  1  2.20
3   3  1  3.30
4   4  1  4.40
5   1  2  5.50
6   2  2  6.60
7   3  2  7.70
8   4  2  8.80
9   1  3  9.90
10  2  3 10.10
11  3  3 11.11
12  4  3 12.12
13  1  4 13.13
14  2  4 14.14
15  3  4 15.15
16  4  4 16.16

Add a new attribute a to the array.

Python
R

# Define the new int8 attribute `a`
a = tiledb.Attr("a", dtype=np.int8)
# Create a new schema evolution object, register the addition of `a`,
# and apply the change to the array at `array_uri`
se = tiledb.ArraySchemaEvolution()
se.add_attribute(a)
se.array_evolve(array_uri)

# Define the new INT8 attribute `a`
a <- tiledb_attr("a", type = "INT8")
# Create a new schema evolution object, register the addition of `a`,
# and apply the change to the array at `array_uri`
se <- tiledb_array_schema_evolution()
tiledb_array_schema_evolution_add_attribute(se, a)
tiledb_array_schema_evolution_array_evolve(se, array_uri)

The array schema and contents after this second change are as follows. Observe that the new attribute a has no data yet: every cell holds -128, a fill value that in this case indicates an empty cell.

Python
R

# Reopen the array in read mode and print the schema and contents, which now
# include the newly added attribute
with tiledb.open(array_uri, "r") as A:
    print(A.schema)
    print(A[:])

ArraySchema(
  domain=Domain(*[
    Dim(name='d1', domain=(1, 4), tile=2, dtype='int32', filters=FilterList([ZstdFilter(level=-1), ])),
    Dim(name='d2', domain=(1, 4), tile=2, dtype='int32', filters=FilterList([ZstdFilter(level=-1), ])),
  ]),
  attrs=[
    Attr(name='a2', dtype='float32', var=False, nullable=False, enum_label=None),
    Attr(name='a', dtype='int8', var=False, nullable=False, enum_label=None),
  ],
  cell_order='row-major',
  tile_order='row-major',
  sparse=False,
)

OrderedDict({'a2': array([[ 1.1 ,  2.2 ,  3.3 ,  4.4 ],
       [ 5.5 ,  6.6 ,  7.7 ,  8.8 ],
       [ 9.9 , 10.1 , 11.11, 12.12],
       [13.13, 14.14, 15.15, 16.16]], dtype=float32), 'a': array([[-128, -128, -128, -128],
       [-128, -128, -128, -128],
       [-128, -128, -128, -128],
       [-128, -128, -128, -128]], dtype=int8)})

# Print the schema and contents once more, then close the array
print(schema(arr))
print(arr[])
arr <- tiledb_array_close(arr)

tiledb_array_schema(
    domain=tiledb_domain(c(
        tiledb_dim(name="d1", domain=c(1L,4L), tile=2L, type="INT32", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1)))),
        tiledb_dim(name="d2", domain=c(1L,4L), tile=2L, type="INT32", filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))))
    )),
    attrs=c(
        tiledb_attr(name="a2", type="FLOAT64", ncells=1, nullable=FALSE),
        tiledb_attr(name="a", type="INT8", ncells=1, nullable=FALSE)
    ),
    cell_order="COL_MAJOR", tile_order="COL_MAJOR", capacity=10000, sparse=FALSE, allows_dups=FALSE,
    coords_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    offsets_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("ZSTD"),"COMPRESSION_LEVEL",-1))),
    validity_filter_list=tiledb_filter_list(c(tiledb_filter_set_option(tiledb_filter("RLE"),"COMPRESSION_LEVEL",-1)))
)
   d1 d2    a2    a
1   1  1  1.10 -128
2   2  1  2.20 -128
3   3  1  3.30 -128
4   4  1  4.40 -128
5   1  2  5.50 -128
6   2  2  6.60 -128
7   3  2  7.70 -128
8   4  2  8.80 -128
9   1  3  9.90 -128
10  2  3 10.10 -128
11  3  3 11.11 -128
12  4  3 12.12 -128
13  1  4 13.13 -128
14  2  4 14.14 -128
15  3  4 15.15 -128
16  4  4 16.16 -128

Finally, clean up by deleting the array.

Python
R

# Delete the array by removing its directory tree, if it exists
if os.path.exists(array_uri):
    shutil.rmtree(array_uri)

# Delete the array by recursively removing its path, if it exists
if (file.exists(array_uri)) {
  unlink(array_uri, recursive = TRUE)
}

Note

If you wish to evolve the schema at a particular timestamp, similar to writing fragments and array metadata at a timestamp (visit the Tutorials: Writing at a Timestamp section for details), you can set a timestamp on the schema evolution object. For an example, see the Tutorials: Time Traveling - Schema Evolution section.