Learn how to add query conditions to trim your result sets so only the data you need are returned.
How to run this tutorial
You can run this tutorial in two ways:
Locally on your machine.
On TileDB Cloud.
However, since TileDB Cloud has a free tier, we strongly recommend that you sign up and run everything there, as that requires no installations or deployment.
This tutorial explains how query conditions work with TileDB arrays. The example uses a sparse array, but everything discussed here applies to dense arrays as well.
First, import the necessary libraries, set the array URI (that is, its path, which in this tutorial will be on local storage), and delete any previously created arrays with the same name.
# Create the two dimensionsd1 = tiledb.Dim(name="d1", domain=(0, 3), tile=2, dtype=np.int32)d2 = tiledb.Dim(name="d2", domain=(0, 3), tile=2, dtype=np.int32)# Create a domain using the two dimensionsdom = tiledb.Domain(d1, d2)# Create an attributea = tiledb.Attr(name="a", dtype=np.int32)# Create the array schema with `sparse=True`sch = tiledb.ArraySchema(domain=dom, sparse=True, attrs=[a])# Create the array on disk (it will initially be empty)tiledb.Array.create(array_uri, sch)
# Create the two dimensionsd1 <-tiledb_dim("d1", c(0L, 3L), 2L, "INT32")d2 <-tiledb_dim("d2", c(0L, 3L), 2L, "INT32")# Create a domain using the two dimensionsdom <-tiledb_domain(dims =c(d1, d2))# Create an attributea <-tiledb_attr("a", type ="INT32")# Create the array schema with `sparse = TRUE`sch <-tiledb_array_schema(dom, a, sparse =TRUE)# Create the array on disk (it will initially be empty)arr <-tiledb_array_create(sparse_array, sch)
Populate the TileDB array with 1D input arrays in coordinate (COO) format.
# Prepare some data in numpy arraysd1_data = np.array([2, 0, 3, 2, 0, 1], dtype=np.int32)d2_data = np.array([0, 1, 1, 2, 3, 3], dtype=np.int32)a_data = np.array([4, 1, 6, 5, 2, 3], dtype=np.int32)# Open the array in write mode and write the data in coordinate (COO) formatwith tiledb.open(array_uri, "w") as A: A[d1_data, d2_data] = a_data
# Prepare some data in an arrayd1_data <-c(2L, 0L, 3L, 2L, 0L, 1L)d2_data <-c(0L, 1L, 1L, 2L, 3L, 3L)a_data <-c(4L, 1L, 6L, 5L, 2L, 3L)# Open the array for writing and write data to the arrayarr <-tiledb_array(uri = sparse_array, query_type ="WRITE", return_as ="data.frame")arr[d1_data, d2_data] <- a_data# Close the arrayinvisible(tiledb_array_close(arr))
Read the data by using the query conditions in different ways.
# Open the array in read modeA = tiledb.open(array_uri, mode="r")# Prepare a query with a condition and print the result.# NOTE: You don't need to use string quotes for dimensions# and attributes in the condition.q = A.query(cond="d1 >= 2 and a != 5")print("Condition 'd1 >=2 and a != 5':", q[:])# NOTE: The following is equivalent to the above.# This is useful if the dimensions and attributes# contain special characters.# q = A.query(cond="dim('d1') >= 2 and attr('a') != 5")# You can apply extra slicing conditions to the query.# The result will be the intersection of all conditions.print("Condition plus slicing:", q[3:4, :])# You can subselect on attributes and dimensions, if your# condition does not include them and you also don't# need them in your result.q = A.query(attrs=[], dims=["d1"], cond="d1 >= 1")print("Condition plus attribute / dimension subselection: ", q[:])# Alwasy close the array when doneA.close()
# Prepare a query with a condition and print the result.# NOTE: You don nott need to use string quotes for# dimensions and attributes in the condition.qry <-tiledb_query(arr, "READ")dim1 <-integer(16)dim2 <-integer(16)values <-integer(16)tiledb_query_set_buffer(qry, "d1", dim1)tiledb_query_set_buffer(qry, "d2", dim2)tiledb_query_set_buffer(qry, "a", values)# Define query conditions that match the following:# d1 >= 2 and a != 5qc <-parse_query_condition(d1 >=2&& a !=5)qry <-tiledb_query_set_condition(qry, qc)tiledb_query_submit(qry)tiledb_query_finalize(qry)n <-tiledb_query_result_buffer_elements(qry, "a")cat("Condition:\n")print(data.frame(rows = dim1, cols = dim2, a = values)[1:n, ])cat("Condition plus slicing:\n")print(data.frame(rows = dim1, cols = dim2, a = values)[2:2, ])cat("Condition plus attribute / dimension subselection:\n")arr <-tiledb_array_close(arr)qry <-tiledb_query(arr, "READ")tiledb_query_set_buffer(qry, "d1", dim1)tiledb_query_set_buffer(qry, "d2", dim2)tiledb_query_set_buffer(qry, "a", values)qc2 <-tiledb_query_condition_init("d1", 1L, "INT32", "GE")qry <-tiledb_query_set_condition(qry, qc2)tiledb_query_submit(qry)tiledb_query_finalize(qry)n <-tiledb_query_result_buffer_elements(qry, "d1")print(data.frame(rows = dim1)[1:n, ])# Close the arrayinvisible(tiledb_array_close(arr))
Condition:
rows cols a
1 2 0 4
2 3 1 6
Condition plus slicing:
rows cols a
2 3 1 6
Condition plus attribute / dimension subselection:
[1] 1 2 2 3