Ingest Dense Data into a Machine Learning Model

tutorials

ai/ml

machine learning (ml)

ingestion

Learn how to ingest and perform basic ML operations on the MNIST dense dataset.

A dense dataset is one in which most of the values are nonzero.

In a dense dataset, the majority of elements in the data matrix or array have nonzero values. This means dense datasets have little to no sparsity.
Dense datasets are common in many types of data, including numerical data, images, and text, where most of the features or dimensions are relevant and contribute to the information content of the dataset.
Dense datasets often require more memory and computational resources to process and analyze due to the larger amount of data present.

For the dense case, this tutorial will use the MNIST dataset.

Note

The MNIST dataset is a widely used benchmark dataset in the field of machine learning. It consists of a collection of 28×28 pixel grayscale images of handwritten digits (0 to 9), along with their corresponding labels showing the digit represented in each image. The dataset is commonly used for training and evaluating machine learning models, particularly for image classification tasks.

Import libraries

Start by importing the libraries used in this tutorial.

import os

import idx2numpy
import matplotlib.pyplot as plt
import numpy as np
import tiledb
import torchvision
from tiledb.ml.readers.pytorch import PyTorchTileDBDataLoader
from tiledb.ml.readers.types import ArrayParams

Download the dataset

def load_mnist_data():
    """Download MNIST and return the training images and labels.

    torchvision is used only to fetch the raw IDX files below
    ``<data_home>/MNIST/raw``; the files themselves are decoded with
    ``idx2numpy``.

    Returns:
        An ``(images, labels)`` tuple decoded from the ``train-*`` IDX files.
    """
    data_home = os.path.join(os.path.pardir, "data")
    # Request the training split so the download matches the train-* files
    # read below. The returned dataset object itself is not needed.
    torchvision.datasets.MNIST(root=data_home, train=True, download=True)
    # Join the path segments individually instead of embedding "/" in one.
    raw_dir = os.path.join(data_home, "MNIST", "raw")
    images = idx2numpy.convert_from_file(
        os.path.join(raw_dir, "train-images-idx3-ubyte")
    )
    labels = idx2numpy.convert_from_file(
        os.path.join(raw_dir, "train-labels-idx1-ubyte")
    )
    return images, labels


images, labels = load_mnist_data()

Ingest in TileDB

def ingest_in_tiledb(data: np.ndarray, batch_size: int, uri: str) -> None:
    """Create a dense TileDB array at ``uri`` and write ``data`` into it.

    The array gets one int32 dimension per NumPy axis (``dim_0``, ``dim_1``,
    ...) and a single attribute named ``features`` with the dtype of ``data``.

    Args:
        data: NumPy array to store.
        batch_size: Requested tile extent along the first axis, so one tile
            holds up to ``batch_size`` samples.
        uri: Location at which the TileDB array is created.
    """
    # One dimension per NumPy axis. The first axis is tiled per batch and all
    # other axes are kept whole inside a tile. The batch-sized extent is
    # capped at the axis length because TileDB rejects a tile extent larger
    # than the dimension's domain (inputs with fewer than batch_size samples).
    dims = [
        tiledb.Dim(
            name=f"dim_{axis}",
            domain=(0, size - 1),
            tile=min(batch_size, size) if axis == 0 else size,
            dtype=np.int32,
        )
        for axis, size in enumerate(data.shape)
    ]
    # TileDB schema
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=False,
        attrs=[tiledb.Attr(name="features", dtype=data.dtype)],
    )
    # Create array
    tiledb.Array.create(uri, schema)
    # Ingest
    with tiledb.open(uri, "w") as tiledb_array:
        tiledb_array[:] = {"features": data}

import shutil

data_dir = os.path.join(os.path.pardir, "data", "readers", "dense")
# Start from a clean directory: creating a TileDB array at a URI that already
# holds one fails, so re-running this cell would otherwise raise.
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir, exist_ok=True)

# Ingest images
training_images = os.path.join(data_dir, "training_images")
ingest_in_tiledb(data=images, batch_size=64, uri=training_images)

# Ingest labels
training_labels = os.path.join(data_dir, "training_labels")
ingest_in_tiledb(data=labels, batch_size=64, uri=training_labels)

TileDB dataset

# Open handles to the two ingested arrays; their schemas are inspected next.
images_array, labels_array = (
    tiledb.open(training_images),
    tiledb.open(training_labels),
)

Array schemas

# Evaluate the schema of the images array; the rendered output follows.
images_array.schema

Domain

Name	Domain	Tile	Data Type	Is Var-length	Filters
dim_0	(0, 59999)	64	int32	False	-
dim_1	(0, 27)	28	int32	False	-
dim_2	(0, 27)	28	int32	False	-

Attributes

Name	Data Type	Is Var-Len	Is Nullable	Filters
features	uint8	False	False	-

Cell Order

row-major

Tile Order

row-major

Sparse

False

# Evaluate the schema of the labels array; the rendered output follows.
labels_array.schema

Domain

Name	Domain	Tile	Data Type	Is Var-length	Filters
dim_0	(0, 59999)	64	int32	False	-

Attributes

Name	Data Type	Is Var-Len	Is Nullable	Filters
features	uint8	False	False	-

Cell Order

row-major

Tile Order

row-major

Sparse

False

Import libraries

Start by importing the libraries used in this tutorial.

import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tiledb
from tiledb.ml.readers.tensorflow import ArrayParams, TensorflowTileDBDataset

Download the dataset

# Keep only the first (images, labels) pair returned by Keras and discard the
# second one, then rescale the pixel values by 1/255.
first_split, _ = tf.keras.datasets.mnist.load_data()
images, labels = first_split
images = images / 255.0

Ingest in TileDB

def ingest_in_tiledb(data: np.ndarray, batch_size: int, uri: str) -> None:
    """Store ``data`` as a new dense TileDB array located at ``uri``.

    Each NumPy axis becomes an int32 dimension named ``dim_<axis>`` and the
    values are written to one attribute called ``features`` that keeps the
    dtype of ``data``.

    Args:
        data: NumPy array to ingest.
        batch_size: Requested tile extent along the first axis, so one tile
            holds up to ``batch_size`` samples.
        uri: Location at which the TileDB array is created.
    """
    # The first axis is tiled per batch; the remaining axes stay whole within
    # a tile. The batch-sized extent is capped at the axis length because
    # TileDB rejects a tile extent larger than the dimension's domain, which
    # would happen for inputs with fewer than batch_size samples.
    dims = [
        tiledb.Dim(
            name=f"dim_{axis}",
            domain=(0, size - 1),
            tile=min(batch_size, size) if axis == 0 else size,
            dtype=np.int32,
        )
        for axis, size in enumerate(data.shape)
    ]
    # TileDB schema
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(*dims),
        sparse=False,
        attrs=[tiledb.Attr(name="features", dtype=data.dtype)],
    )
    # Create array
    tiledb.Array.create(uri, schema)
    # Ingest
    with tiledb.open(uri, "w") as tiledb_array:
        tiledb_array[:] = {"features": data}

import shutil

# Recreate the target directory from scratch so earlier runs leave no arrays
# behind.
data_dir = os.path.join(os.path.curdir, "data", "readers", "dense")
if os.path.exists(data_dir):
    shutil.rmtree(data_dir)
os.makedirs(data_dir, exist_ok=True)

# Array locations for the images and their labels.
training_images = os.path.join(data_dir, "training_images")
training_labels = os.path.join(data_dir, "training_labels")

# Ingest the images first, then the labels, with the same batch-sized tiles.
for source, target_uri in ((images, training_images), (labels, training_labels)):
    ingest_in_tiledb(data=source, batch_size=64, uri=target_uri)

TileDB dataset

# Open handles to the ingested images and labels arrays for inspection.
images_array, labels_array = [
    tiledb.open(array_uri) for array_uri in (training_images, training_labels)
]

Array schemas

# Evaluate the schema of the images array; the rendered output follows.
images_array.schema

Domain

Name	Domain	Tile	Data Type	Is Var-length	Filters
dim_0	(0, 59999)	64	int32	False	ZstdFilter(level=-1)
dim_1	(0, 27)	28	int32	False	ZstdFilter(level=-1)
dim_2	(0, 27)	28	int32	False	ZstdFilter(level=-1)

Attributes

Name	Data Type	Is Var-Len	Is Nullable	Filters
features	float64	False	False	-

Cell Order

row-major

Tile Order

row-major

Sparse

False

# Evaluate the schema of the labels array; the rendered output follows.
labels_array.schema

Domain

Name	Domain	Tile	Data Type	Is Var-length	Filters
dim_0	(0, 59999)	64	int32	False	ZstdFilter(level=-1)

Attributes

Name	Data Type	Is Var-Len	Is Nullable	Filters
features	uint8	False	False	-

Cell Order

row-major

Tile Order

row-major

Sparse

False

Dataloaders

TileDB offers an API with native dataloaders for all the ML frameworks with which TileDB integrates. After you store your data, you can use the API to create dataloaders in each framework that will be later used as input to the model’s training stage. The API takes two TileDB arrays as inputs: x (which refers to the sample data), and y (which holds the label data corresponding to each sample in x). The dataloader collates these two arrays into a single data object that you can use later as input for training a model.

PyTorch
TensorFlow

Note

Jupyter notebooks have limited support for Python multiprocessing. Avoid using multiple dataloader workers in Jupyter; when you need multiprocessing, run your code as a script with a regular Python interpreter instead.

# Wrap the images (x) and labels (y) arrays in a PyTorch dataloader and pull
# one batch to check the shapes it yields.
with tiledb.open(training_images) as x, tiledb.open(training_labels) as y:
    # num_workers=0 requests no additional worker processes.
    loader_options = {
        "batch_size": 128,
        "num_workers": 0,
        "shuffle_buffer_size": 256,
    }
    train_loader = PyTorchTileDBDataLoader(
        ArrayParams(x), ArrayParams(y), **loader_options
    )
    batch_iterator = iter(train_loader)
    batch_imgs, batch_labels = next(batch_iterator)
    print(f"Input Shape: {batch_imgs.shape}")
    print(f"Label Shape: {batch_labels.shape}")

Input Shape: torch.Size([128, 28, 28])
Label Shape: torch.Size([128])

# Wrap the images (x) and labels (y) arrays in a TensorFlow dataset, batch it,
# and pull one batch as NumPy arrays to check the shapes it yields.
with tiledb.open(training_images) as x, tiledb.open(training_labels) as y:
    tiledb_dataset = TensorflowTileDBDataset(
        ArrayParams(array=x), ArrayParams(array=y)
    )
    batched_dataset = tiledb_dataset.batch(128)
    batch_iterator = batched_dataset.as_numpy_iterator()
    batch_imgs, batch_labels = next(batch_iterator)
    print(f"Input Shape: {batch_imgs.shape}")
    print(f"Label Shape: {batch_labels.shape}")

Input Shape: (128, 28, 28)
Label Shape: (128,)

Visualization

Render the first image from the batched data fetched by TileDB-ML loaders:

PyTorch
TensorFlow

# Take the first image of the batch fetched by the dataloader.
image = batch_imgs[0]
# Select the first cell of a 1x2 subplot grid as the current axes.
plt.subplot(1, 2, 1)
# Draw the image on the current axes with a grayscale colormap.
plt.imshow(image, cmap="gray")

# Take the first image of the batch fetched from the batched dataset.
image = batch_imgs[0]
# Select the first cell of a 1x2 subplot grid as the current axes.
plt.subplot(1, 2, 1)
# Draw the image on the current axes with a grayscale colormap.
plt.imshow(image, cmap="gray")