Source code for movement.validators.datasets
"""``attrs`` classes for validating data structures."""
import warnings
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
import attrs
import numpy as np
import xarray as xr
from attrs import converters, define, field, validators
from movement.utils.logging import logger
if TYPE_CHECKING:
from numpy.typing import NDArray
def _convert_to_list_of_str(value: str | Iterable[Any]) -> list[str]:
"""Try to coerce the value into a list of strings."""
if isinstance(value, str):
warnings.warn(
f"Expected a list of strings, but got a string ({value}). "
"Converting to a list of length 1.",
UserWarning,
stacklevel=2,
)
return [value]
elif isinstance(value, Iterable):
return [str(item) for item in value]
else:
raise logger.error(
ValueError(f"Invalid value ({value}). Expected a list of strings.")
)
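

# Illustrative behaviour of the converter above (a sketch, derived from the
# branches of the function, not part of the validation logic itself): a bare
# string is wrapped rather than iterated over, while any other iterable has
# its items coerced to ``str``.
#
#   _convert_to_list_of_str("abc")      # -> ["abc"], with a UserWarning
#   _convert_to_list_of_str(("a", 1))   # -> ["a", "1"]
#   _convert_to_list_of_str(42)         # raises ValueError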
def _convert_fps_to_none_if_invalid(fps: float | None) -> float | None:
"""Set fps to None if a non-positive float is passed."""
if fps is not None and fps <= 0:
warnings.warn(
f"Invalid fps value ({fps}). Expected a positive number. "
"Setting fps to None.",
UserWarning,
stacklevel=2,
)
return None
return fps
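

# For example, the fps converter pipeline turns invalid inputs into None:
#
#   _convert_fps_to_none_if_invalid(25.0)   # -> 25.0
#   _convert_fps_to_none_if_invalid(-5.0)   # -> None, with a UserWarning
#   _convert_fps_to_none_if_invalid(None)   # -> None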
@define(kw_only=True)
class _BaseDatasetInputs(ABC):
"""Abstract base class for validating ``movement`` dataset inputs.
This base class centralises shared fields, validators, and default
assignment logic for creating ``movement`` datasets
(e.g. poses, bounding boxes).
It registers the attrs validators for required fields like
``position_array`` and optional fields like ``confidence_array`` and
``individual_names``.
Subclasses must implement ``to_dataset()`` and define class variables
``DIM_NAMES``, ``VAR_NAMES``, and ``_ALLOWED_SPACE_DIM_SIZE``.
"""
# --- Required fields ---
position_array: np.ndarray = field(
validator=validators.instance_of(np.ndarray)
)
# --- Optional fields ---
confidence_array: np.ndarray | None = field(
default=None,
validator=validators.optional(validators.instance_of(np.ndarray)),
)
individual_names: list[str] | None = field(
default=None,
converter=converters.optional(_convert_to_list_of_str),
)
fps: float | None = field(
default=None,
converter=converters.pipe( # type: ignore
converters.optional(float), _convert_fps_to_none_if_invalid
),
)
source_software: str | None = field(
default=None,
validator=validators.optional(validators.instance_of(str)),
)
# --- Required class variables (to be defined by subclasses) ---
DIM_NAMES: ClassVar[tuple[str, ...]]
VAR_NAMES: ClassVar[tuple[str, ...]]
_ALLOWED_SPACE_DIM_SIZE: ClassVar[int | Iterable[int]]
# --- Lifecycle hooks ---
def __attrs_post_init__(self):
"""Assign default values to optional attributes (if None)."""
# confidence_array default: array of NaNs with appropriate shape
if self.confidence_array is None:
self.confidence_array = np.full(
self._confidence_expected_shape, np.nan, dtype="float32"
)
            logger.info(
                "Confidence array was not provided. "
                "Setting to an array of NaNs."
            )
# individual_names default: id_0, id_1, ...
if self.individual_names is None and "individuals" in self.DIM_NAMES:
n_inds = self.position_array.shape[
self.DIM_NAMES.index("individuals")
]
self.individual_names = [f"id_{i}" for i in range(n_inds)]
logger.info(
"Individual names were not provided. "
f"Setting to {self.individual_names}."
)
# --- Properties (derived attributes) ---
@property
def _confidence_expected_shape(self):
"""Return expected shape for confidence_array."""
# confidence shape == position_array shape without the space dim
return tuple(
dim
for i, dim in enumerate(self.position_array.shape)
if i != self.DIM_NAMES.index("space")
)
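
    # For instance (derived from the property above): with the poses
    # DIM_NAMES ("time", "space", "keypoints", "individuals") and a
    # position_array of shape (100, 2, 3, 5), the expected confidence
    # shape is (100, 3, 5): the "space" axis is dropped.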
# --- Validators ---
@position_array.validator
def _validate_position_array(self, attribute, value):
"""Raise ValueError if array dimensions are unexpected."""
# Check array dimensions match the number of DIM_NAMES
expected_ndim = len(self.DIM_NAMES)
if value.ndim != expected_ndim:
raise logger.error(
ValueError(
f"Expected '{attribute.name}' to have "
f"{expected_ndim} dimensions, but got {value.ndim}."
)
)
# Check size of 'space' dimension
allowed_axis_size = self._ALLOWED_SPACE_DIM_SIZE
space_dim_size = value.shape[self.DIM_NAMES.index("space")]
if not isinstance(allowed_axis_size, Iterable):
allowed_axis_size = (allowed_axis_size,)
if space_dim_size not in allowed_axis_size:
allowed_dims_str = " or ".join(
str(dim) for dim in allowed_axis_size
)
raise logger.error(
ValueError(
f"Expected '{attribute.name}' to have {allowed_dims_str} "
f"spatial dimensions, but got {space_dim_size}."
)
)
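
    # E.g. for a poses dataset (4 dims, space size 2 or 3), an array of
    # shape (100, 4, 3, 2) passes the ndim check but fails here, since its
    # "space" axis has size 4. (Illustrative shapes, not from the original
    # module.)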
@confidence_array.validator
def _validate_confidence_array(self, attribute, value):
"""Check confidence_array type and shape."""
if value is not None:
expected_shape = self._confidence_expected_shape
self._validate_array_shape(
attribute, value, expected_shape=expected_shape
)
@individual_names.validator
def _validate_individual_names(self, attribute, value):
"""Validate individual_names length and uniqueness."""
if value is not None:
individuals_dim_index = self.DIM_NAMES.index("individuals")
self._validate_list_length(
attribute,
value,
self.position_array.shape[individuals_dim_index],
)
self._validate_list_uniqueness(attribute, value)
# --- Utility methods ---
@staticmethod
def _validate_array_shape(
attribute: attrs.Attribute, value: np.ndarray, expected_shape: tuple
):
"""Raise ValueError if the value does not have the expected shape."""
if value.shape != expected_shape:
raise logger.error(
ValueError(
f"Expected '{attribute.name}' to have shape "
f"{expected_shape}, but got {value.shape}."
)
)
@staticmethod
def _validate_list_length(
attribute: attrs.Attribute, value: list | None, expected_length: int
):
"""Raise a ValueError if the list does not have the expected length."""
if value is not None and len(value) != expected_length:
raise logger.error(
ValueError(
f"Expected '{attribute.name}' to have "
f"length {expected_length}, but got {len(value)}."
)
)
@staticmethod
def _validate_list_uniqueness(
attribute: attrs.Attribute, value: list | None
):
"""Raise a ValueError if the list does not have unique elements."""
if value is not None and len(value) != len(set(value)):
raise logger.error(
ValueError(
f"Elements in '{attribute.name}' are not unique. "
f"There are {len(value)} elements in the list, but "
f"only {len(set(value))} are unique."
)
)
@abstractmethod
def to_dataset(self) -> xr.Dataset:
"""Convert validated inputs to a ``movement`` xarray.Dataset.
Returns
-------
xarray.Dataset
``movement`` dataset containing the validated data and metadata.
"""
...
@classmethod
def validate(cls, ds: xr.Dataset) -> None:
"""Validate that the dataset has the required variables and dimensions.
Parameters
----------
ds : xarray.Dataset
Dataset to validate.
Raises
------
TypeError
If the input is not an xarray Dataset.
ValueError
If the dataset is missing required data variables or dimensions
for a valid ``movement`` dataset.
"""
if not isinstance(ds, xr.Dataset):
raise logger.error(
TypeError(f"Expected an xarray Dataset, but got {type(ds)}.")
)
missing_vars = set(cls.VAR_NAMES) - set(
cast("Iterable[str]", ds.data_vars.keys())
)
if missing_vars:
raise logger.error(
ValueError(
f"Missing required data variables: {sorted(missing_vars)}"
)
) # sort for a reproducible error message
# Ignore type error - ds.dims will soon return a set of dim names
missing_dims = set(cls.DIM_NAMES) - set(ds.dims) # type: ignore[arg-type]
if missing_dims:
raise logger.error(
ValueError(
f"Missing required dimensions: {sorted(missing_dims)}"
)
) # sort for a reproducible error message
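

# A minimal usage sketch for the structural check above: any concrete
# subclass (e.g. ValidPosesInputs, defined below) exposes it as a
# classmethod, where ds is assumed to be an existing xarray.Dataset.
#
#   ValidPosesInputs.validate(ds)   # raises ValueError if the "position"
#                                   # or "confidence" variables, or any
#                                   # required dimension, is missing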
@define(kw_only=True)
class ValidPosesInputs(_BaseDatasetInputs):
"""Class for validating input data for a ``movement poses`` dataset.
The validator ensures that within the ``movement poses`` dataset:
- The required ``position_array`` is a numpy array
with the ``space`` dimension containing 2 or 3 spatial coordinates.
- The optional ``confidence_array``, if provided, is a numpy array
with its shape matching that of the ``position_array``,
excluding the ``space`` dimension;
otherwise, it defaults to an array of NaNs.
- The optional ``individual_names`` and ``keypoint_names``,
if provided, match the number of individuals and keypoints
in the dataset, respectively; otherwise, default names are assigned.
- The optional ``fps`` is a positive float; otherwise, it defaults to None.
- The optional ``source_software`` is a string; otherwise,
it defaults to None.
Attributes
----------
position_array : np.ndarray
Array of shape (n_frames, n_space, n_keypoints, n_individuals)
containing the poses.
confidence_array : np.ndarray, optional
Array of shape (n_frames, n_keypoints, n_individuals) containing
the point-wise confidence scores.
If None (default), the scores will be set to an array of NaNs.
individual_names : list of str, optional
List of unique names for the individuals in the video. If None
(default), the individuals will be named "id_0", "id_1", etc.
keypoint_names : list of str, optional
List of unique names for the keypoints in the skeleton. If None
(default), the keypoints will be named "keypoint_0", "keypoint_1",
etc.
fps : float, optional
Frames per second of the video. Defaults to None.
source_software : str, optional
Name of the software from which the poses were loaded.
Defaults to None.
Raises
------
ValueError
If the dataset does not meet the ``movement poses``
dataset requirements.
"""
keypoint_names: list[str] | None = field(
default=None,
converter=converters.optional(_convert_to_list_of_str),
)
DIM_NAMES: ClassVar[tuple[str, ...]] = (
"time",
"space",
"keypoints",
"individuals",
)
VAR_NAMES: ClassVar[tuple[str, ...]] = ("position", "confidence")
_ALLOWED_SPACE_DIM_SIZE: ClassVar[Iterable[int]] = (2, 3)
@keypoint_names.validator
def _validate_keypoint_names(self, attribute, value):
"""Validate keypoint_names length and uniqueness."""
keypoints_dim_index = self.DIM_NAMES.index("keypoints")
self._validate_list_length(
attribute, value, self.position_array.shape[keypoints_dim_index]
)
self._validate_list_uniqueness(attribute, value)
def __attrs_post_init__(self):
"""Assign default values to optional attributes (if None)."""
super().__attrs_post_init__()
position_array_shape = self.position_array.shape
keypoints_dim_index = self.DIM_NAMES.index("keypoints")
if self.keypoint_names is None:
self.keypoint_names = [
f"keypoint_{i}"
for i in range(position_array_shape[keypoints_dim_index])
]
logger.info(
"Keypoint names were not provided. "
f"Setting to {self.keypoint_names}."
)
def to_dataset(self) -> xr.Dataset:
"""Convert validated poses inputs to a ``movement poses`` dataset.
Returns
-------
xarray.Dataset
``movement`` dataset containing the pose tracks, confidence scores,
and associated metadata.
"""
n_frames = self.position_array.shape[0]
n_space = self.position_array.shape[1]
dataset_attrs: dict[str, str | float | None] = {
"source_software": self.source_software,
"ds_type": "poses",
}
# Create the time coordinate, depending on the value of fps
time_coords: NDArray[np.floating] | NDArray[np.integer]
time_unit: Literal["seconds", "frames"]
if self.fps is not None:
time_coords = np.arange(n_frames, dtype=np.float64) / self.fps
time_unit = "seconds"
dataset_attrs["fps"] = self.fps
else:
time_coords = np.arange(n_frames, dtype=np.int64)
time_unit = "frames"
dataset_attrs["time_unit"] = time_unit
DIM_NAMES = self.DIM_NAMES
# Convert data to an xarray.Dataset
return xr.Dataset(
data_vars={
"position": xr.DataArray(self.position_array, dims=DIM_NAMES),
"confidence": xr.DataArray(
self.confidence_array, dims=DIM_NAMES[:1] + DIM_NAMES[2:]
),
},
coords={
DIM_NAMES[0]: time_coords,
DIM_NAMES[1]: ["x", "y", "z"][:n_space],
DIM_NAMES[2]: self.keypoint_names,
DIM_NAMES[3]: self.individual_names,
},
attrs=dataset_attrs,
)
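

# A minimal end-to-end sketch (illustrative shapes and values, not from the
# original module): validate a (time, space, keypoints, individuals) array
# and convert it to a poses dataset.
#
#   rng = np.random.default_rng(seed=42)
#   ds = ValidPosesInputs(
#       position_array=rng.random((100, 2, 3, 2)),
#       fps=25.0,
#   ).to_dataset()
#   # ds["position"] has dims ("time", "space", "keypoints", "individuals");
#   # time coordinates are in seconds, keypoints default to "keypoint_0"
#   # ... "keypoint_2", and individuals default to "id_0", "id_1".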
@define(kw_only=True)
class ValidBboxesInputs(_BaseDatasetInputs):
"""Class for validating input data for a ``movement bboxes`` dataset.
The validator considers 2D bounding boxes only. It ensures that
within the ``movement bboxes`` dataset:
- The required ``position_array`` and ``shape_array`` are numpy arrays,
with the ``space`` dimension containing 2 spatial coordinates.
- The optional ``confidence_array``, if provided, is a numpy array
with its shape matching that of the ``position_array``,
excluding the ``space`` dimension;
otherwise, it defaults to an array of NaNs.
- The optional ``individual_names``, if provided, match the number of
individuals in the dataset; otherwise, default names are assigned.
- The optional ``frame_array``, if provided, is a column vector
with the frame numbers; otherwise, it defaults to an array of
0-based integers.
- The optional ``fps`` is a positive float; otherwise, it defaults to None.
- The optional ``source_software`` is a string; otherwise, it defaults to
None.
Attributes
----------
position_array : np.ndarray
Array of shape (n_frames, n_space, n_individuals)
containing the tracks of the bounding box centroids.
shape_array : np.ndarray
Array of shape (n_frames, n_space, n_individuals)
containing the shape of the bounding boxes. The shape of a bounding
box is its width (extent along the x-axis of the image) and height
(extent along the y-axis of the image).
confidence_array : np.ndarray, optional
Array of shape (n_frames, n_individuals) containing
the confidence scores of the bounding boxes. If None (default), the
confidence scores are set to an array of NaNs.
individual_names : list of str, optional
List of individual names for the tracked bounding boxes in the video.
If None (default), bounding boxes are assigned names based on the size
of the ``position_array``. The names will be in the format of
``id_<N>``, where <N> is an integer from 0 to
``position_array.shape[1]-1``.
frame_array : np.ndarray, optional
Array of shape (n_frames, 1) containing the frame numbers for which
bounding boxes are defined. If None (default), frame numbers will
be assigned based on the first dimension of the ``position_array``,
starting from 0.
fps : float, optional
Frames per second defining the sampling rate of the data.
Defaults to None.
source_software : str, optional
Name of the software that generated the data. Defaults to None.
Raises
------
ValueError
If the dataset does not meet the ``movement bboxes`` dataset
requirements.
"""
shape_array: np.ndarray = field(
validator=validators.instance_of(np.ndarray)
)
frame_array: np.ndarray | None = field(
default=None,
validator=validators.optional(validators.instance_of(np.ndarray)),
)
DIM_NAMES: ClassVar[tuple[str, ...]] = ("time", "space", "individuals")
VAR_NAMES: ClassVar[tuple[str, ...]] = ("position", "shape", "confidence")
_ALLOWED_SPACE_DIM_SIZE: ClassVar[int] = 2
@shape_array.validator
def _validate_shape_array(self, attribute, value):
"""Validate shape_array dimensions and shape."""
super()._validate_position_array(attribute, value)
# Shape must match that of position_array
self._validate_array_shape(
attribute, value, expected_shape=self.position_array.shape
)
@frame_array.validator
def _validate_frame_array(self, attribute, value):
"""Validate frame_array type, shape, and monotonicity."""
if value is not None:
# should be a column vector (n_frames, 1)
time_dim_index = self.DIM_NAMES.index("time")
self._validate_array_shape(
attribute,
value,
expected_shape=(self.position_array.shape[time_dim_index], 1),
)
            # check frame numbers increase by at least 1
            # between consecutive rows
if not np.all(np.diff(value, axis=0) >= 1):
raise logger.error(
ValueError(
f"Frame numbers in {attribute.name} are "
"not monotonically increasing."
)
)
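
    # E.g. (illustrative values): a frame_array of [[0], [1], [3]] passes,
    # since gaps are allowed as long as frame numbers strictly increase,
    # while [[0], [2], [1]] raises the ValueError above.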
def __attrs_post_init__(self):
"""Assign default values to optional attributes (if None)."""
super().__attrs_post_init__()
# assign default frame_array
if self.frame_array is None:
time_dim_index = self.DIM_NAMES.index("time")
n_frames = self.position_array.shape[time_dim_index]
self.frame_array = np.arange(n_frames).reshape(-1, 1)
logger.info(
"Frame numbers were not provided. "
"Setting to an array of 0-based integers."
)
def to_dataset(self) -> xr.Dataset:
"""Convert validated bboxes inputs to a ``movement bboxes`` dataset.
Returns
-------
xarray.Dataset
``movement`` dataset containing the bounding boxes tracks,
shapes, confidence scores and associated metadata.
"""
dataset_attrs: dict[str, str | float | None] = {
"source_software": self.source_software,
"ds_type": "bboxes",
}
# Ignore type error as ValidBboxesInputs ensures
# `frame_array` is not None
time_coords: NDArray[np.floating] | NDArray[np.integer] = (
self.frame_array.squeeze() # type: ignore[union-attr]
)
time_unit: Literal["seconds", "frames"] = "frames"
# if fps is provided:
# time_coords is expressed in seconds, with the time origin
# set as frame 0 == time 0 seconds
# Store fps as a dataset attribute
if self.fps:
# Compute elapsed time from frame 0.
time_coords = time_coords / self.fps
time_unit = "seconds"
dataset_attrs["fps"] = self.fps
dataset_attrs["time_unit"] = time_unit
# Convert data to an xarray.Dataset
# with dimensions ('time', 'space', 'individuals')
DIM_NAMES = self.DIM_NAMES
n_space = self.position_array.shape[1]
return xr.Dataset(
data_vars={
"position": xr.DataArray(self.position_array, dims=DIM_NAMES),
"shape": xr.DataArray(self.shape_array, dims=DIM_NAMES),
"confidence": xr.DataArray(
self.confidence_array, dims=DIM_NAMES[:1] + DIM_NAMES[2:]
),
},
coords={
DIM_NAMES[0]: time_coords,
DIM_NAMES[1]: ["x", "y", "z"][:n_space],
DIM_NAMES[2]: self.individual_names,
},
attrs=dataset_attrs,
)
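

# A minimal end-to-end sketch (illustrative shapes, not from the original
# module): validate centroid and shape arrays for 3 tracked boxes over
# 10 frames, then convert them to a bboxes dataset.
#
#   ds = ValidBboxesInputs(
#       position_array=np.zeros((10, 2, 3)),  # (time, space, individuals)
#       shape_array=np.ones((10, 2, 3)),      # widths and heights
#   ).to_dataset()
#   # With fps=None, ds["time"] holds 0-based frame numbers and
#   # ds.attrs["time_unit"] == "frames".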