dataeval.data.Metadata

class dataeval.data.Metadata(dataset, *, continuous_factor_bins=None, auto_bin_method='uniform_width', exclude=None, include=None)

Class containing binned metadata using Polars DataFrames.

Parameters:
dataset : ImageClassificationDataset or ObjectDetectionDataset

Dataset to access original targets and metadata from.

continuous_factor_bins : Mapping[str, int | Sequence[float]] | None, default None

Mapping from continuous factor name to the number of bins or the bin edges to use for that factor.

auto_bin_method : Literal["uniform_width", "uniform_count", "clusters"], default "uniform_width"

Method for automatically determining the number of bins for continuous factors when continuous_factor_bins is not defined.

exclude : Sequence[str] | None, default None

Filter metadata factors to exclude the specified factors. Cannot be set together with include.

include : Sequence[str] | None, default None

Filter metadata factors to include the specified factors. Cannot be set together with exclude.

add_factors(factors)

Add additional factors to the metadata.

The number of measures per factor must match either the number of images or the number of detections in the dataset.

Parameters:
factors : Mapping[str, Array | Sequence[Any]]

Dictionary of factors to add to the metadata.

Return type:

None

get_factors_by_type(factor_type)

Get the names of factors of a specific type.

Parameters:
factor_type : Literal["categorical", "continuous", "discrete"]

The type of factors to retrieve.

Returns:

List of factor names of the specified type.

Return type:

list[str]

property auto_bin_method : 'uniform_width' | 'uniform_count' | 'clusters'

Binning method to use when continuous_factor_bins is not defined.

Return type:

Literal['uniform_width', 'uniform_count', 'clusters']

property class_labels : numpy.typing.NDArray[numpy.intp]

Class labels as a NumPy array.

Return type:

numpy.typing.NDArray[numpy.intp]

property class_names : collections.abc.Sequence[str]

Class names as a list of strings.

Return type:

Sequence[str]

property continuous_factor_bins : Mapping[str, int | collections.abc.Sequence[float]]

Map of factor names to bin counts or bin edges.

Return type:

Mapping[str, int | Sequence[float]]

property dataframe : polars.DataFrame

Dataframe containing target information and metadata factors.

Return type:

polars.DataFrame

property discretized_data : numpy.typing.NDArray[numpy.int64]

Factor data with continuous data discretized.

Return type:

numpy.typing.NDArray[numpy.int64]

property dropped_factors : Mapping[str, collections.abc.Sequence[str]]

Factors that were dropped during preprocessing and the reasons why they were dropped.

Return type:

Mapping[str, Sequence[str]]

property exclude : set[str]

Factors to exclude from the metadata.

Return type:

set[str]

property factor_data : numpy.typing.NDArray[Any]

Factor data as a NumPy array.

Return type:

numpy.typing.NDArray[Any]

property factor_info : Mapping[str, FactorInfo]

Factor types of the metadata.

Return type:

Mapping[str, FactorInfo]

property factor_names : collections.abc.Sequence[str]

Factor names of the metadata.

Return type:

Sequence[str]

property image_indices : numpy.typing.NDArray[numpy.intp]

Indices of images as a NumPy array.

Return type:

numpy.typing.NDArray[numpy.intp]

property include : set[str]

Factors to include in the metadata.

Return type:

set[str]

property raw : collections.abc.Sequence[Mapping[str, Any]]

The raw list of metadata dictionaries for the dataset.

Return type:

Sequence[Mapping[str, Any]]