Skip to content

dataset

Dataset(data)

Bases: OfflineEnvironment

A dataset environment.

This environment represents static tabular datasets.

Attributes:

Name Type Description
data LazyFrame

The data to represent.

Initialize the dataset environment.

Parameters:

Name Type Description Default
data DataFrame | LazyFrame

The data to represent.

required
Source code in src/flowcean/environments/dataset.py
25
26
27
28
29
30
31
32
33
34
35
36
def __init__(self, data: pl.DataFrame | pl.LazyFrame) -> None:
    """Initialize the dataset environment.

    The data is always stored as a lazy frame. For an eager ``DataFrame``
    the row count is already known and is cached immediately; for a
    ``LazyFrame`` the cached length is set to ``None`` so that ``__len__``
    computes it on first use.

    Args:
        data: The data to represent.
    """
    if isinstance(data, pl.DataFrame):
        self.data = data.lazy()
        self._length = len(data)
    else:
        self.data = data
        # Set explicitly so that `__len__` never depends on a class-level
        # default being present for the lazy case.
        self._length = None
    super().__init__()

__len__()

Return the number of samples in the dataset.

Source code in src/flowcean/environments/dataset.py
42
43
44
45
46
47
48
49
50
def __len__(self) -> int:
    """Return the number of samples in the dataset.

    A previously determined length is returned directly; otherwise the
    length is computed from the lazy data and stored for later calls.
    """
    known_length = self._length
    if known_length is not None:
        return known_length

    # Counting the rows collects a lazy query, which is potentially
    # very slow / costly.
    row_count = self.data.select(pl.len()).collect().item()
    self._length = cast(int, row_count)
    return self._length

collect(environment, n=None, *, progress_bar=True)

Collect data from an environment.

Parameters:

Name Type Description Default
environment Iterable[LazyFrame] | Collection[LazyFrame]

The environment to collect data from.

required
n int | None

Number of samples to collect. If None, all samples are collected.

None
progress_bar bool | dict[str, Any]

Whether to show a progress bar. If a dictionary is provided, its entries are passed to the progress bar (tqdm) as keyword arguments.

True

Returns:

Type Description
Dataset

The collected dataset.

Source code in src/flowcean/environments/dataset.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def collect(
    environment: Iterable[pl.LazyFrame] | Collection[pl.LazyFrame],
    n: int | None = None,
    *,
    progress_bar: bool | dict[str, Any] = True,
) -> Dataset:
    """Collect data from an environment.

    Args:
        environment: The environment to collect data from.
        n: Number of samples to collect. If None, all samples are collected.
        progress_bar: Whether to show a progress bar. If a dictionary is
            provided, its entries are passed to the progress bar as keyword
            arguments. Missing ``desc`` and ``total`` entries are filled in
            with defaults; the dictionary itself is not modified.

    Returns:
        The collected dataset.
    """
    from flowcean.environments.dataset import Dataset

    samples = islice(environment, n)

    # Expected number of samples; only used as the progress bar total.
    if n is not None:
        total = n
    elif isinstance(environment, Collection):
        total = len(environment)
    else:
        total = None

    if isinstance(progress_bar, dict):
        # Build a fresh mapping rather than calling `setdefault` on the
        # caller's dictionary, so a dictionary reused across calls does not
        # carry over a stale `total`. Caller-supplied keys take precedence.
        tqdm_kwargs = {
            "desc": "Collecting samples",
            "total": total,
            **progress_bar,
        }
        samples = tqdm(samples, **tqdm_kwargs)
    elif progress_bar:
        samples = tqdm(samples, desc="Collecting samples", total=total)

    # NOTE(review): behavior for an environment that yields no samples
    # depends on how `pl.concat` treats an empty iterable -- confirm.
    data = pl.concat(samples, how="vertical")
    return Dataset(data)