Skip to content

dataframe

DataFrame(data, *, data_hash=None)

Bases: OfflineEnvironment

A dataset environment.

This environment represents static tabular datasets.

Attributes:

Name Type Description
data LazyFrame

The data to represent.

Initialize the dataset environment.

Parameters:

Name Type Description Default
data DataFrame | LazyFrame

The data to represent.

required
data_hash bytes | None

The hash of the data. If None, it will be computed from the dataframe, which is potentially slow and expensive.

None
Source code in src/flowcean/polars/environments/dataframe.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def __init__(
    self,
    data: pl.DataFrame | pl.LazyFrame,
    *,
    data_hash: bytes | None = None,
) -> None:
    """Create an environment backed by a polars frame.

    Args:
        data: Eager or lazy frame to represent. An eager frame is stored
            in its lazy form and its row count is recorded right away.
        data_hash: Precomputed hash of the data. If None, it will be
            computed from the dataframe, which is potentially slow and
            expensive.
    """
    if not isinstance(data, pl.DataFrame):
        # Already lazy: store as-is; the row count is not known here.
        self.data = data
    else:
        self.data = data.lazy()
        # The size of an eager frame is available without extra work.
        self._length = len(data)

    self._hash = data_hash
    super().__init__()

from_csv(path, separator=',') classmethod

Load a dataset from a CSV file.

Parameters:

Name Type Description Default
path str | Path

Path to the CSV file.

required
separator str

Value separator. Defaults to ",".

','
Source code in src/flowcean/polars/environments/dataframe.py
54
55
56
57
58
59
60
61
62
63
64
@classmethod
def from_csv(cls, path: str | Path, separator: str = ",") -> Self:
    """Create a dataset from a CSV file.

    Leading and trailing whitespace is stripped from every column name.

    Args:
        path: Location of the CSV file.
        separator: Character separating the values. Defaults to ",".

    Returns:
        A new instance wrapping the scanned CSV data, with its hash
        obtained from `_hash_from_path(path)`.
    """
    frame = pl.scan_csv(path, separator=separator).rename(
        lambda name: name.strip(),
    )
    file_hash = _hash_from_path(path)
    return cls(frame, data_hash=file_hash)

from_json(path) classmethod

Load a dataset from a JSON file.

Parameters:

Name Type Description Default
path str | Path

Path to the JSON file.

required
Source code in src/flowcean/polars/environments/dataframe.py
66
67
68
69
70
71
72
73
74
@classmethod
def from_json(cls, path: str | Path) -> Self:
    """Create a dataset from a JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        A new instance wrapping the data read from the file, with its
        hash obtained from `_hash_from_path(path)`.
    """
    frame = pl.read_json(path)
    file_hash = _hash_from_path(path)
    return cls(frame, data_hash=file_hash)

from_parquet(path) classmethod

Load a dataset from a Parquet file.

Parameters:

Name Type Description Default
path str | Path

Path to the Parquet file.

required
Source code in src/flowcean/polars/environments/dataframe.py
76
77
78
79
80
81
82
83
84
@classmethod
def from_parquet(cls, path: str | Path) -> Self:
    """Create a dataset from a Parquet file.

    Args:
        path: Location of the Parquet file.

    Returns:
        A new instance wrapping the scanned Parquet data, with its hash
        obtained from `_hash_from_path(path)`.
    """
    frame = pl.scan_parquet(path)
    file_hash = _hash_from_path(path)
    return cls(frame, data_hash=file_hash)

from_yaml(path) classmethod

Load a dataset from a YAML file.

Parameters:

Name Type Description Default
path str | Path

Path to the YAML file.

required
Source code in src/flowcean/polars/environments/dataframe.py
86
87
88
89
90
91
92
93
94
@classmethod
def from_yaml(cls, path: str | Path) -> Self:
    """Load a dataset from a YAML file.

    Args:
        path: Path to the YAML file. Both strings and `Path` objects are
            accepted; strings are converted to `Path` before loading.

    Returns:
        A new instance wrapping the parsed YAML content, with its hash
        obtained from `_hash_from_path(path)`.
    """
    # Imported locally so this method does not rely on how the module
    # imports `Path` (e.g. only for type checking).
    from pathlib import Path

    # The YAML loader only opens path-like objects as files; a plain string
    # would be parsed as YAML document text instead of being read from
    # disk. Normalise to `Path` so string paths work as documented.
    yaml_path = Path(path)
    data = pl.LazyFrame(YAML(typ="safe").load(yaml_path))
    return cls(data, data_hash=_hash_from_path(path))

from_uri(uri) classmethod

Load a dataset from a URI.

Parameters:

Name Type Description Default
uri str

The URI to load the dataset from.

required
Source code in src/flowcean/polars/environments/dataframe.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
@classmethod
def from_uri(cls, uri: str) -> Self:
    """Create a dataset from a URI, choosing the loader by file suffix.

    Args:
        uri: The URI to load the dataset from. It is turned into a path
            with `_file_uri_to_path`.

    Returns:
        The dataset produced by the loader matching the path's suffix.

    Raises:
        UnsupportedFileTypeError: If the suffix is none of ".csv",
            ".json", ".parquet", ".yaml" or ".yml".
    """
    path = _file_uri_to_path(uri)
    suffix = path.suffix

    # Map each supported suffix to the alternate constructor handling it.
    loaders = {
        ".csv": cls.from_csv,
        ".json": cls.from_json,
        ".parquet": cls.from_parquet,
        ".yaml": cls.from_yaml,
        ".yml": cls.from_yaml,
    }
    loader = loaders.get(suffix)
    if loader is None:
        raise UnsupportedFileTypeError(suffix)
    return loader(path)

__len__()

Return the number of samples in the dataset.

Source code in src/flowcean/polars/environments/dataframe.py
127
128
129
130
131
132
133
134
135
def __len__(self) -> int:
    """Return the number of samples in the dataset.

    The count is cached in `self._length`, so the lazy frame is only
    evaluated the first time the length is requested.
    """
    if self._length is not None:
        return self._length

    # This operation is potentially very slow / costly
    row_count = self.data.select(pl.len()).collect().item()
    self._length = cast(int, row_count)
    return self._length

InvalidUriSchemeError(scheme)

Bases: Exception

Exception raised when a URI scheme is invalid.

Initialize the InvalidUriSchemeError.

Parameters:

Name Type Description Default
scheme str

Invalid URI scheme.

required
Source code in src/flowcean/polars/environments/dataframe.py
194
195
196
197
198
199
200
201
202
def __init__(self, scheme: str) -> None:
    """Build the error for a URI whose scheme is not supported.

    Args:
        scheme: The offending URI scheme; it is quoted in the message.
    """
    message = (
        f"only file URIs can be converted to a path, but got `{scheme}`"
    )
    super().__init__(message)

UnsupportedFileTypeError(suffix)

Bases: Exception

Exception raised when a file type is not supported.

Initialize the UnsupportedFileTypeError.

Parameters:

Name Type Description Default
suffix str

File type suffix.

required
Source code in src/flowcean/polars/environments/dataframe.py
208
209
210
211
212
213
214
def __init__(self, suffix: str) -> None:
    """Build the error for a file suffix that has no loader.

    Args:
        suffix: The unsupported file type suffix; it is quoted in the
            message.
    """
    message = f"file type `{suffix}` is not supported"
    super().__init__(message)

collect(environment, n=None, *, progress_bar=True)

Collect data from an environment.

Parameters:

Name Type Description Default
environment Iterable[LazyFrame] | Collection[LazyFrame]

The environment to collect data from.

required
n int | None

Number of samples to collect. If None, all samples are collected.

None
progress_bar bool | dict[str, Any]

Whether to show a progress bar. If a dictionary is provided, it will be passed to the progress bar.

True

Returns:

Type Description
DataFrame

The collected dataset.

Source code in src/flowcean/polars/environments/dataframe.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
def collect(
    environment: Iterable[pl.LazyFrame] | Collection[pl.LazyFrame],
    n: int | None = None,
    *,
    progress_bar: bool | dict[str, Any] = True,
) -> DataFrame:
    """Collect data from an environment.

    Args:
        environment: The environment to collect data from.
        n: Number of samples to collect. If None, all samples are collected.
        progress_bar: Whether to show a progress bar. If a dictionary is
            provided, it will be passed to the progress bar as keyword
            arguments. Missing "desc" and "total" entries are filled in
            with defaults; the dictionary itself is not modified.

    Returns:
        The collected dataset.
    """
    # Take at most the first `n` samples (all of them when n is None).
    samples = islice(environment, n)

    # Work out the progress bar length, if it can be known up front.
    if n is not None:
        total = n
    elif isinstance(environment, Collection):
        total = len(environment)
    else:
        total = None

    if isinstance(progress_bar, dict):
        # Merge the defaults into a fresh dict instead of mutating the
        # caller's dictionary; keys the caller provided take precedence.
        tqdm_kwargs = {
            "desc": "Collecting samples",
            "total": total,
            **progress_bar,
        }
        samples = tqdm(samples, **tqdm_kwargs)
    elif progress_bar:
        samples = tqdm(samples, desc="Collecting samples", total=total)

    data = pl.concat(samples, how="vertical")
    return DataFrame(data)