Skip to content

match_sampling_rate

MatchSamplingRate(reference_feature_name, feature_interpolation_map, fill_strategy='both_ways')

Bases: Transform

Matches the sampling rate of all time series in the DataFrame.

Interpolates the time series to match the sampling rate of the reference time series. The below example shows the usage of a MatchSamplingRate transform in a run.py file. Assuming the loaded data is represented by the table:

| feature_a                   | feature_b                   | const |
| ---                         | ---                         | ---   |
| list[struct[time,struct[]]] | list[struct[time,struct[]]] | int   |
| --------------------------- | --------------------------- | ----- |
| [{12:26:01.0, {1.2}},       | [{12:26:00.0, {1.0}},       | 1     |
|  {12:26:02.0, {2.4}},       |  {12:26:05.0, {2.0}}]       |       |
|  {12:26:03.0, {3.6}},       |                             |       |
|  {12:26:04.0, {4.8}}]       |                             |       |

The following transform can be used to match the sampling rate of the time series feature_b to the sampling rate of the time series feature_a.

    ...
    environment.load()
    data = environment.get_data()
    transform = MatchSamplingRate(
        reference_feature_name="feature_a",
        feature_interpolation_map={
            "feature_b": "linear",
        },
    )
    transformed_data = transform.transform(data)
    ...

The resulting DataFrame after the transform is:

| feature_a                   | feature_b                   | const |
| ---                         | ---                         | ---   |
| list[struct[time,struct[]]] | list[struct[time,struct[]]] | int   |
| --------------------------- | --------------------------- | ----- |
| [{12:26:01.0, {1.2}},       | [{12:26:01.0, {1.2}},       | 1     |
|  {12:26:02.0, {2.4}},       |  {12:26:02.0, {1.4}},       |       |
|  {12:26:03.0, {3.6}},       |  {12:26:03.0, {1.6}},       |       |
|  {12:26:04.0, {4.8}}]       |  {12:26:04.0, {1.8}}]       |       |

Initialize the transform.

Parameters:

Name Type Description Default
reference_feature_name str

Reference timeseries feature.

required
feature_interpolation_map dict[str, MatchSamplingRateMethod]

Key-value pairs of the timeseries features that are targeted in interpolation columns and the interpolation method to use. The interpolation method can be 'linear' or 'nearest'.

required
fill_strategy FillStrategy

Strategy to fill missing values after interpolation.

'both_ways'
Source code in src/flowcean/polars/transforms/match_sampling_rate.py
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def __init__(
    self,
    reference_feature_name: str,
    feature_interpolation_map: dict[str, MatchSamplingRateMethod],
    fill_strategy: FillStrategy = "both_ways",
) -> None:
    """Set up the transform configuration.

    Args:
        reference_feature_name: Name of the timeseries feature whose
            sampling rate the other features are matched to.
        feature_interpolation_map: Maps each target timeseries feature
            to the interpolation method used for it. The method can be
            'linear' or 'nearest'.
        fill_strategy: How values still missing after interpolation are
            filled.
    """
    # Plain configuration storage; the work happens in apply().
    self.fill_strategy = fill_strategy
    self.feature_interpolation_map = feature_interpolation_map
    self.reference_feature_name = reference_feature_name

apply(data)

Transform the input DataFrame.

Parameters:

Name Type Description Default
data LazyFrame

Input DataFrame.

required

Returns:

Type Description
LazyFrame

Transformed DataFrame.

Source code in src/flowcean/polars/transforms/match_sampling_rate.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def apply(self, data: pl.LazyFrame) -> pl.LazyFrame:
    """Transform the input DataFrame.

    Each row is transformed independently via ``_transform_row`` and the
    results are stacked back together; constant (non-timeseries) columns
    are preserved by the per-row transform.

    Args:
        data: Input DataFrame.

    Returns:
        Transformed DataFrame.

    """
    collected_data = data.collect()
    # Use `height` for the row count instead of materializing every row
    # into Python tuples via `rows()` just to call len() on the result.
    transformed_rows = [
        self._transform_row(collected_data.slice(i, 1))
        for i in range(collected_data.height)
    ]
    if not transformed_rows:
        # No input rows: return an empty frame, matching the previous
        # behavior of the untouched accumulator.
        return pl.DataFrame().lazy()
    # Stack all single-row results in one call rather than repeated
    # vstack inside the loop.
    return pl.concat(transformed_rows, how="vertical").lazy()

FeatureNotFoundError(feature)

Bases: Exception

Feature not found in the DataFrame.

This exception is raised when a feature is not found in the DataFrame.

Source code in src/flowcean/polars/transforms/match_sampling_rate.py
277
278
def __init__(self, feature: str) -> None:
    """Initialize the error with a message naming the missing feature."""
    message = f"{feature} not found"
    super().__init__(message)

interpolate_feature(target_feature_name, data, reference_feature, interpolation_method='linear', fill_strategy=None)

Interpolate a single time series feature using Polars expressions.

Source code in src/flowcean/polars/transforms/match_sampling_rate.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def interpolate_feature(
    target_feature_name: str,
    data: pl.DataFrame,
    reference_feature: pl.DataFrame,
    interpolation_method: Literal["linear", "nearest"] = "linear",
    fill_strategy: FillStrategy | None = None,
) -> pl.DataFrame:
    """Interpolate a single time series feature using Polars expressions.

    Args:
        target_feature_name: Name of the timeseries column to resample.
        data: DataFrame holding the target feature as a
            list[struct{time, value}] column.
        reference_feature: Unnested reference timeseries; its 'time'
            column defines the target sampling grid.
        interpolation_method: Method passed to Polars ``interpolate``.
        fill_strategy: Optional strategy for nulls remaining after
            interpolation. 'both_ways' applies a backward fill followed
            by a forward fill; any other truthy value is forwarded to
            ``fill_null`` unchanged.

    Returns:
        Single-column DataFrame with the interpolated feature re-imploded
        into its nested list[struct{time, value}] representation.

    Raises:
        ValueError: If the exploded feature has no 'value' field.
    """
    logger.debug("Interpolating feature %s", target_feature_name)

    # Extract and unnest the feature into flat time/value columns.
    feature_df = data.select(pl.col(target_feature_name).explode()).unnest(
        cs.all(),
    )
    # Guard clause replaces the previous if/else nesting.
    if "value" not in feature_df.columns:
        msg = f"Feature {target_feature_name} is missing 'value' field."
        raise ValueError(msg)
    # Wrap a scalar 'value' into a one-field struct so the scalar and
    # struct cases can be unnested uniformly below.
    if not isinstance(feature_df.schema["value"], pl.Struct):
        feature_df = feature_df.with_columns(
            pl.struct([pl.col("value").alias("value")]).alias("value"),
        )
    feature_df = feature_df.unnest("value")

    # Prefix every column except 'time' with the feature name to avoid
    # collisions when several features are processed.
    feature_df = feature_df.rename(
        lambda name: f"{target_feature_name}_{name}"
        if name != "time"
        else name,
    )

    # The interpolation grid is the sorted union of reference and
    # feature timestamps.
    reference_times = reference_feature.get_column("time")
    feature_times = feature_df.get_column("time")
    all_times = (
        pl.concat([reference_times, feature_times])
        .unique()
        .sort()
        .to_frame("time")
    )

    # Align feature samples onto the grid; unmatched times become null.
    joined_df = all_times.join(feature_df, on="time", how="left")
    value_columns = [col for col in feature_df.columns if col != "time"]

    # Interpolate interior gaps; leading/trailing nulls may remain.
    interpolated = joined_df.with_columns(
        [
            pl.col(col).interpolate(method=interpolation_method)
            for col in value_columns
        ],
    )

    # Resolve remaining nulls. 'both_ways' fills leading nulls backward
    # first, then trailing nulls forward — same order as before. This
    # replaces the previous duplicated fill blocks and no longer
    # reassigns the `fill_strategy` parameter.
    if fill_strategy == "both_ways":
        fill_passes: tuple[str, ...] = ("backward", "forward")
    elif fill_strategy:
        fill_passes = (fill_strategy,)
    else:
        fill_passes = ()
    for strategy in fill_passes:
        interpolated = interpolated.with_columns(
            [
                pl.col(col).fill_null(strategy=strategy)
                for col in value_columns
            ],
        )

    # Keep only the reference timestamps.
    interpolated = interpolated.filter(pl.col("time").is_in(reference_times))

    # If the original 'value' was a scalar, restore it as a scalar;
    # otherwise rebuild the value struct from the prefixed columns.
    is_scalar_value = (
        len(value_columns) == 1
        and value_columns[0] == f"{target_feature_name}_value"
    )
    if is_scalar_value:
        restructure_value = pl.col(value_columns[0]).alias("value")
    else:
        restructure_value = pl.struct(value_columns).alias("value")

    restructure = pl.struct(
        pl.col("time"),
        restructure_value,
    ).alias(target_feature_name)

    return interpolated.select(restructure).select(pl.all().implode())