Skip to content

random_forest

RandomForestRegressorLearner(n_estimators=100, *, criterion='squared_error', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None, monotonic_cst=None, callbacks=None)

Bases: SupervisedLearner

Wrapper class for sklearn's RandomForestRegressor.

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

Initialize the random forest learner.

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

Parameters:

Name Type Description Default
n_estimators int

Number of trees in the forest.

100
criterion str

Function to measure the quality of a split.

'squared_error'
max_depth int | None

Maximum depth of the tree.

None
min_samples_split int

Minimum number of samples required to split an internal node.

2
min_samples_leaf int

Minimum number of samples required to be at a leaf node.

1
min_weight_fraction_leaf float

Minimum weighted fraction of the sum total of weights required to be at a leaf node.

0.0
max_features float

Number of features to consider when looking for the best split.

1.0
max_leaf_nodes int | None

Grow trees with max_leaf_nodes in best-first fashion.

None
min_impurity_decrease float

A node will be split if this split induces a decrease of the impurity greater than or equal to this value.

0.0
bootstrap bool

Whether bootstrap samples are used when building trees.

True
oob_score bool

Whether to use out-of-bag samples to estimate the R^2 on unseen data.

False
n_jobs int | None

Number of jobs to run in parallel.

None
random_state int | None

Controls the randomness of the estimator.

None
verbose int

Controls the verbosity when fitting and predicting.

0
warm_start bool

When set to True, reuse the solution of the previous call to fit.

False
ccp_alpha float

Complexity parameter used for Minimal Cost-Complexity Pruning.

0.0
max_samples int | float | None

If bootstrap is True, the number of samples to draw from X to train each base estimator.

None
monotonic_cst NDArray | None

Monotonicity constraints.

None
callbacks list[LearnerCallback] | LearnerCallback | None

Optional callbacks for progress feedback. Use None for silent learning.

None
Source code in src/flowcean/sklearn/random_forest.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def __init__(
    self,
    n_estimators: int = 100,
    *,
    criterion: str = "squared_error",
    max_depth: int | None = None,
    min_samples_split: int = 2,
    min_samples_leaf: int = 1,
    min_weight_fraction_leaf: float = 0.0,
    max_features: float = 1.0,
    max_leaf_nodes: int | None = None,
    min_impurity_decrease: float = 0.0,
    bootstrap: bool = True,
    oob_score: bool = False,
    n_jobs: int | None = None,
    random_state: int | None = None,
    verbose: int = 0,
    warm_start: bool = False,
    ccp_alpha: float = 0.0,
    max_samples: int | float | None = None,  # noqa: PYI041
    monotonic_cst: NDArray | None = None,
    callbacks: list[LearnerCallback] | LearnerCallback | None = None,
) -> None:
    """Initialize the random forest learner.

    Reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

    Args:
        n_estimators: Number of trees in the forest.
        criterion: Function to measure the quality of a split.
        max_depth: Maximum depth of the tree.
        min_samples_split: Minimum number of samples required to split
            an internal node.
        min_samples_leaf: Minimum number of samples required to be at
            a leaf node.
        min_weight_fraction_leaf: Minimum weighted fraction of the sum
            total of weights required to be at a leaf node.
        max_features: Number of features to consider when looking for
            the best split.
        max_leaf_nodes: Grow trees with max_leaf_nodes in best-first
            fashion.
        min_impurity_decrease: A node will be split if this split
            induces a decrease of the impurity greater than or equal
            to this value.
        bootstrap: Whether bootstrap samples are used when building trees.
        oob_score: Whether to use out-of-bag samples to estimate the R^2
            on unseen data.
        n_jobs: Number of jobs to run in parallel.
        random_state: Controls the randomness of the estimator. When
            `None`, a seed is obtained from `get_seed()`.
        verbose: Controls the verbosity when fitting and predicting.
        warm_start: When set to True, reuse the solution of the previous
            call to fit.
        ccp_alpha: Complexity parameter used for Minimal Cost-Complexity
            Pruning.
        max_samples: If bootstrap is True, the number of samples to draw
            from X to train each base estimator.
        monotonic_cst: Monotonicity constraints.
        callbacks: Optional callbacks for progress feedback. Use `None`
            for silent learning.
    """
    self.regressor = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        # Use an explicit None check: `random_state or get_seed()` would
        # discard the valid, deterministic seed 0 because 0 is falsy.
        random_state=random_state if random_state is not None else get_seed(),
        verbose=verbose,
        warm_start=warm_start,
        ccp_alpha=ccp_alpha,
        max_samples=max_samples,
        monotonic_cst=monotonic_cst,
    )
    self.callback_manager = create_callback_manager(callbacks)

learn(inputs, outputs)

Fit the random forest regressor on the given inputs and outputs.

Source code in src/flowcean/sklearn/random_forest.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
@override
def learn(
    self,
    inputs: pl.LazyFrame,
    outputs: pl.LazyFrame,
) -> Model:
    """Fit the random forest regressor on the given inputs and outputs."""
    # Materialize both lazy frames in a single collection pass.
    input_df, output_df = pl.collect_all([inputs, outputs])

    # Announce the start of training to any registered callbacks.
    self.callback_manager.on_learning_start(
        self,
        {
            "n_estimators": self.regressor.n_estimators,
            "n_samples": len(input_df),
            "n_features": len(input_df.columns),
        },
    )

    try:
        # sklearn prefers a 1D target array for single-output regression.
        targets = output_df.to_numpy()
        if targets.shape[1] == 1:
            targets = targets.ravel()

        self.regressor.fit(input_df, targets)
        logger.info("Using Random Forest Regressor")

        # Wrap the fitted estimator so it can be used by the framework.
        model = SciKitModel(
            self.regressor,
            output_names=outputs.collect_schema().names(),
        )
        self.callback_manager.on_learning_end(self, model)
    except Exception as e:
        # Surface the failure to callbacks before propagating it.
        self.callback_manager.on_learning_error(self, e)
        raise
    else:
        return model