diff --git a/river/stream/iter_arff.py b/river/stream/iter_arff.py index 6c91aec40a..f9eec46ded 100644 --- a/river/stream/iter_arff.py +++ b/river/stream/iter_arff.py @@ -176,7 +176,7 @@ def iter_arff( x = { name: cast(val) if cast else val for name, cast, val in zip(names, casts, r.rstrip().split(",")) - if val != "?" and val != '' + if val != "?" and val != "" } # Handle target @@ -189,7 +189,7 @@ def iter_arff( y = x.pop(target) if target else None except KeyError: y = None - + yield x, y # Close the file if we opened it diff --git a/river/tree/hoeffding_adaptive_tree_classifier.py b/river/tree/hoeffding_adaptive_tree_classifier.py index 7693fdeff9..ce40900ffc 100644 --- a/river/tree/hoeffding_adaptive_tree_classifier.py +++ b/river/tree/hoeffding_adaptive_tree_classifier.py @@ -129,7 +129,7 @@ class HoeffdingAdaptiveTreeClassifier(HoeffdingTreeClassifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) - + Accuracy: 91.49% """ diff --git a/river/tree/last_classifier.py b/river/tree/last_classifier.py index 1366a66901..91b271adb1 100644 --- a/river/tree/last_classifier.py +++ b/river/tree/last_classifier.py @@ -4,10 +4,14 @@ from .hoeffding_tree_classifier import HoeffdingTreeClassifier from .nodes.branch import DTBranch -from .nodes.last_nodes import LeafMajorityClassWithDetector, LeafNaiveBayesWithDetector, LeafNaiveBayesAdaptiveWithDetector +from .nodes.last_nodes import ( + LeafMajorityClassWithDetector, + LeafNaiveBayesAdaptiveWithDetector, + LeafNaiveBayesWithDetector, +) from .nodes.leaf import HTLeaf from .split_criterion import GiniSplitCriterion, HellingerDistanceCriterion, InfoGainSplitCriterion -from .splitter import GaussianSplitter, Splitter +from .splitter import Splitter class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): @@ -28,9 +32,9 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): - 'nb' - Naive Bayes
- 'nba' - Naive Bayes Adaptive
change_detector - Change detector that will be created at each leaf of the tree. + Change detector that will be created at each leaf of the tree. track_error - If True, the change detector will have binary inputs for error predictions, + If True, the change detector will have binary inputs for error predictions, otherwise the input will be the split criteria. nb_threshold Number of instances a leaf should observe before allowing Naive Bayes. @@ -97,6 +101,7 @@ class LASTClassifier(HoeffdingTreeClassifier, base.Classifier): >>> metric = metrics.Accuracy() >>> evaluate.progressive_val_score(dataset, model, metric) + Accuracy: 92.50% """ @@ -105,8 +110,8 @@ def __init__( max_depth: int | None = None, split_criterion: str = "info_gain", leaf_prediction: str = "nba", - change_detector:base.DriftDetector| None = None, - track_error : bool = True, + change_detector: base.DriftDetector | None = None, + track_error: bool = True, nb_threshold: int = 0, nominal_attributes: list | None = None, splitter: Splitter | None = None, @@ -120,23 +125,23 @@ def __init__( merit_preprune: bool = True, ): super().__init__( - grace_period=None, + grace_period=1, #no usage max_depth=max_depth, split_criterion=split_criterion, - delta=None, - tau=None, + delta=1., #no usage + tau=1, #no usage leaf_prediction=leaf_prediction, - nb_threshold = nb_threshold, + nb_threshold=nb_threshold, binary_split=binary_split, max_size=max_size, memory_estimate_period=memory_estimate_period, stop_mem_management=stop_mem_management, remove_poor_attrs=remove_poor_attrs, merit_preprune=merit_preprune, - nominal_attributes = nominal_attributes, - splitter = splitter, - min_branch_fraction = min_branch_fraction, - max_share_to_split = max_share_to_split, + nominal_attributes=nominal_attributes, + splitter=splitter, + min_branch_fraction=min_branch_fraction, + max_share_to_split=max_share_to_split, ) self.change_detector = change_detector if change_detector is not None else drift.ADWIN() self.track_error = track_error @@ -148,7 +153,6 @@ def __init__( def _mutable_attributes(self): return {} - def _new_leaf(self, initial_stats=None, parent=None): if initial_stats is None: initial_stats = {} @@ -159,20 +163,43 @@ def _new_leaf(self, initial_stats=None, parent=None): if not self.track_error: if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + return LeafMajorityClassWithDetector( + initial_stats, depth, self.splitter, self.change_detector.clone() + ) elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + return LeafNaiveBayesWithDetector( + initial_stats, depth, self.splitter, self.change_detector.clone() + ) else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone()) + return LeafNaiveBayesAdaptiveWithDetector( + initial_stats, depth, self.splitter, self.change_detector.clone() + ) else: split_criterion = self._new_split_criterion() if self._leaf_prediction == self._MAJORITY_CLASS: - return LeafMajorityClassWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + return LeafMajorityClassWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion, + ) elif self._leaf_prediction == self._NAIVE_BAYES: - return LeafNaiveBayesWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) + return LeafNaiveBayesWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion, + ) else: # Naives Bayes Adaptive (default) - return LeafNaiveBayesAdaptiveWithDetector(initial_stats, depth, self.splitter, self.change_detector.clone(), split_criterion) - + return LeafNaiveBayesAdaptiveWithDetector( + initial_stats, + depth, + self.splitter, + self.change_detector.clone(), + split_criterion, + ) def _new_split_criterion(self): if self._split_criterion == self._GINI_SPLIT: @@ -181,8 +208,10 @@ def _new_split_criterion(self): split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) elif self._split_criterion == self._HELLINGER_SPLIT: if not self.track_error: - raise ValueError("The Heillinger distance cannot estimate the purity of a single distribution.\ - Use another split criterion or set track_error to True") + raise ValueError( + "The Heillinger distance cannot estimate the purity of a single distribution.\ + Use another split criterion or set track_error to True" + ) split_criterion = HellingerDistanceCriterion(self.min_branch_fraction) else: split_criterion = InfoGainSplitCriterion(self.min_branch_fraction) @@ -315,7 +344,7 @@ def learn_one(self, x, y, *, w=1.0): self._n_inactive_leaves += 1 else: weight_seen = node.total_weight - #check if the change detector triggered a change + # check if the change detector triggered a change if node.change_detector.drift_detected: p_branch = p_node.branch_no(x) if isinstance(p_node, DTBranch) else None self._attempt_to_split(node, p_node, p_branch) @@ -345,4 +374,3 @@ def learn_one(self, x, y, *, w=1.0): if self._train_weight_seen_by_model % self.memory_estimate_period == 0: self._estimate_model_size() - diff --git a/river/tree/nodes/last_nodes.py b/river/tree/nodes/last_nodes.py index 7c178c86c8..5ee89fb431 100644 --- a/river/tree/nodes/last_nodes.py +++ b/river/tree/nodes/last_nodes.py @@ -1,10 +1,6 @@ from __future__ import annotations -from river.tree.utils import BranchFactory -from river.utils.norm import normalize_values_in_dict - -from ..splitter.nominal_splitter_classif import NominalSplitterClassif -from ..utils import do_naive_bayes_prediction, round_sig_fig +from ..utils import do_naive_bayes_prediction from .htc_nodes import LeafMajorityClass @@ -24,23 +20,26 @@ class LeafMajorityClassWithDetector(LeafMajorityClass): Other parameters passed to the learning node. """ - def __init__(self, stats, depth, splitter,change_detector, split_criterion = None, **kwargs): + def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): super().__init__(stats, depth, splitter, **kwargs) self.change_detector = change_detector - self.split_criterion = split_criterion #if None, the change detector will have binary inputs - + self.split_criterion = ( + split_criterion # if None, the change detector will have binary inputs + ) + def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) if self.is_active(): if self.split_criterion is None: mc_pred = self.prediction(x) - detector_input = (max(mc_pred, key=mc_pred.get) != y) + detector_input = max(mc_pred, key=mc_pred.get) != y self.change_detector.update(detector_input) else: - detector_input = self.split_criterion.purity(self.stats) + detector_input = self.split_criterion.current_merit(self.stats) self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) + class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): """Leaf that uses Naive Bayes models. @@ -57,18 +56,18 @@ class LeafNaiveBayesWithDetector(LeafMajorityClassWithDetector): Other parameters passed to the learning node. """ - def __init__(self, stats, depth, splitter,change_detector, split_criterion = None, **kwargs): - super().__init__(stats, depth, splitter,change_detector,split_criterion,**kwargs) - + def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): + super().__init__(stats, depth, splitter, change_detector, split_criterion, **kwargs) + def learn_one(self, x, y, *, w=1, tree=None): self.update_stats(y, w) if self.is_active(): if self.split_criterion is None: nb_pred = self.prediction(x) - detector_input = (max(nb_pred, key=nb_pred.get) == y) + detector_input = max(nb_pred, key=nb_pred.get) == y self.change_detector.update(detector_input) else: - detector_input = self.split_criterion.purity(self.stats) + detector_input = self.split_criterion.current_merit(self.stats) self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) @@ -108,8 +107,8 @@ class LeafNaiveBayesAdaptiveWithDetector(LeafMajorityClassWithDetector): Other parameters passed to the learning node. """ - def __init__(self, stats, depth, splitter, change_detector,split_criterion = None, **kwargs): - super().__init__(stats, depth, splitter, change_detector, split_criterion,**kwargs) + def __init__(self, stats, depth, splitter, change_detector, split_criterion=None, **kwargs): + super().__init__(stats, depth, splitter, change_detector, split_criterion, **kwargs) self._mc_correct_weight = 0.0 self._nb_correct_weight = 0.0 @@ -150,13 +149,10 @@ def learn_one(self, x, y, *, w=1.0, tree=None): else: self.change_detector.update(detector_input_mc) else: - detector_input = self.split_criterion.purity(self.stats) + detector_input = self.split_criterion.current_merit(self.stats) self.change_detector.update(detector_input) self.update_splitters(x, y, w, tree.nominal_attributes) - - - def prediction(self, x, *, tree=None): """Get the probabilities per class for a given instance. @@ -188,4 +184,4 @@ def disable_attribute(self, att_index): att_index Attribute index. """ - pass \ No newline at end of file + pass diff --git a/river/tree/split_criterion/base.py b/river/tree/split_criterion/base.py index ae09efd397..84da388b4b 100644 --- a/river/tree/split_criterion/base.py +++ b/river/tree/split_criterion/base.py @@ -31,11 +31,10 @@ def merit_of_split(self, pre_split_dist, post_split_dist): ------- Value of the merit of splitting """ - + @abc.abstractmethod - def purity(self, dist): - """Compute how pure (how close the distribution is to have only a single class) - the distribution is. + def current_merit(self, dist): + """Compute the merit of the distribution. Parameters ---------- @@ -44,7 +43,7 @@ def purity(self, dist): Returns ------- - Value of purity of the distribution according to the splitting merit + Value of merit of the distribution according to the splitting criterion """ @staticmethod diff --git a/river/tree/split_criterion/gini_split_criterion.py b/river/tree/split_criterion/gini_split_criterion.py index abf49ca783..1a71c1b651 100644 --- a/river/tree/split_criterion/gini_split_criterion.py +++ b/river/tree/split_criterion/gini_split_criterion.py @@ -27,9 +27,8 @@ def merit_of_split(self, pre_split_dist, post_split_dist): post_split_dist[i], dist_weights[i] ) return 1.0 - gini - - - def purity(self, dist): + + def current_merit(self, dist): return self.compute_gini(dist, sum(dist.values())) @staticmethod diff --git a/river/tree/split_criterion/hellinger_distance_criterion.py b/river/tree/split_criterion/hellinger_distance_criterion.py index 2f7662f34f..236564f9bd 100644 --- a/river/tree/split_criterion/hellinger_distance_criterion.py +++ b/river/tree/split_criterion/hellinger_distance_criterion.py @@ -28,7 +28,7 @@ def merit_of_split(self, pre_split_dist, post_split_dist): return -math.inf return self.compute_hellinger(post_split_dist) - def purity(self, dist): + def current_merit(self, dist): raise ValueError("The Heillinger distance is for 2 or more sets of data.") @staticmethod diff --git a/river/tree/split_criterion/info_gain_split_criterion.py b/river/tree/split_criterion/info_gain_split_criterion.py index f8ced1b03d..b112ff829d 100644 --- a/river/tree/split_criterion/info_gain_split_criterion.py +++ b/river/tree/split_criterion/info_gain_split_criterion.py @@ -38,8 +38,8 @@ def compute_entropy(self, dist): return self._compute_entropy_dict(dist) elif isinstance(dist, list): return self._compute_entropy_list(dist) - - def purity(self, dist): + + def current_merit(self, dist): return self.compute_entropy(dist) @staticmethod diff --git a/river/tree/split_criterion/variance_ratio_split_criterion.py b/river/tree/split_criterion/variance_ratio_split_criterion.py index 3f921da140..dfdff8ea55 100644 --- a/river/tree/split_criterion/variance_ratio_split_criterion.py +++ b/river/tree/split_criterion/variance_ratio_split_criterion.py @@ -34,9 +34,9 @@ def merit_of_split(self, pre_split_dist, post_split_dist): vr -= (n_i / n) * (self.compute_var(post_split_dist[i]) / var) return vr - def purity(self, dist): + def current_merit(self, dist): return self.compute_var(dist) - + @staticmethod def compute_var(dist): return dist.get()