dbstream: fix adjacency matrix building (#1340)

* dbstream: fix adj matrix building
* Update unreleased.md
* Update docs/releases/unreleased.md

Co-authored-by: Max Halford <maxhalford25@gmail.com>

---------

Co-authored-by: Max Halford <maxhalford25@gmail.com>
online-ml · Oct 15, 2023 · 424cc38 · 424cc38
1 parent bfb4ea6
commit 424cc38
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 42 deletions.
diff --git a/docs/releases/unreleased.md b/docs/releases/unreleased.md
@@ -16,6 +16,7 @@ River's mini-batch methods now support pandas v2. In particular, River conforms
   - `cluster_is_up_to_date` is set to `True` at the end of the `self._recluster()` function.
   - Shared density graph update timestamps are initialized with the current timestamp value
   - `neighbour_neighbours` are appended correctly to the `seed_set` when generating cluster labels
+  - When building the weighted adjacency matrix, the algorithm now accounts for possibly orphaned entries in the shared density graph
 
 ## datasets
 

diff --git a/river/cluster/dbstream.py b/river/cluster/dbstream.py
@@ -287,18 +287,23 @@ def _generate_weighted_adjacency_matrix(self):
         weighted_adjacency_matrix = {}
         for i in list(self.s.keys()):
             for j in list(self.s[i].keys()):
-                if (
-                    self._micro_clusters[i].weight >= self.minimum_weight
-                    and self._micro_clusters[j].weight >= self.minimum_weight
-                ):
-                    value = self.s[i][j] / (
-                        (self._micro_clusters[i].weight + self._micro_clusters[j].weight) / 2
-                    )
-                    if value > self.intersection_factor:
-                        try:
-                            weighted_adjacency_matrix[i][j] = value
-                        except KeyError:
-                            weighted_adjacency_matrix[i] = {j: value}
+                try:
+                    if (
+                        self._micro_clusters[i].weight <= self.minimum_weight
+                        or self._micro_clusters[j].weight <= self.minimum_weight
+                    ):
+                        continue
+                except KeyError:
+                    continue
+
+                value = self.s[i][j] / (
+                    (self._micro_clusters[i].weight + self._micro_clusters[j].weight) / 2
+                )
+                if value > self.intersection_factor:
+                    try:
+                        weighted_adjacency_matrix[i][j] = value
+                    except KeyError:
+                        weighted_adjacency_matrix[i] = {j: value}
 
         return weighted_adjacency_matrix
 

diff --git a/river/cluster/test_dbstream.py b/river/cluster/test_dbstream.py
@@ -5,20 +5,30 @@
 from river.cluster import DBSTREAM
 
 
-@pytest.fixture
-def dbstream():
+def build_dbstream(fading_factor=0.001, intersection_factor=0.05):
     return DBSTREAM(
-        fading_factor=0.001, clustering_threshold=1, cleanup_interval=1, intersection_factor=0.05
+        fading_factor=fading_factor,
+        clustering_threshold=1,
+        cleanup_interval=1,
+        intersection_factor=intersection_factor,
     )
 
 
+def add_cluster(dbstream, initial_point, move_towards, times=1):
+    dbstream.learn_one(initial_point)
+    for _ in range(times):
+        dbstream.learn_one(move_towards)
+
+
 def assert_micro_cluster_properties(cluster, center, last_update=None):
     assert cluster.center == pytest.approx(center)
     if last_update is not None:
         assert cluster.last_update == last_update
 
 
-def test_cluster_formation_and_cleanup(dbstream: DBSTREAM):
+def test_cluster_formation_and_cleanup():
+    dbstream = build_dbstream()
+
     X = [
         {1: 1},
         {1: 3},
@@ -44,18 +54,12 @@ def test_cluster_formation_and_cleanup(dbstream: DBSTREAM):
     assert_micro_cluster_properties(dbstream.micro_clusters[7], center={1: 17}, last_update=12)
 
 
-def test_with_two_micro_clusters(dbstream: DBSTREAM):
-    # First micro-cluster
-    dbstream.learn_one({1: 1, 2: 1})
-    for _ in range(25):
-        dbstream.learn_one({1: 1.7, 2: 1.7})
-
-    # Second micro-cluster
-    dbstream.learn_one({1: 3, 2: 3})
-    for _ in range(25):
-        dbstream.learn_one({1: 2.3, 2: 2.3})
+def test_with_two_micro_clusters():
+    dbstream = build_dbstream()
 
-    # Points in the middle of two micro-clusters
+    add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
+    add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
+    # Points in the middle of first and second micro-clusters
     for _ in range(5):
         dbstream.learn_one({1: 2, 2: 2})
 
@@ -75,26 +79,16 @@ def test_with_two_micro_clusters(dbstream: DBSTREAM):
     assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.003033, 2: 2.003033})
 
 
-def test_density_graph_with_three_micro_clusters(dbstream: DBSTREAM):
-    # First micro-cluster
-    dbstream.learn_one({1: 1, 2: 1})
-    for _ in range(25):
-        dbstream.learn_one({1: 1.7, 2: 1.7})
-
-    # Second micro-cluster
-    dbstream.learn_one({1: 3, 2: 3})
-    for _ in range(25):
-        dbstream.learn_one({1: 2.3, 2: 2.3})
+def test_density_graph_with_three_micro_clusters():
+    dbstream = build_dbstream()
 
+    add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
+    add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
     # Points in the middle of first and second micro-clusters
     for _ in range(5):
         dbstream.learn_one({1: 2, 2: 2})
 
-    # Third micro-cluster
-    dbstream.learn_one({1: 4, 2: 4})
-    for _ in range(25):
-        dbstream.learn_one({1: 3.3, 2: 3.3})
-
+    add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25)
     # Points in the middle of second and third micro-clusters
     for _ in range(4):
         dbstream.learn_one({1: 3, 2: 3})
@@ -118,3 +112,34 @@ def test_density_graph_with_three_micro_clusters(dbstream: DBSTREAM):
     dbstream._recluster()
     assert len(dbstream.clusters) == 1
     assert_micro_cluster_properties(dbstream.clusters[0], center={1: 2.489894, 2: 2.489894})
+
+
+def test_density_graph_with_removed_microcluster():
+    dbstream = build_dbstream(fading_factor=0.1, intersection_factor=0.3)
+
+    add_cluster(dbstream, initial_point={1: 1, 2: 1}, move_towards={1: 1.7, 2: 1.7}, times=25)
+    add_cluster(dbstream, initial_point={1: 3, 2: 3}, move_towards={1: 2.3, 2: 2.3}, times=25)
+    # Points in the middle of first and second micro-clusters
+    for _ in range(5):
+        dbstream.learn_one({1: 2, 2: 2})
+
+    add_cluster(dbstream, initial_point={1: 4, 2: 4}, move_towards={1: 3.3, 2: 3.3}, times=25)
+    # Points in the middle of second and third micro-clusters
+    for _ in range(4):
+        dbstream.learn_one({1: 3, 2: 3})
+
+    assert len(dbstream._micro_clusters) == 2
+    assert_micro_cluster_properties(
+        dbstream.micro_clusters[1], center={1: 2.461654, 2: 2.461654}, last_update=86
+    )
+    assert_micro_cluster_properties(
+        dbstream.micro_clusters[2], center={1: 3.430485, 2: 3.430485}, last_update=86
+    )
+
+    assert dbstream.s[0] == pytest.approx({1: 3.615835})
+    assert dbstream.s[1] == pytest.approx({2: 2.803583})
+    assert dbstream.s_t == {0: {1: 56}, 1: {2: 86}}
+
+    dbstream._recluster()
+    assert len(dbstream.clusters) == 1
+    assert_micro_cluster_properties(dbstream.clusters[0], center={1: 3.152231, 2: 3.152231})