From 11bd20355061b1c947b6c716c4685a4b66115350 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:11:09 +0100 Subject: [PATCH 1/2] Initial commit: preliminary tests, some helper functions --- scripts/builtin/hdbscan.dml | 132 ++++++++++++++++++ scripts/builtin/hdbscanApply.dml | 0 .../functions/builtin/BuiltinHDBSCANTest.java | 0 src/test/scripts/functions/builtin/hdbscan.R | 0 .../scripts/functions/builtin/hdbscan.dml | 20 +++ .../functions/builtin/hdbscanApply.dml | 0 test_build_mst.dml | 46 ++++++ test_kth_smallest.dml | 14 ++ test_mutual_reachability.dml | 26 ++++ 9 files changed, 238 insertions(+) create mode 100644 scripts/builtin/hdbscan.dml create mode 100644 scripts/builtin/hdbscanApply.dml create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java create mode 100644 src/test/scripts/functions/builtin/hdbscan.R create mode 100644 src/test/scripts/functions/builtin/hdbscan.dml create mode 100644 src/test/scripts/functions/builtin/hdbscanApply.dml create mode 100644 test_build_mst.dml create mode 100644 test_kth_smallest.dml create mode 100644 test_mutual_reachability.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml new file mode 100644 index 00000000000..ca48c7fdfc6 --- /dev/null +++ b/scripts/builtin/hdbscan.dml @@ -0,0 +1,132 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# The hdbscan function is used to perform the hdbscan clustering +# algorithm using knn-based mutual reachability distance and minimum spanning tree. +# +# INPUT: +# ------------------------------------------------------------ +# X The input Matrix to do hdbscan on. +# minPts Minimum number of points for core distance computation. (Defaults to 5) +# minClSize Minimum cluster size (Defaults to minPts) +# ------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------ +# clusterMems Cluster labels for each point +# clusterModel The cluster centroids for prediction +# ------------------------------------------------------------ + +# TODO: m,s , f? +m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -1) + return (Matrix[Double] clusterMems, Matrix[Double] clusterModel) +{ + if(minPts < 2) { + stop("HDBSCAN: minPts should be at least 2") + } + + if(minClSize < 0) { + minClSize = minPts + } + + n = nrow(X) + d = ncol(X) + + if(n < minPts) { + stop("HDBSCAN: Number of data points should be at least minPts") + } + + distances = dist(X) + + coreDistances = matrix(0, rows=n, cols=1) + for(i in 1:n) { + kthDist = computeKthSmallest(distances[i,], minPts) + coreDistances[i] = kthDist + } + + mutualReachDist = computeMutualReachability(distances, coreDistances) + + [mstEdges, mstWeights] = buildMST(mutualReachDist, n) + + # TODO: build cluster hierarchy + # TODO: get stable cluster with stability score + # TODO: build cluster model + + # temp dummy values + clusterMems = matrix(1, rows=n, cols=1) + clusterModel = X +} + + +computeKthSmallest = function(Matrix[Double] array, Integer k) + return (Double res) +{ + sorted = order(target=array, by=1, decreasing=FALSE) + res = as.scalar(sorted[k+1, 1]) +} + + +computeMutualReachability = function(Matrix[Double] distances, Matrix[Double] coreDistances) + return (Matrix[Double] mutualReach) +{ + # mutualReach(i,j) = max(dist(i,j), coreDist(i), coreDist(j)) + # Diagonal is set to zero. + + n = nrow(distances) + + coreDistRow = t(coreDistances) + coreDistCol = coreDistances + + maxCoreRow = (distances > coreDistRow) * distances + (distances <= coreDistRow) * coreDistRow + mutualReach = (maxCoreRow > coreDistCol) * maxCoreRow + (maxCoreRow <= coreDistCol) * coreDistCol + + mutualReach = mutualReach * (1 - diag(matrix(1, rows=n, cols=1))) +} + + +buildMST = function(Matrix[Double] distances, Integer n) + return (Matrix[Double] edges, Matrix[Double] weights) +{ + edges = matrix(0, rows=n-1, cols=2) + weights = matrix(0, rows=n-1, cols=1) + + inMST = matrix(0, rows=n, cols=1) + inMST[1] = 1 + + minDist = distances[1,] + minDist = t(minDist) + + for(i in 1:(n-1)) { + candidates = minDist + inMST * 1e15 + minIdx = as.scalar(rowIndexMin(t(candidates))) + minWeight = as.scalar(minDist[minIdx]) + + # Find which node in MST connects to minIdx + connectIdx = as.scalar(rowIndexMin(distances[minIdx,] + t(1-inMST) * 1e15)) + edges[i,1] = minIdx + edges[i,2] = connectIdx + weights[i] = minWeight + + inMST[minIdx] = 1 + newDists = distances[minIdx,] + minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists) + } +} \ No newline at end of file diff --git a/scripts/builtin/hdbscanApply.dml b/scripts/builtin/hdbscanApply.dml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/scripts/functions/builtin/hdbscan.R b/src/test/scripts/functions/builtin/hdbscan.R new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/scripts/functions/builtin/hdbscan.dml b/src/test/scripts/functions/builtin/hdbscan.dml new file mode 100644 index 00000000000..cc59154f3c3 --- /dev/null +++ b/src/test/scripts/functions/builtin/hdbscan.dml @@ -0,0 +1,20 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- diff --git a/src/test/scripts/functions/builtin/hdbscanApply.dml b/src/test/scripts/functions/builtin/hdbscanApply.dml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test_build_mst.dml b/test_build_mst.dml new file mode 100644 index 00000000000..a8dd7a44ff9 --- /dev/null +++ b/test_build_mst.dml @@ -0,0 +1,46 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 4 +# / | \ +# / | \ +# / (2) (5) +# | | \ +# | | \ +# (2) 1-(3)--3 +# | | / +# \ | (4) +# \ (1) / +# \ | / +# \|/ +# 2 + +distances = matrix(0, rows=4, cols=4) +distances[1,2] = 1 +distances[2,1] = 1 + +distances[1,3] = 3 +distances[3,1] = 3 + +distances[1,4] = 2 +distances[4,1] = 2 + +distances[2,3] = 4 +distances[3,2] = 4 + +distances[2,4] = 2 +distances[4,2] = 2 + +distances[3,4] = 5 +distances[4,3] = 5 + +[edges, weights] = hdb::buildMST(distances, 4) + +totalWeight = sum(weights) + +test_pass = (nrow(edges) == 3) & (totalWeight == 6) + +if(test_pass) { + print("Passed") +} else { + print("Failes") +} \ No newline at end of file diff --git a/test_kth_smallest.dml b/test_kth_smallest.dml new file mode 100644 index 00000000000..b5065230f26 --- /dev/null +++ b/test_kth_smallest.dml @@ -0,0 +1,14 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +array = matrix("0 1 4 2", rows=4, cols=1) + +res1 = hdb::computeKthSmallest(array, 1) +res2 = hdb::computeKthSmallest(array, 2) + +test_pass = (res1 == 1) & (res2 == 2) + +if(test_pass) { + print("Passed") +} else { + print("Failed") +} \ No newline at end of file diff --git a/test_mutual_reachability.dml b/test_mutual_reachability.dml new file mode 100644 index 00000000000..2e86ed28311 --- /dev/null +++ b/test_mutual_reachability.dml @@ -0,0 +1,26 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +distances = matrix("0 1 5 1 0 4 5 4 0", rows=3, cols=3) +coreDistances = matrix("1 1 4", rows=3, cols=1) + +mutualReach = hdb::computeMutualReachability(distances, coreDistances) + +diagSum = sum(diag(mutualReach)) + +val_AB = as.scalar(mutualReach[1,2]) +val_AC = as.scalar(mutualReach[1,3]) +val_BC = as.scalar(mutualReach[2,3]) + +sym_AB = as.scalar(mutualReach[2,1]) +sym_AC = as.scalar(mutualReach[3,1]) +sym_BC = as.scalar(mutualReach[3,2]) + +test1_pass = (val_AB == 1) & (val_AC == 5) & (val_BC == 4) & + (diagSum == 0) & + (val_AB == sym_AB) & (val_AC == sym_AC) & (val_BC == sym_BC) + +if(test1_pass) { + print("Passed") +} else { + print("Failed") +} From 39b3a4a45aa3db4e4d9ecd329ce7d227c791266a Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Mon, 29 Dec 2025 17:34:07 +0100 Subject: [PATCH 2/2] Hierarchy builder with union-find --- scripts/builtin/hdbscan.dml | 88 ++++++++++++++++++++++++++++- test_build_hierarchy.dml | 77 +++++++++++++++++++++++++ test_build_mst.dml | 2 +- test_union_find.dml | 108 ++++++++++++++++++++++++++++++++++++ 4 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 test_build_hierarchy.dml create mode 100644 test_union_find.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml index ca48c7fdfc6..3cb00e021ce 100644 --- a/scripts/builtin/hdbscan.dml +++ b/scripts/builtin/hdbscan.dml @@ -66,7 +66,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = - [mstEdges, mstWeights] = buildMST(mutualReachDist, n) - # TODO: build cluster hierarchy + [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n) + # TODO: get stable cluster with stability score # TODO: build cluster model @@ -129,4 +130,89 @@ buildMST = function(Matrix[Double] distances, Integer n) newDists = distances[minIdx,] minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists) } +} + +# Union-find utils + +find = function(Matrix[Double] parent, Integer x) + return (Integer root) +{ + root = x + while(as.scalar(parent[root]) != root) { + root = as.integer(as.scalar(parent[root])) + } +} + +union = function(Matrix[Double] parent, Matrix[Double] rank, + Integer x, Integer y, Matrix[Double] size) + return (Matrix[Double] newParent, Matrix[Double] newRank, Matrix[Double] newSize) +{ + newParent = parent + newRank = rank + newSize = size + + rankX = as.scalar(rank[x]) + rankY = as.scalar(rank[y]) + + # Calculate combined size + combinedSize = as.scalar(size[x]) + as.scalar(size[y]) + + if(rankX < rankY) { + newParent[x] = y + newSize[y] = combinedSize # Update size at new root + } + else if(rankX > rankY) { + newParent[y] = x + newSize[x] = combinedSize # Update size at new root + } + else { + newParent[y] = x + newRank[x] = rankX + 1 + newSize[x] = combinedSize # Update size at new root + } +} + +buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n) + return (Matrix[Double] hierarchy, Matrix[Double] sizes) +{ + # sort edges by weight in ascending order + # to build the hierarchy from dense cores outward + sorted = order(target=weights, by=1, decreasing=FALSE) + + # parent[i] = i, meaning each point is its own parent in the beginning + parent = seq(1, n) + # tree depth when unioning (icnreases only when merged trees are of same height) + rank = matrix(0, rows=n, cols=1) + # each cluster is 1 in the beginning + clusterSize = matrix(1, rows=2*n-1, cols=1) # Space for all possible cluster IDs + + # hierarcy[i, :] = [cluster1_id, cluster2_id, merge_distance] + hierarchy = matrix(0, rows=n-1, cols=3) + + # sizes[i] = size of the cluster created by merge i + sizes = matrix(0, rows=n-1, cols=1) + + nextCluster = n + 1 + + for(i in 1:(n-1)) { + idx = as.scalar(sorted[i]) + u = as.scalar(edges[idx, 1]) + v = as.scalar(edges[idx, 2]) + w = as.scalar(weights[idx]) + + root_u = find(parent, u) + root_v = find(parent, v) + + if(root_u != root_v) { + hierarchy[i,1] = root_u + hierarchy[i,2] = root_v + hierarchy[i,3] = w + sizes[i] = as.scalar(clusterSize[root_u]) + as.scalar(clusterSize[root_v]) + + [parent, rank, clusterSize] = union(parent, rank, root_u, root_v, clusterSize) + + nextCluster = nextCluster + 1 + } + } + } \ No newline at end of file diff --git a/test_build_hierarchy.dml b/test_build_hierarchy.dml new file mode 100644 index 00000000000..b59aa288619 --- /dev/null +++ b/test_build_hierarchy.dml @@ -0,0 +1,77 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 4 +# / | \ +# / | \ +# / (2) (5) +# | | \ +# | | \ +# (2) 1-(3)--3 +# | | / +# \ | (4) +# \ (1) / +# \ | / +# \|/ +# 2 + +distances = matrix(0, rows=4, cols=4) +distances[1,2] = 1 +distances[2,1] = 1 + +distances[1,3] = 3 +distances[3,1] = 3 + +distances[1,4] = 2 +distances[4,1] = 2 + +distances[2,3] = 4 +distances[3,2] = 4 + +distances[2,4] = 2 +distances[4,2] = 2 + +distances[3,4] = 5 +distances[4,3] = 5 + +[edges, weights] = hdb::buildMST(distances, 4) + +[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, 4) + +print("Hierarchy (format: [cluster1, cluster2, merge_distance]):") +print(toString(hierarchy)) + +# Should have n-1 merge operations for n nodes +num_merges = nrow(hierarchy) +print("Number of merges: " + num_merges + " (should be 3)") +test1 = (num_merges == 3) + +# Merge distances should be in ascending order (or equal) +# Because we process edges from low weight to high weight +dist1 = as.scalar(hierarchy[1,3]) +dist2 = as.scalar(hierarchy[2,3]) +dist3 = as.scalar(hierarchy[3,3]) +print("\nMerge distances: [" + dist1 + ", " + dist2 + ", " + dist3 + "]" + " (Should be in ascending order)") +test2 = (dist1 <= dist2) & (dist2 <= dist3) + +# Cluster sizes should increase +size1 = as.scalar(sizes[1]) +size2 = as.scalar(sizes[2]) +size3 = as.scalar(sizes[3]) +print("\nCluster sizes: [" + size1 + ", " + size2 + ", " + size3 + "]" + " (Should be increasing)") +test3 = (size1 <= size2) & (size2 <= size3) + +# Final size should equal total number of nodes +print("Final cluster size: " + size3 + " (should be 4)") +test4 = (size3 == 4) + +# First merge should be size 2 +print("First merge size: " + size1 + " (should be 2)") +test5 = (size1 == 2) + +test_pass = test1 & test2 & test3 & test4 & test5 + +if(test_pass) { + print("Passed") +} else { + print("Failed") +} \ No newline at end of file diff --git a/test_build_mst.dml b/test_build_mst.dml index a8dd7a44ff9..e2ed10ab040 100644 --- a/test_build_mst.dml +++ b/test_build_mst.dml @@ -42,5 +42,5 @@ test_pass = (nrow(edges) == 3) & (totalWeight == 6) if(test_pass) { print("Passed") } else { - print("Failes") + print("Failed") } \ No newline at end of file diff --git a/test_union_find.dml b/test_union_find.dml new file mode 100644 index 00000000000..98eb14c7141 --- /dev/null +++ b/test_union_find.dml @@ -0,0 +1,108 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 1 -> 1 +# 2 -> 1 +# 3 -> 2 -> 1 +# 4 -> 4 + +n = 4 +parent = matrix(0, rows=n, cols=1) +parent[1] = 1 +parent[2] = 1 +parent[3] = 2 +parent[4] = 4 + +root1 = hdb::find(parent, 1) +root2 = hdb::find(parent, 2) +root3 = hdb::find(parent, 3) +root4 = hdb::find(parent, 4) + +test_find = (root1 == 1) & (root2 == 1) & (root3 == 1) & (root4 == 4) + +if(test_find) { + print("Passed") +} else { + print("Failed") +} + +# ensure union() attaches lower-rank root under higher-rank root +# rank[1] < rank[2] => parent[1] becomes 2 + +parentA = matrix(0, rows=n, cols=1) +parentA[1] = 1 +parentA[2] = 2 +parentA[3] = 3 +parentA[4] = 4 + +rankA = matrix(0, rows=n, cols=1) +rankA[1] = 0 +rankA[2] = 1 +rankA[3] = 0 +rankA[4] = 0 + +sizeA = matrix(1, rows=n, cols=1) + +[newParentA, newRankA, newSizeA] = hdb::union(parentA, rankA, 1, 2, sizeA) +test_unionA = (as.scalar(newParentA[1]) == 2) & (as.scalar(newParentA[2]) == 2) + +if(test_unionA) { + print("Passed") +} else { + print("Failed") +} + +# ensure union() attaches lower-rank root under higher-rank root +# rank[1] > rank[2] => parent[2] becomes 1 + +parentB = matrix(0, rows=n, cols=1) +parentB[1] = 1 +parentB[2] = 2 +parentB[3] = 3 +parentB[4] = 4 + +rankB = matrix(0, rows=n, cols=1) +rankB[1] = 2 +rankB[2] = 1 +rankB[3] = 0 +rankB[4] = 0 + +sizeB = matrix(1, rows=n, cols=1) + +[newParentB, newRankB, newSizeB] = hdb::union(parentB, rankB, 1, 2, sizeB) +test_unionB = (as.scalar(newParentB[2]) == 1) & (as.scalar(newParentB[1]) == 1) + +if(test_unionB) { + print("Passed") +} else { + print("Failed") +} + +# ensure union() increments if height is same + +parentC = matrix(0, rows=n, cols=1) +parentC[1] = 1 +parentC[2] = 2 +parentC[3] = 3 +parentC[4] = 4 + +rankC = matrix(0, rows=n, cols=1) +rankC[1] = 0 +rankC[2] = 0 +rankC[3] = 0 +rankC[4] = 0 + +sizeC = matrix(1, rows=n, cols=1) + +[newParentC, newRankC, newSizeC] = hdb::union(parentC, rankC, 1, 2, sizeC) + +test_unionC_parent = (as.scalar(newParentC[2]) == 1) +test_unionC_rank = (as.scalar(newRankC[1]) == 1) + +test_unionC = test_unionC_parent & test_unionC_rank + +if(test_unionC) { + print("Passed") +} else { + print("Failed") + print(rankC[1]) +} \ No newline at end of file