diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 61c7d09..5098d23 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -6,11 +6,11 @@ "dockerfile": "Dockerfile", "args": { "BASE_IMAGE": "temurin-21-tools-deps-jammy", - "USERNAME": "${localEnv:USER}" + "USERNAME": "${localEnv:USER:vscode}" } }, - "remoteUser": "${localEnv:USER}", - "containerUser": "${localEnv:USER}", + "remoteUser": "${localEnv:USER:vscode}", + "containerUser": "${localEnv:USER:vscode}", "features": { "ghcr.io/devcontainers/features/git:1": {} }, diff --git a/src/scicloj/ml/xgboost/csr.clj b/src/scicloj/ml/xgboost/csr.clj index 7a484ea..bbd8561 100644 --- a/src/scicloj/ml/xgboost/csr.clj +++ b/src/scicloj/ml/xgboost/csr.clj @@ -1,3 +1,5 @@ +;; re-implements https://blog.newtum.com/sparse-matrix-in-java/ +;; see also (not sure it is the same approach): https://github.com/scipy/scipy/blob/v1.14.1/scipy/sparse/_csr.py (ns scicloj.ml.xgboost.csr (:require [ tech.v3.datatype :as dt] @@ -17,15 +19,18 @@ :row-pointers new-row-pointers}))) (defn ->csr [r-c-vs] + ;; the [row col value] triplets are sorted by row, then by column, before being added + ;; not sure if sorting here is a good idea for performance 
+ (-> (reduce - (fn [csr [row col value]] (add-to-csr csr row col value)) {:values (dt/make-list :float) :column-indices (dt/make-list :int) - :row-pointers (dt/make-list :long [0])} - r-c-vs) + :row-pointers (dt/make-list :long [0])} + (sort-by (juxt first second) + r-c-vs)) (#(assoc % :row-pointers (conj (:row-pointers %) (count (:values %))))))) diff --git a/test/scicloj/ml/xgboost/csr_test.clj b/test/scicloj/ml/xgboost/csr_test.clj index 1baae86..11f1968 100644 --- a/test/scicloj/ml/xgboost/csr_test.clj +++ b/test/scicloj/ml/xgboost/csr_test.clj @@ -4,14 +4,91 @@ [tech.v3.tensor :as t])) +;; scipy +;;1 0 2 0 +;;4 0 0 3 +;;3 1 2 0 +;;csr=csr_matrix (np.array ([[1,0,2,0],[4,0,0,3],[3,1,2,0]])) + +;;>>> csr.data , same as dmatrix.data +;;array ([1, 2, 4, 3, 3, 1, 2]) + +;; >>> csr.indices, same as dmatrix.colIndex +;;array ([0, 2, 0, 3, 0, 1, 2], dtype=int32) + +;;>>> csr.indptr, same as dmatrix.rowHeaders +;;array ([0, 2, 4, 7], dtype=int32) + + (deftest ->csr +;; 3. 1. 2. +;;>>> csr=coo_array (([5,8,3,6],([0,1,2,3],[0,1,2,1])),shape= (4,4)) .tocsr () +;;>>> csr.data +;;array ([5, 8, 3, 6]) +;;>>> csr.indices +;;array ([0, 1, 2, 1]) +;;>>> csr.indptr +;;array ([0, 1, 2, 3, 4]) + (is (= {:values [5.0 8.0 3.0 6.0], :column-indices [0 1 2 1], :row-pointers [0 1 2 3 4]} (csr/->csr + ;; row,col,value [[0 0 5] [1 1 8] [2 2 3] - [4 1 6]])))) + [3 1 6]])))) + + +(deftest ->csr-2 +;; matches wikipedia https://en.wikipedia.org/wiki/Sparse_matrix +;;in python +;; coo_array (([10,20,30,40,50,60,70,80],([0,0,1,1,2,2,2,3],[0,1,1,3,2,3,4,5])),shape=(4,6)).todense() +;; array ([[10, 20, 0, 0, 0, 0], +;; [0, 30, 0, 40, 0, 0], +;; [0, 0, 50, 60, 70, 0], +;; [0, 0, 0, 0, 0, 80]]) +;; +;; 3. 1. 2. 
+;; >>> csr=coo_array (([10,20,30,40,50,60,70,80],([0,0,1,1,2,2,2,3],[0,1,1,3,2,3,4,5])),shape= (4,6)) .tocsr () +;; >>> csr.data +;; array ([10, 20, 30, 40, 50, 60, 70, 80]) +;; >>> csr.indices +;; array ([0, 1, 1, 3, 2, 3, 4, 5]) +;; >>> csr.indptr +;; array ([0, 2, 4, 7, 8]) + (is (= {:values [10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0] + :column-indices [0 1 1 3 2 3 4 5] + :row-pointers [0 2 4 7 8]} + (csr/->csr + [ + [0 0 10.0] + [0 1 20.0] + [1 1 30.0] + [1 3 40.0] + [2 2 50.0] + [2 3 60.0] + [2 4 70.0] + [3 5 80.0]])))) +;;=> + + +(deftest unsorted + (is (= {:values [10.0 20.0 30.0 40.0 50.0 60.0 70.0 80.0] + :column-indices [0 1 1 3 2 3 4 5] + :row-pointers [0 2 4 7 8]} + (csr/->csr + (shuffle + [[0 0 10.0] + [0 1 20.0] + [1 1 30.0] + [1 3 40.0] + [2 2 50.0] + [2 3 60.0] + [2 4 70.0] + [3 5 80.0]])) + ))) + (deftest ->dense (is (= @@ -51,6 +128,32 @@ 4 4))))) +(t/->tensor + (csr/->dense + (csr/->csr + [[0 0 5] + [1 1 8] + [2 2 3] + [4 1 6]]) + 4 4)) +;;=> #tech.v3.tensor[4 4] +;; [[5.000 0 0 0] +;; [ 0 8.000 0 0] +;; [ 0 0 3.000 0] +;; [ 0 6.000 0 0]] + -(comment - ) \ No newline at end of file +(t/->tensor + (csr/->dense + (csr/->csr + [[1 1 8] + [2 2 3] + [4 1 6] + [0 0 5]] + ) + 4 4)) +;;=> #tech.v3.tensor[4 4] +;; [[ 0 0 0 0] +;; [ 0 8.000 0 0] +;; [ 0 0 3.000 0] +;; [5.000 6.000 0 0]]