Skip to content

Commit

Permalink
[ML-177][Naive Bayes] Fix error when converting Vector to CSRNumeric…
Browse files Browse the repository at this point in the history
…Table (#176)

* 1. fix naiveBayes bug
2. add unit test for converting vector to CSRNumericTable

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* Update OneDAL.scala

* add unit test to test.sh

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* fix comments

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* modify code indent

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* Update mllib-dal/src/main/scala/com/intel/oap/mllib/OneDAL.scala

* Update mllib-dal/src/main/scala/com/intel/oap/mllib/OneDAL.scala

Co-authored-by: Xiaochang Wu <xiaochang.wu@intel.com>
  • Loading branch information
minmingzhu and xwu99 authored Feb 23, 2022
1 parent 0810769 commit 3a68664
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 12 deletions.
24 changes: 13 additions & 11 deletions mllib-dal/src/main/scala/com/intel/oap/mllib/OneDAL.scala
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ object OneDAL {
matrixLabel
}

private def vectorsToSparseNumericTable(vectors: Array[Vector],
nFeatures: Long): CSRNumericTable = {
def vectorsToSparseNumericTable(vectors: Array[Vector],
nFeatures: Long): CSRNumericTable = {
require(vectors(0).isInstanceOf[SparseVector], "vectors should be sparse")

println(s"Features row x column: ${vectors.length} x ${vectors(0).size}")
Expand All @@ -250,10 +250,10 @@ object OneDAL {
val columnIndices = Array.fill(ratingsNum) {
0L
}
// First row index is 1
val rowOffsets = ArrayBuffer[Long](1L)

var indexValues = 0
var curRow = 0L

// Converted to one CSRNumericTable
for (row <- 0 until vectors.length) {
Expand All @@ -263,20 +263,22 @@ object OneDAL {
// one-based indexValues
columnIndices(indexValues) = column + 1

if (row > curRow) {
curRow = row
// one-based indexValues
rowOffsets += indexValues + 1
}

indexValues = indexValues + 1
}
// one-based row indexValues
rowOffsets += indexValues + 1
}
// one-based row indexValues
rowOffsets += indexValues + 1

val contextLocal = new DaalContext()

// check CSR encoding
assert(values.length == ratingsNum,
"the length of values should be equal to the number of non-zero elements")
assert(columnIndices.length == ratingsNum,
"the length of columnIndices should be equal to the number of non-zero elements")
assert(rowOffsets.size == (csrRowNum + 1),
"the size of rowOffsets should be equal to the number of rows + 1")

val cTable = OneDAL.cNewCSRNumericTableDouble(values, columnIndices, rowOffsets.toArray,
nFeatures, csrRowNum)
val table = new CSRNumericTable(contextLocal, cTable)
Expand Down
40 changes: 40 additions & 0 deletions mllib-dal/src/test/scala/org/apache/spark/ml/oneDALSuite.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.apache.spark.ml

import com.intel.oap.mllib.OneDAL
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
import org.apache.spark.sql.Row

/**
 * Round-trip test for `OneDAL.vectorsToSparseNumericTable`: sparse vectors are
 * converted to a CSR numeric table, converted back to a matrix, and compared
 * element-by-element with the matrix built directly from the input vectors.
 */
class oneDALSuite extends FunctionsSuite with Logging {

  import testImplicits._

  test("test sparse vector to CSRNumericTable") {
    // Rows 2 and 3 deliberately contain no non-zero entries, and the last two
    // rows are only partially filled, so the conversion is exercised on rows
    // with zero, some, and all columns populated.
    val data = Seq(
      Vectors.sparse(3, Seq((0, 1.0), (1, 2.0), (2, 3.0))),
      Vectors.sparse(3, Seq((0, 10.0), (1, 20.0), (2, 30.0))),
      Vectors.sparse(3, Seq.empty),
      Vectors.sparse(3, Seq.empty),
      Vectors.sparse(3, Seq((0, 1.0), (1, 2.0))),
      Vectors.sparse(3, Seq((0, 10.0), (2, 20.0)))
    )
    val df = data.map(Tuple1.apply).toDF("features")
    val rowsRDD = df.rdd.map {
      case Row(features: Vector) => features
    }

    // coalesce(1) puts every row into a single partition so that exactly one
    // table is built from the whole data set.
    val tableHandles = rowsRDD.coalesce(1).mapPartitions { it =>
      val vectors: Array[Vector] = it.toArray
      val numColumns = vectors(0).size
      val csrTable = OneDAL.vectorsToSparseNumericTable(vectors, numColumns)
      // Only the value returned by getCNumericTable is sent back to the driver.
      // NOTE(review): presumably a native handle that is only valid in the same
      // process (local mode) -- confirm against OneDAL.makeNumericTable.
      Iterator(csrTable.getCNumericTable)
    }.collect()

    val csr = OneDAL.makeNumericTable(tableHandles(0))
    val resultMatrix = OneDAL.numericTableToMatrix(csr)
    val expectedMatrix = Matrices.fromVectors(data)

    // Compare the arrays directly so that a failure reports their contents
    // instead of just "false did not equal true".
    assert(resultMatrix.toArray === expectedMatrix.toArray)
  }
}
3 changes: 2 additions & 1 deletion mllib-dal/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ suiteArray=(
"classification.MLlibNaiveBayesSuite" \
"regression.MLlibLinearRegressionSuite" \
"stat.MLlibCorrelationSuite" \
"stat.MultivariateOnlineSummarizerSuite"
"stat.MultivariateOnlineSummarizerSuite" \
"oneDALSuite"
)

MVN_NO_TRANSFER_PROGRESS=
Expand Down

0 comments on commit 3a68664

Please sign in to comment.