From f779561bd0e65cb0804fd997afe4bc9a85303361 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Sun, 3 May 2015 13:41:55 +0530 Subject: [PATCH] [SPARK-7328] Pyspark.mllib.linalg.Vectors: Missing items --- python/pyspark/mllib/linalg.py | 175 +++++++++++++++++++++++++++++++-- python/pyspark/mllib/tests.py | 27 ++++- 2 files changed, 194 insertions(+), 8 deletions(-) diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 9f3b0baf9f19f..be96c5b3246f9 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -25,6 +25,7 @@ import sys import array +from math import sqrt if sys.version >= '3': basestring = str @@ -208,9 +209,55 @@ def __init__(self, ar): ar = ar.astype(np.float64) self.array = ar + def toString(self): + """ + Convert DenseVector to string representation. + + >>> a = DenseVector([0, 1, 2, 3]) + >>> a.toString() + '[0.0,1.0,2.0,3.0]' + """ + return str(self) + + def copy(self): + return DenseVector(np.copy(self.array)) + + @staticmethod + def parse(vectorString): + """ + Parse string representation back into the DenseVector. + + >>> DenseVector.parse('[0.0,1.0,2.0,3.0]') + DenseVector([0.0, 1.0, 2.0, 3.0]) + """ + vectorString = vectorString[1:-1] + return DenseVector([float(val) for val in vectorString.split(',')]) + def __reduce__(self): return DenseVector, (self.array.tostring(),) + def numNonzeros(self): + return np.nonzero(self.array)[0].size + + def norm(self, p): + """ + Calculate the norm of a DenseVector. + + >>> a = DenseVector([0, -1, 2, -3]) + >>> a.norm(2) + 3.7... + >>> a.norm(1) + 6.0 + """ + if p == 1: + return np.sum(np.abs(self.array)) + elif p == 2: + return sqrt(np.dot(self.array, self.array)) + elif p == np.inf: + return np.max(np.abs(self.array)) + else: + return pow(np.power(self.array, p), 1.0 / p) + def dot(self, other): """ Compute the dot product of two Vectors. 
We support @@ -387,9 +434,60 @@ def __init__(self, size, *args): if self.indices[i] >= self.indices[i + 1]: raise TypeError("indices array must be sorted") + def copy(self): + return SparseVector(self.size, np.copy(self.indices), np.copy(self.values)) + + def numNonzeros(self): + return np.nonzero(self.values)[0].size + + def norm(self, p): + """ + Calculate the norm of a SparseVector. + + >>> a = SparseVector(4, [0, 1], [3., -4.]) + >>> a.norm(1) + 7.0 + >>> a.norm(2) + 5.0 + """ + if p == 1: + return np.sum(np.abs(self.values)) + elif p == 2: + return sqrt(np.dot(self.values, self.values)) + elif p == np.inf: + return np.max(np.abs(self.values)) + else: + return pow(np.power(self.values, p), 1.0 / p) + def __reduce__(self): return (SparseVector, (self.size, self.indices.tostring(), self.values.tostring())) + def toString(self): + """ + Convert SparseVector to string representation. + + >>> a = SparseVector(4, [0, 1], [4, 5]) + >>> a.toString() + '(4,[0,1],[4.0,5.0])' + """ + return str(self) + + @staticmethod + def parse(vectorString): + """ + Parse string representation back into the SparseVector. + + >>> SparseVector.parse('(4,[0,1],[4.0,5.0])') + SparseVector(4, {0: 4.0, 1: 5.0}) + """ + size = int(vectorString[1]) + ind_end = vectorString.find(']') + index_string = vectorString[4: ind_end] + indices = [int(ind) for ind in index_string.split(',')] + value_string = vectorString[ind_end + 3: -2] + values = [float(val) for val in value_string.split(',')] + return SparseVector(size, indices, values) + def dot(self, other): """ Dot product with a SparseVector or 1- or 2-dimensional Numpy array. 
@@ -430,12 +528,15 @@ def dot(self, other): assert len(self) == _vector_size(other), "dimension mismatch" - if type(other) in (np.ndarray, array.array, DenseVector): + if type(other) in (np.ndarray, array.array): result = 0.0 - for i in xrange(len(self.indices)): - result += self.values[i] * other[self.indices[i]] + for i, ind in enumerate(self.indices): + result += self.values[i] * other[ind] return result + elif isinstance(other, DenseVector): + return np.dot(other.toArray()[self.indices], self.values) + elif type(other) is SparseVector: result = 0.0 i, j = 0, 0 @@ -479,19 +580,28 @@ def squared_distance(self, other): AssertionError: dimension mismatch """ assert len(self) == _vector_size(other), "dimension mismatch" - if type(other) in (list, array.array, DenseVector, np.array, np.ndarray): + if type(other) in (list, array.array, np.array, np.ndarray): if type(other) is np.array and other.ndim != 1: raise Exception("Cannot call squared_distance with %d-dimensional array" % other.ndim) result = 0.0 j = 0 # index into our own array - for i in xrange(len(other)): + for i, other_ind in enumerate(other): if j < len(self.indices) and self.indices[j] == i: - diff = self.values[j] - other[i] + diff = self.values[j] - other_ind result += diff * diff j += 1 else: - result += other[i] * other[i] + result += other_ind * other_ind + return result + + elif isinstance(other, DenseVector): + bool_ind = np.zeros(len(other), dtype=bool) + bool_ind[self.indices] = True + dist = other.toArray()[bool_ind] - self.values + result = np.dot(dist, dist) + other_values = other.toArray()[~bool_ind] + result += np.dot(other_values, other_values) return result elif type(other) is SparseVector: @@ -633,6 +743,57 @@ def stringify(vector): """ return str(vector) + @staticmethod + def dot(a, b): + """ + Dot product between two vectors. + a and b can be of type, SparseVector, DenseVector, np.ndarray + or array.array. 
+ + >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) + >>> b = Vectors.dense([23, 41, 9, 1]) + >>> Vectors.dot(a, b) + 27.0 + >>> Vectors.dot(a, a) + 17.0 + >>> Vectors.dot(a, np.array([0, 1, 2, 4])) + 16.0 + """ + a, b = _convert_to_vector(a), _convert_to_vector(b) + return a.dot(b) + + @staticmethod + def squared_distance(a, b): + """ + Squared distance between two vectors. + a and b can be of type, SparseVector, DenseVector, np.ndarray + or array.array. + + >>> a = Vectors.sparse(4, [(0, 1), (3, 4)]) + >>> b = Vectors.dense([2, 5, 4, 1]) + >>> a.squared_distance(b) + 51.0 + """ + a, b = _convert_to_vector(a), _convert_to_vector(b) + return a.squared_distance(b) + + @staticmethod + def norm(vec, p): + """ + Find norm of the given vector. + """ + return _convert_to_vector(vec).norm(p) + + @staticmethod + def parse(vectorString): + if vectorString[0] == '[': + return DenseVector.parse(vectorString) + return SparseVector.parse(vectorString) + + @staticmethod + def zeros(num): + return DenseVector(np.zeros(num)) + class Matrix(object): """ diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index d05cfe2af04b2..19b3989b7d9f2 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -24,7 +24,7 @@ import tempfile import array as pyarray -from numpy import array, array_equal, zeros +from numpy import array, array_equal, zeros, inf from py4j.protocol import Py4JJavaError if sys.version_info[:2] <= (2, 6): @@ -110,6 +110,10 @@ def test_dot(self): self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat))) self.assertEquals(30.0, lst.dot(dv)) self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat))) + self.assertEquals(Vectors.dot(sv, sv), 5.) + self.assertEquals(Vectors.dot(sv, dv), 10.) + self.assertEquals(Vectors.dot(dv, sv), 10.) 
+ self.assertEquals(Vectors.dot(sv, array([2, 5, 7, 8])), 21.0) def test_squared_distance(self): sv = SparseVector(4, {1: 1, 3: 2}) @@ -220,6 +224,27 @@ def test_dense_matrix_is_transposed(self): self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5])) self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9])) + def test_parse_matrix(self): + a = DenseVector([3, 4, 6, 7]) + self.assertTrue(a.toString(), '[3.0,4.0,6.0,7.0]') + self.assertTrue(Vectors.parse(a.toString()), a) + a = SparseVector(4, [0, 2], [3, 4]) + self.assertTrue(a.toString(), '(4,[0,2],[3.0,4.0])') + self.assertTrue(Vectors.parse(a.toString()), a) + + def test_norms(self): + a = DenseVector([0, 2, 3, -1]) + self.assertAlmostEqual(a.norm(2), 3.742, 3) + self.assertTrue(a.norm(1), 6) + self.assertTrue(a.norm(inf), 3) + a = SparseVector(4, [0, 2], [3, -4]) + self.assertAlmostEqual(a.norm(2), 5) + self.assertTrue(a.norm(1), 7) + self.assertTrue(a.norm(inf), 4) + + tmp = SparseVector(4, [0, 2], [3, 0]) + self.assertEqual(tmp.numNonzeros(), 1) + class ListTests(MLlibTestCase):