[SPARK-7328] Pyspark.mllib.linalg.Vectors: Missing items

nemccarthy · May 6, 2015 · f779561 · f779561
1 parent 32cdc81
commit f779561
Show file tree

Hide file tree

Showing 2 changed files with 194 additions and 8 deletions.
diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py
@@ -25,6 +25,7 @@
 
 import sys
 import array
+from math import sqrt
 
 if sys.version >= '3':
     basestring = str
@@ -208,9 +209,55 @@ def __init__(self, ar):
             ar = ar.astype(np.float64)
         self.array = ar
 
+    def toString(self):
+        """
+        Return this DenseVector rendered as a string, i.e. ``str(self)``.
+
+        >>> a = DenseVector([0, 1, 2, 3])
+        >>> a.toString()
+        '[0.0,1.0,2.0,3.0]'
+        """
+        text = str(self)
+        return text
+
+    def copy(self):
+        """Return a new DenseVector built from ``np.copy`` of this vector's array."""
+        duplicate = np.copy(self.array)
+        return DenseVector(duplicate)
+
+    @staticmethod
+    def parse(vectorString):
+        """
+        Parse string representation back into the DenseVector.
+
+        Surrounding whitespace is ignored and an empty body (``'[]'``)
+        yields a vector built from an empty list.
+
+        :param vectorString: text of the form ``'[v0,v1,...]'``.
+        :return: a DenseVector holding the parsed float values.
+        :raises ValueError: if the text is not enclosed in square brackets
+                            or an entry cannot be converted to float.
+
+        >>> DenseVector.parse('[0.0,1.0,2.0,3.0]')
+        DenseVector([0.0, 1.0, 2.0, 3.0])
+        >>> DenseVector.parse(' [0.0, 1.0] ')
+        DenseVector([0.0, 1.0])
+        """
+        stripped = vectorString.strip()
+        # Check the brackets explicitly instead of blindly dropping the first
+        # and last characters, which would silently mangle malformed input.
+        if len(stripped) < 2 or stripped[0] != '[' or stripped[-1] != ']':
+            raise ValueError("Unable to parse DenseVector from: %r" % (vectorString,))
+        body = stripped[1:-1].strip()
+        if not body:
+            # ''.split(',') gives [''], and float('') raises; handle '[]' here.
+            return DenseVector([])
+        return DenseVector([float(val) for val in body.split(',')])
+
     def __reduce__(self):
         return DenseVector, (self.array.tostring(),)
 
+    def numNonzeros(self):
+        """Return how many entries of this vector's array are non-zero."""
+        # np.nonzero returns one index array per dimension; the first one has
+        # one entry per non-zero element.
+        nonzero_positions = np.nonzero(self.array)[0]
+        return len(nonzero_positions)
+
+    def norm(self, p):
+        """
+        Calculate the p-norm of a DenseVector.
+
+        :param p: order of the norm; forwarded to ``numpy.linalg.norm`` as
+                  ``ord`` (e.g. 1, 2, ``np.inf``, or another order that
+                  function accepts for 1-D input).
+        :return: the norm as a Python float.
+
+        >>> a = DenseVector([0, -1, 2, -3])
+        >>> a.norm(2)
+        3.7...
+        >>> a.norm(1)
+        6.0
+        >>> a.norm(np.inf)
+        3.0
+        >>> round(a.norm(3), 4)
+        3.3019
+        """
+        # The previous general-p branch applied the power element-wise and
+        # never took absolute values or summed, so it returned an array
+        # rather than a scalar. numpy.linalg.norm computes
+        # sum(abs(x) ** p) ** (1 / p) for 1-D input and covers the 1, 2 and
+        # inf cases as well.
+        # NOTE(review): assumes self.array is 1-D; for a 2-D array
+        # numpy.linalg.norm would compute a matrix norm instead.
+        return float(np.linalg.norm(self.array, p))
+
     def dot(self, other):
         """
         Compute the dot product of two Vectors. We support
@@ -387,9 +434,60 @@ def __init__(self, size, *args):
                 if self.indices[i] >= self.indices[i + 1]:
                     raise TypeError("indices array must be sorted")
 
+    def copy(self):
+        """
+        Return a new SparseVector with the same size, built from ``np.copy``
+        of this vector's index and value arrays.
+        """
+        indices_copy = np.copy(self.indices)
+        values_copy = np.copy(self.values)
+        return SparseVector(self.size, indices_copy, values_copy)
+
+    def numNonzeros(self):
+        """
+        Return how many stored values are non-zero. Explicitly stored
+        zeros are not counted.
+        """
+        nonzero_positions = np.nonzero(self.values)[0]
+        return len(nonzero_positions)
+
+    def norm(self, p):
+        """
+        Calculate the p-norm of a SparseVector.
+
+        Only the stored values are used.
+
+        :param p: order of the norm; forwarded to ``numpy.linalg.norm`` as
+                  ``ord`` (e.g. 1, 2, ``np.inf``, or another order that
+                  function accepts for 1-D input).
+        :return: the norm as a Python float.
+
+        >>> a = SparseVector(4, [0, 1], [3., -4.])
+        >>> a.norm(1)
+        7.0
+        >>> a.norm(2)
+        5.0
+        >>> a.norm(np.inf)
+        4.0
+        """
+        if len(self.values) == 0:
+            # No stored entries: return 0.0 directly, because the inf-norm of
+            # an empty array would otherwise raise inside numpy.
+            return 0.0
+        # The previous general-p branch applied the power element-wise and
+        # never took absolute values or summed, so it returned an array
+        # rather than a scalar.
+        return float(np.linalg.norm(self.values, p))
+
     def __reduce__(self):
         return (SparseVector, (self.size, self.indices.tostring(), self.values.tostring()))
 
+    def toString(self):
+        """
+        Return this SparseVector rendered as a string, i.e. ``str(self)``.
+
+        >>> a = SparseVector(4, [0, 1], [4, 5])
+        >>> a.toString()
+        '(4,[0,1],[4.0,5.0])'
+        """
+        text = str(self)
+        return text
+
+    @staticmethod
+    def parse(vectorString):
+        """
+        Parse string representation back into the SparseVector.
+
+        The delimiters are located by searching rather than by fixed
+        offsets, so sizes with more than one digit, empty index/value lists
+        and surrounding whitespace are all accepted.
+
+        :param vectorString: text of the form ``'(size,[i0,i1,...],[v0,v1,...])'``.
+        :return: a SparseVector built from the parsed size, indices and values.
+        :raises ValueError: if the parentheses or brackets are missing, or a
+                            field cannot be converted to int/float.
+
+        >>> SparseVector.parse('(4,[0,1],[4.0,5.0])')
+        SparseVector(4, {0: 4.0, 1: 5.0})
+        >>> SparseVector.parse('(10,[0,9],[1.0,2.0])')
+        SparseVector(10, {0: 1.0, 9: 2.0})
+        """
+        text = vectorString.strip()
+        if len(text) < 2 or text[0] != '(' or text[-1] != ')':
+            raise ValueError("Unable to parse SparseVector from: %r" % (vectorString,))
+
+        # Everything before the first comma is the size; the rest holds the
+        # two bracketed lists.
+        size_part, comma, rest = text[1:-1].partition(',')
+        ind_open = rest.find('[')
+        ind_close = rest.find(']', ind_open + 1)
+        val_open = rest.find('[', ind_close + 1)
+        val_close = rest.find(']', val_open + 1)
+        if not comma or min(ind_open, ind_close, val_open, val_close) < 0:
+            raise ValueError("Unable to parse SparseVector from: %r" % (vectorString,))
+
+        def _items(segment, cast):
+            # An empty list body must give [], not [cast('')].
+            segment = segment.strip()
+            if not segment:
+                return []
+            return [cast(token) for token in segment.split(',')]
+
+        indices = _items(rest[ind_open + 1:ind_close], int)
+        values = _items(rest[val_open + 1:val_close], float)
+        return SparseVector(int(size_part), indices, values)
+
     def dot(self, other):
         """
         Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
@@ -430,12 +528,15 @@ def dot(self, other):
 
         assert len(self) == _vector_size(other), "dimension mismatch"
 
-        if type(other) in (np.ndarray, array.array, DenseVector):
+        if type(other) in (np.ndarray, array.array):
             result = 0.0
-            for i in xrange(len(self.indices)):
-                result += self.values[i] * other[self.indices[i]]
+            for i, ind in enumerate(self.indices):
+                result += self.values[i] * other[ind]
             return result
 
+        elif isinstance(other, DenseVector):
+            return np.dot(other.toArray()[self.indices], self.values)
+
         elif type(other) is SparseVector:
             result = 0.0
             i, j = 0, 0
@@ -479,19 +580,28 @@ def squared_distance(self, other):
         AssertionError: dimension mismatch
         """
         assert len(self) == _vector_size(other), "dimension mismatch"
-        if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
+        if type(other) in (list, array.array, np.array, np.ndarray):
             if type(other) is np.array and other.ndim != 1:
                 raise Exception("Cannot call squared_distance with %d-dimensional array" %
                                 other.ndim)
             result = 0.0
             j = 0   # index into our own array
-            for i in xrange(len(other)):
+            for i, other_ind in enumerate(other):
                 if j < len(self.indices) and self.indices[j] == i:
-                    diff = self.values[j] - other[i]
+                    diff = self.values[j] - other_ind
                     result += diff * diff
                     j += 1
                 else:
-                    result += other[i] * other[i]
+                    result += other_ind * other_ind
+            return result
+
+        elif isinstance(other, DenseVector):
+            bool_ind = np.zeros(len(other), dtype=bool)
+            bool_ind[self.indices] = True
+            dist = other.toArray()[bool_ind] - self.values
+            result = np.dot(dist, dist)
+            other_values = other.toArray()[~bool_ind]
+            result += np.dot(other_values, other_values)
             return result
 
         elif type(other) is SparseVector:
@@ -633,6 +743,57 @@ def stringify(vector):
         """
         return str(vector)
 
+    @staticmethod
+    def dot(a, b):
+        """
+        Dot product between two vectors.
+
+        Both arguments are first passed through ``_convert_to_vector``; the
+        result is ``a.dot(b)`` on the converted values. ``a`` and ``b`` can
+        be a SparseVector, DenseVector, np.ndarray or array.array.
+
+        >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
+        >>> b = Vectors.dense([23, 41, 9, 1])
+        >>> Vectors.dot(a, b)
+        27.0
+        >>> Vectors.dot(a, a)
+        17.0
+        >>> Vectors.dot(a, np.array([0, 1, 2, 4]))
+        16.0
+        """
+        left = _convert_to_vector(a)
+        right = _convert_to_vector(b)
+        return left.dot(right)
+
+    @staticmethod
+    def squared_distance(a, b):
+        """
+        Squared distance between two vectors.
+
+        Both arguments are first passed through ``_convert_to_vector``; the
+        result is ``a.squared_distance(b)`` on the converted values. ``a``
+        and ``b`` can be a SparseVector, DenseVector, np.ndarray or
+        array.array.
+
+        >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
+        >>> b = Vectors.dense([2, 5, 4, 1])
+        >>> Vectors.squared_distance(a, b)
+        51.0
+        """
+        a, b = _convert_to_vector(a), _convert_to_vector(b)
+        return a.squared_distance(b)
+
+    @staticmethod
+    def norm(vec, p):
+        """
+        Convert ``vec`` with ``_convert_to_vector`` and return the result of
+        calling ``norm(p)`` on the converted vector.
+        """
+        vector = _convert_to_vector(vec)
+        return vector.norm(p)
+
+    @staticmethod
+    def parse(vectorString):
+        """
+        Parse a string into a DenseVector or a SparseVector.
+
+        After stripping surrounding whitespace, text starting with ``'['``
+        is handed to ``DenseVector.parse`` and text starting with ``'('`` to
+        ``SparseVector.parse``.
+
+        :param vectorString: string form of a dense or sparse vector.
+        :return: whatever the selected ``parse`` method returns.
+        :raises ValueError: if the text starts with neither ``'['`` nor
+                            ``'('`` (including the empty string).
+        """
+        stripped = vectorString.strip()
+        # startswith avoids the IndexError that indexing [0] raises on ''.
+        if stripped.startswith('['):
+            return DenseVector.parse(stripped)
+        if stripped.startswith('('):
+            return SparseVector.parse(stripped)
+        raise ValueError("Cannot tell whether this is a dense or a sparse vector: %r"
+                         % (vectorString,))
+
+    @staticmethod
+    def zeros(num):
+        """Return a DenseVector wrapping ``np.zeros(num)``."""
+        zero_array = np.zeros(num)
+        return DenseVector(zero_array)
+
 
 class Matrix(object):
     """

diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
@@ -24,7 +24,7 @@
 import tempfile
 import array as pyarray
 
-from numpy import array, array_equal, zeros
+from numpy import array, array_equal, zeros, inf
 from py4j.protocol import Py4JJavaError
 
 if sys.version_info[:2] <= (2, 6):
@@ -110,6 +110,10 @@ def test_dot(self):
         self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
         self.assertEquals(30.0, lst.dot(dv))
         self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
+        self.assertEquals(Vectors.dot(sv, sv), 5.)
+        self.assertEquals(Vectors.dot(sv, dv), 10.)
+        self.assertEquals(Vectors.dot(dv, sv), 10.)
+        self.assertEquals(Vectors.dot(sv, array([2, 5, 7, 8])), 21.0)
 
     def test_squared_distance(self):
         sv = SparseVector(4, {1: 1, 3: 2})
@@ -220,6 +224,27 @@ def test_dense_matrix_is_transposed(self):
         self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
         self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
 
+    def test_parse_matrix(self):
+        """
+        Check that toString() produces the expected text for dense and
+        sparse vectors and that Vectors.parse() turns it back into an
+        equal vector. (Despite the name, this exercises vectors.)
+        """
+        # assertTrue(x, y) treats y as the failure message and passes for any
+        # truthy x, so these comparisons must use assertEqual.
+        # NOTE(review): the round-trip checks rely on DenseVector and
+        # SparseVector defining __eq__ -- confirm.
+        a = DenseVector([3, 4, 6, 7])
+        self.assertEqual(a.toString(), '[3.0,4.0,6.0,7.0]')
+        self.assertEqual(Vectors.parse(a.toString()), a)
+        a = SparseVector(4, [0, 2], [3, 4])
+        self.assertEqual(a.toString(), '(4,[0,2],[3.0,4.0])')
+        self.assertEqual(Vectors.parse(a.toString()), a)
+
+    def test_norms(self):
+        """
+        Check the 1-, 2- and inf-norms of a dense and a sparse vector, and
+        that numNonzeros() ignores an explicitly stored zero.
+        """
+        # assertTrue(x, y) treats y as the failure message and passes for any
+        # truthy x, so the norm values are compared with assertAlmostEqual.
+        a = DenseVector([0, 2, 3, -1])
+        self.assertAlmostEqual(a.norm(2), 3.742, 3)
+        self.assertAlmostEqual(a.norm(1), 6)
+        self.assertAlmostEqual(a.norm(inf), 3)
+        a = SparseVector(4, [0, 2], [3, -4])
+        self.assertAlmostEqual(a.norm(2), 5)
+        self.assertAlmostEqual(a.norm(1), 7)
+        self.assertAlmostEqual(a.norm(inf), 4)
+
+        tmp = SparseVector(4, [0, 2], [3, 0])
+        self.assertEqual(tmp.numNonzeros(), 1)
+
 
 class ListTests(MLlibTestCase):