Skip to content

Commit

Permalink
[SPARK-7328] Pyspark.mllib.linalg.Vectors: Missing items
Browse files Browse the repository at this point in the history
  • Loading branch information
MechCoder committed May 6, 2015
1 parent 32cdc81 commit f779561
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 8 deletions.
175 changes: 168 additions & 7 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import sys
import array
from math import sqrt

if sys.version >= '3':
basestring = str
Expand Down Expand Up @@ -208,9 +209,55 @@ def __init__(self, ar):
ar = ar.astype(np.float64)
self.array = ar

def toString(self):
    """
    Return the string form of this DenseVector, i.e. ``str(self)``.

    >>> a = DenseVector([0, 1, 2, 3])
    >>> a.toString()
    '[0.0,1.0,2.0,3.0]'
    """
    text = str(self)
    return text

def copy(self):
    """Return a new DenseVector built from an ``np.copy`` of this vector's array."""
    duplicated = np.copy(self.array)
    return DenseVector(duplicated)

@staticmethod
def parse(vectorString):
    """
    Parse string representation back into the DenseVector.

    Leading/trailing whitespace is ignored and ``'[]'`` yields a vector
    built from an empty list.

    :param vectorString: text of the form ``'[v0,v1,...]'``.
    :return: DenseVector built from the parsed float values.
    :raises ValueError: if the enclosing brackets are missing or an
                        entry cannot be converted to float.

    >>> DenseVector.parse('[0.0,1.0,2.0,3.0]')
    DenseVector([0.0, 1.0, 2.0, 3.0])
    """
    text = vectorString.strip()
    # Refuse input without brackets instead of silently dropping its
    # first and last characters.
    if len(text) < 2 or text[0] != '[' or text[-1] != ']':
        raise ValueError("Unable to parse DenseVector from '%s'" % vectorString)
    body = text[1:-1].strip()
    # ''.split(',') gives [''], which float() rejects, so the empty
    # vector needs its own path.
    values = [float(val) for val in body.split(',')] if body else []
    return DenseVector(values)

def __reduce__(self):
    """
    Return the ``(callable, args)`` pair for the ``__reduce__`` protocol:
    the DenseVector class and a 1-tuple holding ``self.array.tostring()``.
    """
    raw = self.array.tostring()
    return DenseVector, (raw,)

def numNonzeros(self):
    """Return how many entries of ``self.array`` are nonzero."""
    nonzero_positions = np.nonzero(self.array)[0]
    return nonzero_positions.size

def norm(self, p):
    """
    Calculate the p-norm of a DenseVector.

    :param p: order of the norm. 1, 2 and ``np.inf`` use dedicated
              formulas; any other value uses
              ``(sum(|x_i| ** p)) ** (1 / p)``.
    :return: the norm as a scalar.

    >>> a = DenseVector([0, -1, 2, -3])
    >>> a.norm(2)
    3.7...
    >>> a.norm(1)
    6.0
    """
    magnitudes = np.abs(self.array)
    if p == 1:
        return np.sum(magnitudes)
    elif p == 2:
        return sqrt(np.dot(self.array, self.array))
    elif p == np.inf:
        return np.max(magnitudes)
    else:
        # Reduce |x_i| ** p to a scalar before taking the p-th root;
        # without the sum the result is an element-wise array, and
        # without abs() odd p gives wrong values for negative entries.
        return float(np.power(np.sum(np.power(magnitudes, p)), 1.0 / p))

def dot(self, other):
"""
Compute the dot product of two Vectors. We support
Expand Down Expand Up @@ -387,9 +434,60 @@ def __init__(self, size, *args):
if self.indices[i] >= self.indices[i + 1]:
raise TypeError("indices array must be sorted")

def copy(self):
    """
    Return a new SparseVector with the same size, built from ``np.copy``
    duplicates of this vector's indices and values arrays.
    """
    indices_copy = np.copy(self.indices)
    values_copy = np.copy(self.values)
    return SparseVector(self.size, indices_copy, values_copy)

def numNonzeros(self):
    """
    Return how many stored values are nonzero (explicitly stored zeros
    are not counted).
    """
    nonzero_positions = np.nonzero(self.values)[0]
    return nonzero_positions.size

def norm(self, p):
    """
    Calculate the p-norm of a SparseVector.

    Only the stored values are used; entries that are not stored are
    zero and do not contribute.

    :param p: order of the norm. 1, 2 and ``np.inf`` use dedicated
              formulas; any other value uses
              ``(sum(|x_i| ** p)) ** (1 / p)``.
    :return: the norm as a scalar.

    >>> a = SparseVector(4, [0, 1], [3., -4.])
    >>> a.norm(1)
    7.0
    >>> a.norm(2)
    5.0
    """
    magnitudes = np.abs(self.values)
    if p == 1:
        return np.sum(magnitudes)
    elif p == 2:
        return sqrt(np.dot(self.values, self.values))
    elif p == np.inf:
        # np.max raises on an empty array; a vector with no stored
        # values is all zeros, so its max-norm is 0.
        if magnitudes.size == 0:
            return 0.0
        return np.max(magnitudes)
    else:
        # Reduce |x_i| ** p to a scalar before taking the p-th root;
        # without the sum the result is an element-wise array, and
        # without abs() odd p gives wrong values for negative entries.
        return float(np.power(np.sum(np.power(magnitudes, p)), 1.0 / p))

def __reduce__(self):
    """
    Return the ``(callable, args)`` pair for the ``__reduce__`` protocol:
    the SparseVector class plus the size and the ``tostring()`` output of
    the indices and values arrays.
    """
    constructor_args = (self.size, self.indices.tostring(), self.values.tostring())
    return SparseVector, constructor_args

def toString(self):
    """
    Return the string form of this SparseVector, i.e. ``str(self)``.

    >>> a = SparseVector(4, [0, 1], [4, 5])
    >>> a.toString()
    '(4,[0,1],[4.0,5.0])'
    """
    text = str(self)
    return text

@staticmethod
def parse(vectorString):
    """
    Parse string representation back into the SparseVector.

    The size may have any number of digits, and empty index/value
    lists (``'(4,[],[])'``) are accepted.

    :param vectorString: text of the form ``'(size,[i0,i1,...],[v0,v1,...])'``.
    :return: SparseVector built from the parsed size, indices and values.
    :raises ValueError: if the parentheses or either bracketed list is
                        missing, or a field cannot be converted to a number.

    >>> SparseVector.parse('(4,[0,1],[4.0,5.0])')
    SparseVector(4, {0: 4.0, 1: 5.0})
    """
    text = vectorString.strip()
    if len(text) < 2 or text[0] != '(' or text[-1] != ')':
        raise ValueError("Unable to parse SparseVector from '%s'" % vectorString)

    # Split the size off at the first comma instead of assuming it is a
    # single character, so sizes of 10 and above parse correctly.
    size_text, _, rest = text[1:-1].partition(',')

    # Locate both bracketed lists by their delimiters, not by fixed offsets.
    ind_start = rest.find('[')
    ind_end = rest.find(']', ind_start + 1)
    val_start = rest.find('[', ind_end + 1)
    val_end = rest.find(']', val_start + 1)
    if min(ind_start, ind_end, val_start, val_end) < 0:
        raise ValueError("Unable to parse SparseVector from '%s'" % vectorString)

    size = int(size_text)
    index_string = rest[ind_start + 1:ind_end].strip()
    value_string = rest[val_start + 1:val_end].strip()
    # ''.split(',') gives [''], which int()/float() reject, so empty
    # lists need their own path.
    indices = [int(ind) for ind in index_string.split(',')] if index_string else []
    values = [float(val) for val in value_string.split(',')] if value_string else []
    return SparseVector(size, indices, values)

def dot(self, other):
"""
Dot product with a SparseVector or 1- or 2-dimensional Numpy array.
Expand Down Expand Up @@ -430,12 +528,15 @@ def dot(self, other):

assert len(self) == _vector_size(other), "dimension mismatch"

if type(other) in (np.ndarray, array.array, DenseVector):
if type(other) in (np.ndarray, array.array):
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
for i, ind in enumerate(self.indices):
result += self.values[i] * other[ind]
return result

elif isinstance(other, DenseVector):
return np.dot(other.toArray()[self.indices], self.values)

elif type(other) is SparseVector:
result = 0.0
i, j = 0, 0
Expand Down Expand Up @@ -479,19 +580,28 @@ def squared_distance(self, other):
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
if type(other) in (list, array.array, np.array, np.ndarray):
if type(other) is np.array and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
other.ndim)
result = 0.0
j = 0 # index into our own array
for i in xrange(len(other)):
for i, other_ind in enumerate(other):
if j < len(self.indices) and self.indices[j] == i:
diff = self.values[j] - other[i]
diff = self.values[j] - other_ind
result += diff * diff
j += 1
else:
result += other[i] * other[i]
result += other_ind * other_ind
return result

elif isinstance(other, DenseVector):
bool_ind = np.zeros(len(other), dtype=bool)
bool_ind[self.indices] = True
dist = other.toArray()[bool_ind] - self.values
result = np.dot(dist, dist)
other_values = other.toArray()[~bool_ind]
result += np.dot(other_values, other_values)
return result

elif type(other) is SparseVector:
Expand Down Expand Up @@ -633,6 +743,57 @@ def stringify(vector):
"""
return str(vector)

@staticmethod
def dot(a, b):
    """
    Dot product of two vectors.

    Both arguments are first passed through ``_convert_to_vector``; the
    result is ``a.dot(b)`` on the converted values. a and b can be of
    type SparseVector, DenseVector, np.ndarray or array.array.

    >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
    >>> b = Vectors.dense([23, 41, 9, 1])
    >>> Vectors.dot(a, b)
    27.0
    >>> Vectors.dot(a, a)
    17.0
    >>> Vectors.dot(a, np.array([0, 1, 2, 4]))
    16.0
    """
    left = _convert_to_vector(a)
    right = _convert_to_vector(b)
    return left.dot(right)

@staticmethod
def squared_distance(a, b):
    """
    Squared distance between two vectors.

    Both arguments are first passed through ``_convert_to_vector``; the
    result is ``a.squared_distance(b)`` on the converted values. a and b
    can be of type SparseVector, DenseVector, np.ndarray or array.array.

    >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])
    >>> b = Vectors.dense([2, 5, 4, 1])
    >>> Vectors.squared_distance(a, b)
    51.0
    """
    left = _convert_to_vector(a)
    right = _convert_to_vector(b)
    return left.squared_distance(right)

@staticmethod
def norm(vec, p):
    """
    Return the p-norm of ``vec``: the input is passed through
    ``_convert_to_vector`` and the result's ``norm(p)`` is returned.
    """
    as_vector = _convert_to_vector(vec)
    return as_vector.norm(p)

@staticmethod
def parse(vectorString):
    """
    Parse a vector from its string form, dispatching on the opening
    delimiter: ``'['`` goes to ``DenseVector.parse`` and ``'('`` goes to
    ``SparseVector.parse``. Surrounding whitespace is ignored.

    :param vectorString: string form of a dense or sparse vector.
    :return: whatever the selected ``parse`` method returns.
    :raises ValueError: if the string is empty or starts with neither
                        ``'['`` nor ``'('``.
    """
    text = vectorString.strip()
    # startswith() is safe on an empty string, unlike indexing [0].
    if text.startswith('['):
        return DenseVector.parse(text)
    if text.startswith('('):
        return SparseVector.parse(text)
    raise ValueError("Cannot parse a vector from '%s'" % vectorString)

@staticmethod
def zeros(num):
    """Return a DenseVector wrapping ``np.zeros(num)``."""
    zero_array = np.zeros(num)
    return DenseVector(zero_array)


class Matrix(object):
"""
Expand Down
27 changes: 26 additions & 1 deletion python/pyspark/mllib/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import tempfile
import array as pyarray

from numpy import array, array_equal, zeros
from numpy import array, array_equal, zeros, inf
from py4j.protocol import Py4JJavaError

if sys.version_info[:2] <= (2, 6):
Expand Down Expand Up @@ -110,6 +110,10 @@ def test_dot(self):
self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
self.assertEquals(30.0, lst.dot(dv))
self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
self.assertEquals(Vectors.dot(sv, sv), 5.)
self.assertEquals(Vectors.dot(sv, dv), 10.)
self.assertEquals(Vectors.dot(dv, sv), 10.)
self.assertEquals(Vectors.dot(sv, array([2, 5, 7, 8])), 21.0)

def test_squared_distance(self):
sv = SparseVector(4, {1: 1, 3: 2})
Expand Down Expand Up @@ -220,6 +224,27 @@ def test_dense_matrix_is_transposed(self):
self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

def test_parse_matrix(self):
    """Round-trip dense and sparse vectors through toString() / Vectors.parse()."""
    # assertTrue(x, msg) only checks that x is truthy, so the expected
    # values must be compared with assertEqual / array_equal.
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(a.toString(), '[3.0,4.0,6.0,7.0]')
    parsed = Vectors.parse(a.toString())
    self.assertTrue(isinstance(parsed, DenseVector))
    self.assertTrue(array_equal(parsed.array, a.array))

    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(a.toString(), '(4,[0,2],[3.0,4.0])')
    parsed = Vectors.parse(a.toString())
    self.assertTrue(isinstance(parsed, SparseVector))
    self.assertEqual(parsed.size, a.size)
    self.assertTrue(array_equal(parsed.indices, a.indices))
    self.assertTrue(array_equal(parsed.values, a.values))

def test_norms(self):
    """Check norm() for p in (1, 2, inf) on both vector types, plus numNonzeros()."""
    # assertTrue(x, msg) only checks that x is truthy; assertEqual
    # actually compares the norm against the expected value.
    a = DenseVector([0, 2, 3, -1])
    self.assertAlmostEqual(a.norm(2), 3.742, 3)
    self.assertEqual(a.norm(1), 6)
    self.assertEqual(a.norm(inf), 3)
    a = SparseVector(4, [0, 2], [3, -4])
    self.assertAlmostEqual(a.norm(2), 5)
    self.assertEqual(a.norm(1), 7)
    self.assertEqual(a.norm(inf), 4)

    # An explicitly stored zero must not count as a nonzero.
    tmp = SparseVector(4, [0, 2], [3, 0])
    self.assertEqual(tmp.numNonzeros(), 1)


class ListTests(MLlibTestCase):

Expand Down

0 comments on commit f779561

Please sign in to comment.