Skip to content

Commit

Permalink
[SPARK-26315][PySpark] auto cast threshold from Integer to Float in a…
Browse files Browse the repository at this point in the history
…pproxSimilarityJoin of BucketedRandomProjectionLSHModel
  • Loading branch information
Jing Chen He committed Dec 13, 2018
1 parent 8edae94 commit eb00074
Showing 1 changed file with 11 additions and 0 deletions.
11 changes: 11 additions & 0 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol="distCol")
"datasetA" and "datasetB", and a column "distCol" is added to show the distance
between each pair.
"""
threshold = TypeConverters.toFloat(threshold)
return self._call_java("approxSimilarityJoin", datasetA, datasetB, threshold, distCol)


Expand Down Expand Up @@ -239,6 +240,16 @@ class BucketedRandomProjectionLSH(JavaEstimator, LSHParams, HasInputCol, HasOutp
| 3| 6| 2.23606797749979|
+---+---+-----------------+
...
>>> model.approxSimilarityJoin(df, df2, 3, distCol="EuclideanDistance").select(
... col("datasetA.id").alias("idA"),
... col("datasetB.id").alias("idB"),
... col("EuclideanDistance")).show()
+---+---+-----------------+
|idA|idB|EuclideanDistance|
+---+---+-----------------+
| 3| 6| 2.23606797749979|
+---+---+-----------------+
...
>>> brpPath = temp_path + "/brp"
>>> brp.save(brpPath)
>>> brp2 = BucketedRandomProjectionLSH.load(brpPath)
Expand Down

0 comments on commit eb00074

Please sign in to comment.