Skip to content

Commit

Permalink
feat(api): add levenshtein edit distance API
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored and kszucs committed Aug 4, 2023
1 parent 304edd1 commit ab211a8
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 0 deletions.
1 change: 1 addition & 0 deletions ci/schema/postgres.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ CREATE EXTENSION IF NOT EXISTS postgis;
CREATE EXTENSION IF NOT EXISTS plpython3u;
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS first_last_agg;
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;

DROP TABLE IF EXISTS diamonds CASCADE;

Expand Down
1 change: 1 addition & 0 deletions ibis/backends/oracle/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def _string_join(t, op):
# Generic
ops.Hash: unary(sa.func.ora_hash),
ops.Literal: _literal,
ops.Levenshtein: fixed_arity(sa.func.utl_match.edit_distance, 2),
}
)

Expand Down
1 change: 1 addition & 0 deletions ibis/backends/postgres/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,5 +715,6 @@ def _unnest(t, op):
ops.ExtractMicrosecond: fixed_arity(
lambda arg: sa.extract("microsecond", arg) % 1_000_000, 1
),
ops.Levenshtein: fixed_arity(sa.func.levenshtein, 2),
}
)
7 changes: 7 additions & 0 deletions ibis/backends/pyspark/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2059,3 +2059,10 @@ def compile_hash_column(t, op, **kwargs):
@compiles(ops.ArrayZip)
def compile_zip(t, op, **kwargs):
    """Compile ``ops.ArrayZip`` into a ``F.arrays_zip`` call.

    Each element of ``op.arg`` is translated with ``t.translate`` (forwarding
    ``kwargs``) and the translated results are passed positionally to
    ``F.arrays_zip``.
    """
    translated = [t.translate(array, **kwargs) for array in op.arg]
    return F.arrays_zip(*translated)


@compiles(ops.Levenshtein)
def compile_levenshtein(t, op, **kwargs):
    """Compile ``ops.Levenshtein`` into a ``F.levenshtein`` call.

    ``op.left`` and ``op.right`` are translated (left first) with
    ``t.translate``, forwarding ``kwargs`` to each translation.
    """
    return F.levenshtein(
        t.translate(op.left, **kwargs),
        t.translate(op.right, **kwargs),
    )
1 change: 1 addition & 0 deletions ibis/backends/snowflake/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,7 @@ def _map_get(t, op):
ops.ApproxMedian: reduction(lambda x: sa.func.approx_percentile(x, 0.5)),
ops.Median: reduction(sa.func.median),
ops.TableColumn: _table_column,
ops.Levenshtein: fixed_arity(sa.func.editdistance, 2),
}
)

Expand Down
25 changes: 25 additions & 0 deletions ibis/backends/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1023,3 +1023,28 @@ def test_multiple_subs(con):
expr = ibis.literal("foo").substitute(m)
result = con.execute(expr)
assert result == "FOO"


@pytest.mark.notimpl(
    [
        "clickhouse",
        "dask",
        "datafusion",
        "druid",
        "impala",
        "mssql",
        "mysql",
        "pandas",
        "polars",
        "sqlite",
    ],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.parametrize(
    "right",
    [
        pytest.param("sitting", id="python"),
        pytest.param(ibis.literal("sitting"), id="ibis"),
    ],
)
def test_levenshtein(con, right):
    """Check that "kitten" -> "sitting" has an edit distance of 3.

    The right-hand operand is supplied both as a plain Python string and as
    an ibis literal.
    """
    distance = con.execute(ibis.literal("kitten").levenshtein(right))
    assert distance == 3
1 change: 1 addition & 0 deletions ibis/backends/trino/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ def _try_cast(t, op):
lambda arg: sa.cast(sa.func.date_format(arg, "%f"), sa.INTEGER()),
1,
),
ops.Levenshtein: fixed_arity(sa.func.levenshtein_distance, 2),
}
)

Expand Down
9 changes: 9 additions & 0 deletions ibis/expr/operations/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,12 @@ class StringContains(Value):

output_shape = rlz.shape_like("args")
output_dtype = dt.bool


@public
class Levenshtein(Value):
    """Operation node for the Levenshtein distance between two strings."""

    # Both operands are validated as strings.
    left = rlz.string
    right = rlz.string

    # Shape follows the arguments; the result dtype is int64.
    output_shape = rlz.shape_like("args")
    output_dtype = dt.int64
23 changes: 23 additions & 0 deletions ibis/expr/types/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1515,6 +1515,29 @@ def __mul__(self, n: int | ir.IntegerValue) -> StringValue | NotImplemented:

__rmul__ = __mul__

def levenshtein(self, other: str | StringValue) -> ir.IntegerValue:
    """Return the Levenshtein distance between two strings.

    Parameters
    ----------
    other
        String to compare to. May be a plain Python string or a string
        expression.

    Returns
    -------
    IntegerValue
        The edit distance between the two strings

    Examples
    --------
    >>> import ibis
    >>> ibis.options.interactive = True
    >>> s = ibis.literal("kitten")
    >>> s.levenshtein("sitting")
    3
    """
    return ops.Levenshtein(self, other).to_expr()


@public
class StringScalar(Scalar, StringValue):
Expand Down

1 comment on commit ab211a8

@ibis-squawk-bot
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark.
The benchmark result of this commit is worse than the previous result by a ratio exceeding the threshold of 3.

| Benchmark suite | Current: ab211a8 | Previous: 304edd1 | Ratio |
|---|---|---|---|
| ibis/tests/benchmarks/test_benchmarks.py::test_compile[small-clickhouse] | 975.1203171807895 iter/sec (stddev: 0.009809930323390175) | 3660.137477033329 iter/sec (stddev: 0.00006278306113303521) | 3.75 |

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.