Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Shape fields added, with support for LatLon and XY geometries. #23

Merged
merged 1 commit into from
Feb 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ PyLucene is not `pip` installable.
dev

* Python >=3.8 required
* Shape fields

3.0

Expand Down
2 changes: 1 addition & 1 deletion docs/engine.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

::: lupyne.engine.documents.DateTimeField

::: lupyne.engine.documents.SpatialField
::: lupyne.engine.documents.ShapeField

## queries

Expand Down
19 changes: 13 additions & 6 deletions docs/examples.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,14 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## searching\n",
"Advanced searching with custom fields.\n",
"\n",
"Lupyne SpatialFields and DateTimeFields are implemented as lucene Point fields.\n",
"Lupyne ShapeFields and DateTimeFields are implemented as lucene Shape and Point fields.\n",
"NestedFields simulate a composite index.\n",
"The fields have convenience methods for creating prefix and range queries."
]
Expand All @@ -161,6 +162,7 @@
"outputs": [],
"source": [
"from datetime import date\n",
"from org.apache.lucene import geo\n",
"\n",
"docs = [\n",
" {\n",
Expand Down Expand Up @@ -196,16 +198,16 @@
"indexer.set('incorporated', engine.DateTimeField)\n",
"indexer.set('year-month-day', engine.NestedField, sep='-')\n",
"indexer.set('population', dimensions=1)\n",
"indexer.set('point', engine.SpatialField)\n",
"indexer.set('point', engine.ShapeField)\n",
"# assigned fields can have a different key from their underlying field name\n",
"indexer.fields['location'] = engine.NestedField('state.city')\n",
"\n",
"for doc in docs:\n",
" doc['year-month-day'] = doc['incorporated']\n",
" point = doc.pop('longitude'), doc.pop('latitude')\n",
" point = geo.Point(doc.pop('latitude'), doc.pop('longitude'))\n",
" location = doc['state'] + '.' + doc['city']\n",
" incorporated = map(int, doc.pop('incorporated').split('-'))\n",
" indexer.add(doc, location=location, incorporated=date(*incorporated), point=[point])\n",
" indexer.add(doc, location=location, incorporated=date(*incorporated), point=point)\n",
"indexer.commit()\n",
"\n",
"query = indexer.fields['incorporated'].prefix([1850])\n",
Expand Down Expand Up @@ -278,7 +280,7 @@
"source": [
"cities = ['San Francisco', 'Los Angeles', 'Portland']\n",
"for index, distance in enumerate([1e3, 1e5, 7e5, 1e6]):\n",
" query = indexer.fields['point'].within(-122.4, 37.7, distance=distance)\n",
" query = indexer.fields['point'].within(geo.Circle(37.7, -122.4, distance))\n",
" print([hit['city'] for hit in indexer.search(query)])"
]
},
Expand Down Expand Up @@ -531,7 +533,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
"version": "3.11.1 (main, Dec 23 2022, 09:28:24) [Clang 14.0.0 (clang-1400.0.29.202)]"
},
"vscode": {
"interpreter": {
"hash": "6ecdebf77f2ee3a47348d003f751c63e810ca996c1c68d1179f338200fa83b34"
}
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion lupyne/engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import lucene
from .analyzers import Analyzer, TokenFilter # noqa
from .queries import Query # noqa
from .documents import Document, Field, NestedField, DateTimeField, SpatialField # noqa
from .documents import Document, Field, NestedField, DateTimeField, ShapeField, SpatialField # noqa
from .indexers import IndexSearcher, MultiSearcher, IndexWriter, Indexer # noqa

version = tuple(map(int, lucene.VERSION.split('.')))
Expand Down
70 changes: 66 additions & 4 deletions lupyne/engine/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
import collections
import datetime
import operator
from typing import Callable, Iterator, Optional, Sequence
from typing import Callable, Iterator, Optional, Sequence, Union, no_type_check
import lucene # noqa
from java.lang import Long
from java.util import Arrays, HashSet
from org.apache.lucene import document, index, search, util
from org.apache.lucene import document, geo, index, search, util
from org.apache.lucene.search import grouping
from .queries import Query
from .utils import convert

FieldType = document.FieldType
QueryRelation = document.ShapeField.QueryRelation


class Field(FieldType): # type: ignore
Expand Down Expand Up @@ -211,7 +212,7 @@ def within(self, days=0, weeks=0, utc=True, **delta) -> Query:


class SpatialField(Field):
"""Geospatial points, indexed with optional docvalues."""
"""Deprecated: see `ShapeField`."""

def __init__(self, name: str, dimensions: int = 1, **settings):
super().__init__(name, dimensions=dimensions, **settings)
Expand All @@ -233,11 +234,72 @@ def within(self, lng: float, lat: float, distance: float) -> search.Query:
"""
return document.LatLonPoint.newDistanceQuery(self.name, lat, lng, distance)

def distances(self, lng: str, lat: str) -> search.Query:
def distances(self, lng: str, lat: str) -> search.SortField:
"""Return distance SortField."""
return document.LatLonDocValuesField.newDistanceSort(self.name, lat, lng)


class ShapeField:
    """Field which indexes geometries: LatLon or XY.

    Shapes which are instances of `geo.XYGeometry` are dispatched to the lucene XY classes;
    every other shape is dispatched to the LatLon classes.

    Args:
        name: field name passed to every lucene factory call
        indexed: whether `items` generates indexable fields
        docvalues: whether `items` generates a docvalue field
    """

    def __init__(self, name: str, indexed=True, docvalues=False):
        self.name = name
        # flags are normalized so any truthy or falsy value is accepted
        self.indexed = bool(indexed)
        self.docvalues = bool(docvalues)

    @staticmethod
    def as_tuple(shape: geo.Geometry) -> tuple:
        """Return the coordinates of a point, or a 1-tuple wrapping any other shape.

        The result is meant to be unpacked as trailing positional arguments.
        """
        if isinstance(shape, geo.Point):
            return (shape.lat, shape.lon)
        if isinstance(shape, geo.XYPoint):
            return (shape.x, shape.y)
        return (shape,)

    @no_type_check
    def items(self, *shapes: geo.Geometry) -> Iterator[document.Field]:
        """Generate lucene shape fields from geometries.

        For each shape, indexable fields are yielded first if `indexed` is set,
        followed by one docvalue field if `docvalues` is set.
        """
        for shape in shapes:
            if isinstance(shape, geo.XYGeometry):
                factory = document.XYShape
            else:
                factory = document.LatLonShape
            coords = self.as_tuple(shape)
            if self.indexed:
                yield from factory.createIndexableFields(self.name, *coords)
            if self.docvalues:
                yield factory.createDocValueField(self.name, *coords)

    def distances(self, point: Union[geo.Point, geo.XYPoint]) -> search.SortField:
        """Return distance SortField."""
        if isinstance(point, geo.XYGeometry):
            docvalues = document.XYDocValuesField
        else:
            docvalues = document.LatLonDocValuesField
        coords = self.as_tuple(point)
        return docvalues.newDistanceSort(self.name, *coords)

    @no_type_check
    def query(self, relation: QueryRelation, *shapes: geo.Geometry) -> search.Query:
        """Return shape query for the given relation.

        The lucene query factory is selected from the type of the first shape only;
        all shapes are then passed to that factory. At least one shape is required:
        an empty call fails with `IndexError` when the first shape is read.
        """
        first = shapes[0]
        factory = document.XYShape if isinstance(first, geo.XYGeometry) else document.LatLonShape
        if isinstance(first, (geo.Polygon, geo.XYPolygon)):
            builder = factory.newPolygonQuery
        elif isinstance(first, (geo.Circle, geo.XYCircle)):
            builder = factory.newDistanceQuery
        elif isinstance(first, (geo.Line, geo.XYLine)):
            builder = factory.newLineQuery
        else:
            # points and any other geometry use the generic geometry query
            builder = factory.newGeometryQuery
        return builder(self.name, relation, *shapes)

    def contains(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `contains` relation."""
        return self.query(QueryRelation.CONTAINS, *shapes)

    def disjoint(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `disjoint` relation."""
        return self.query(QueryRelation.DISJOINT, *shapes)

    def intersects(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `intersects` relation."""
        return self.query(QueryRelation.INTERSECTS, *shapes)

    def within(self, *shapes: geo.Geometry) -> search.Query:
        """Return shape query with `within` relation."""
        return self.query(QueryRelation.WITHIN, *shapes)


class Document(dict):
"""Multimapping of field names to values, but default getters return the first value."""

Expand Down
40 changes: 39 additions & 1 deletion tests/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import bisect
import datetime
import json
import math
import os
import pytest
import lucene
from org.apache.lucene import analysis, document, search, store, util
from org.apache.lucene import analysis, document, geo, search, store, util
from lupyne import engine

Q = engine.Query
Expand Down Expand Up @@ -422,6 +423,43 @@ def test_spatial(indexer, zipcodes):
assert 0 <= distances[ids[-1]] < distances[ids[0]] < 1e4


def test_shape(indexer, zipcodes):
    """Check `engine.ShapeField` queries, distance sorts, and field generation on 'CA' zipcodes."""
    for name in ('longitude', 'latitude'):
        indexer.set(name, dimensions=1, stored=True)
    latlon = indexer.set('latlon', engine.ShapeField)
    xy = indexer.set('xy', engine.ShapeField, indexed=False, docvalues=True)
    for doc in zipcodes:
        if doc['state'] != 'CA':
            continue
        point = doc['latitude'], doc['longitude']
        indexer.add(doc, latlon=geo.Point(*point), xy=geo.XYPoint(*point[::-1]))
    indexer.commit()
    # `point` intentionally outlives the loop: it is the last 'CA' zipcode that was added
    lat, lon = point
    count = indexer.count

    for relation, expected in [(latlon.contains, 0), (latlon.disjoint, 2647)]:
        assert count(relation(geo.Point(lat, lon))) == expected

    # line arguments: an array of latitudes and a matching array of longitudes
    line = [lat - 1, lat + 1], [lon, lon]
    for relation, expected in [(latlon.disjoint, 2647), (latlon.intersects, 0)]:
        assert count(relation(geo.Line(*line))) == expected

    circle = lat, lon, 1.0
    for relation, expected in [(latlon.disjoint, 2646), (latlon.intersects, 1), (latlon.within, 1)]:
        assert count(relation(geo.Circle(*circle))) == expected

    # triangle with one corner at the point, described as a closed GeoJSON ring of (lon, lat) pairs
    ring = [(lon, lat), (lon + 1, lat), (lon, lat + 1), (lon, lat)]
    (pg,) = geo.Polygon.fromGeoJSON(json.dumps({'type': 'Polygon', 'coordinates': [ring]}))
    expectations = [
        (latlon.contains, 0),
        (latlon.disjoint, 2639),
        (latlon.intersects, 8),
        (latlon.within, 8),
    ]
    for relation, expected in expectations:
        assert count(relation(pg)) == expected

    # the XY variants and the sorts are only checked for the type they return
    assert isinstance(xy.contains(geo.XYPoint(*point)), search.Query)
    assert isinstance(xy.disjoint(geo.XYLine(*line)), search.Query)
    assert isinstance(xy.intersects(geo.XYCircle(*circle)), search.Query)
    assert isinstance(latlon.distances(geo.Point(*point)), search.SortField)
    assert isinstance(xy.distances(geo.XYPoint(*point)), search.SortField)
    (field,) = latlon.items(pg)
    assert isinstance(field, document.Field)


def test_fields(indexer, constitution):
with pytest.raises(lucene.InvalidArgsError):
engine.Field('', stored='invalid')
Expand Down