Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Build index for free-text search over newspaper titles (TT-1516) #70

Merged
merged 5 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@ replay_pid*

application-local.yml
target
index-data
index-data-test
13 changes: 13 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<kotlin.version>1.9.23</kotlin.version>
<serialization.version>1.6.3</serialization.version>
<fasterxml.version>2.15.4</fasterxml.version>
<lucene.version>9.10.0</lucene.version>
<!-- Fasterxml version for latest spring boot here: https://docs.spring.io/spring-boot/docs/current/reference/html/dependency-versions.html -->
</properties>
<dependencies>
Expand Down Expand Up @@ -113,6 +114,18 @@
<version>0.2.0</version>
</dependency>

<!-- For building the search index -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>

<!-- Dependencies only for tests -->
<dependency>
<groupId>org.springframework.boot</groupId>
Expand Down
2 changes: 2 additions & 0 deletions src/main/kotlin/no/nb/bikube/BikubeApplication.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ package no.nb.bikube
import org.springframework.boot.autoconfigure.SpringBootApplication
import org.springframework.boot.context.properties.ConfigurationPropertiesScan
import org.springframework.boot.runApplication
import org.springframework.scheduling.annotation.EnableScheduling

/**
 * Spring Boot application class for Bikube.
 *
 * Besides the standard Spring Boot auto-configuration, the annotations enable
 * scanning for configuration-properties classes and turn on processing of
 * `@Scheduled` methods.
 */
@ConfigurationPropertiesScan
@SpringBootApplication
@EnableScheduling
class BikubeApplication

fun main(args: Array<String>) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ class CollectionsRepository(
)
}

fun getTitleByName(name: String): Mono<CollectionsModel> {
return searchTexts(
fun getAllNewspaperTitles(page: Int = 1): Mono<CollectionsModel> {
return getRecordsWebClientRequest(
"record_type=${CollectionsRecordType.WORK} and " +
"work.description_type=${CollectionsDescriptionType.SERIAL} and " +
"title=\"${name}\""
)
"work.description_type=${CollectionsDescriptionType.SERIAL}",
MariusLevang marked this conversation as resolved.
Show resolved Hide resolved
CollectionsDatabase.TEXTS,
limit = 50,
from = (page-1) * 50 + 1
).bodyToMono<CollectionsModel>()
}

fun searchPublisher(name: String): Mono<CollectionsNameModel> {
Expand Down Expand Up @@ -81,14 +83,21 @@ class CollectionsRepository(
return getRecordsWebClientRequest(query, db).bodyToMono<CollectionsTermModel>()
}

private fun getRecordsWebClientRequest(query: String, db: CollectionsDatabase): WebClient.ResponseSpec {
private fun getRecordsWebClientRequest(
query: String,
db: CollectionsDatabase,
limit: Int = 10,
from: Int = 1
): WebClient.ResponseSpec {
return webClient()
.get()
.uri {
it
.queryParam("database", db.value)
.queryParam("output", "json")
.queryParam("search", query)
.queryParam("limit", limit)
.queryParam("startfrom", from)
.build()
}
.retrieve()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import no.nb.bikube.core.model.CatalogueRecord
import no.nb.bikube.core.model.Item
import no.nb.bikube.core.model.Title
import no.nb.bikube.newspaper.service.NewspaperService
import no.nb.bikube.newspaper.service.TitleIndexService
import org.springframework.http.MediaType
import org.springframework.http.ResponseEntity
import org.springframework.web.bind.annotation.GetMapping
Expand All @@ -33,7 +34,8 @@ import java.time.LocalDate
@Tag(name = "Catalogue objects", description = "Endpoints related to catalog data for all text material")
@RequestMapping("")
class CoreController (
private val newspaperService: NewspaperService
private val newspaperService: NewspaperService,
private val titleIndexService: TitleIndexService
){
companion object {
const val DATE_REGEX = "^(17|18|19|20)\\d{2}(-)?(0[1-9]|1[0-2])(-)?(0[1-9]|[12][0-9]|3[01])$"
Expand Down Expand Up @@ -86,10 +88,10 @@ class CoreController (
fun searchTitle(
@RequestParam searchTerm: String,
@RequestParam materialType: MaterialType
): ResponseEntity<Flux<CatalogueRecord>> {
): ResponseEntity<List<CatalogueRecord>> {
if (searchTerm.isEmpty()) throw BadRequestBodyException("Search term cannot be empty.")
return when(materialTypeToCatalogueName(materialType)) {
CatalogueName.COLLECTIONS -> ResponseEntity.ok(newspaperService.searchTitleByName(searchTerm))
CatalogueName.COLLECTIONS -> ResponseEntity.ok(titleIndexService.searchTitle(searchTerm))
else -> throw NotSupportedException("Material type $materialType is not supported.")
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import jakarta.validation.ConstraintViolationException
import no.nb.bikube.core.exception.BadRequestBodyException
import no.nb.bikube.core.exception.NotSupportedException
import no.nb.bikube.core.exception.RecordAlreadyExistsException
import no.nb.bikube.core.exception.SearchIndexNotAvailableException
import no.nb.bikube.core.util.logger
import org.springframework.http.HttpStatus
import org.springframework.http.ProblemDetail
Expand Down Expand Up @@ -58,6 +59,17 @@ class GlobalControllerExceptionHandler {

return problemDetail
}

/**
 * Translates a [SearchIndexNotAvailableException] into an HTTP 503 problem detail.
 *
 * @param exception the exception raised while the search index could not be used
 * @return a [ProblemDetail] with status SERVICE_UNAVAILABLE and the default properties added
 */
@ExceptionHandler
fun handlerSearchIndexNotAvailableException(exception: SearchIndexNotAvailableException): ProblemDetail {
    logger().error("SearchIndexNotAvailableException occurred")

    return ProblemDetail.forStatus(HttpStatus.SERVICE_UNAVAILABLE).apply {
        detail = "The search index is unavailable"
        addDefaultProperties()
    }
}
}

fun ProblemDetail.addDefaultProperties() {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package no.nb.bikube.core.exception

/**
 * Signals that the title search index cannot be queried at the moment.
 *
 * Carries no message and no cause.
 */
class SearchIndexNotAvailableException : Exception()
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import no.nb.bikube.core.model.Title
import no.nb.bikube.core.model.inputDto.TitleInputDto
import no.nb.bikube.core.util.logger
import no.nb.bikube.newspaper.service.NewspaperService
import no.nb.bikube.newspaper.service.TitleIndexService
import org.springframework.http.HttpStatus
import org.springframework.http.MediaType
import org.springframework.http.ResponseEntity
Expand All @@ -27,7 +28,8 @@ import reactor.core.publisher.Mono
@Tag(name="Newspaper titles", description="Endpoints related to newspaper titles.")
@RequestMapping("/newspapers/titles")
class TitleController (
private val newspaperService: NewspaperService
private val newspaperService: NewspaperService,
private val titleIndexService: TitleIndexService
) {
@PostMapping("/", produces = [MediaType.APPLICATION_JSON_VALUE])
@Operation(summary = "Create a newspaper title")
Expand Down Expand Up @@ -68,7 +70,10 @@ class TitleController (

return Mono.`when`(publisherMono, locationMono, languageMono)
.then(newspaperService.createNewspaperTitle(title))
.map { createdTitle -> ResponseEntity.status(HttpStatus.CREATED).body(createdTitle) }
.map { createdTitle ->
titleIndexService.addTitle(createdTitle)
ResponseEntity.status(HttpStatus.CREATED).body(createdTitle)
}
.doOnSuccess { responseEntity ->
logger().info("Newspaper title created with id: ${responseEntity.body?.catalogueId}")
}
Expand Down
19 changes: 15 additions & 4 deletions src/main/kotlin/no/nb/bikube/newspaper/service/NewspaperService.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import reactor.core.publisher.Flux
import reactor.core.publisher.Mono
import reactor.core.publisher.SynchronousSink
import reactor.kotlin.core.publisher.toMono
import reactor.util.function.Tuple2
import java.time.LocalDate
import java.time.format.DateTimeFormatter

Expand Down Expand Up @@ -95,10 +96,20 @@ class NewspaperService (
}
}

fun searchTitleByName(name: String): Flux<CatalogueRecord> {
return collectionsRepository.getTitleByName(name)
.flatMapIterable { it.getObjects() ?: emptyList() }
.map { mapCollectionsObjectToGenericTitle(it) }
/**
 * Fetches one page of newspaper titles from the catalogue.
 *
 * @param pageNumber the page to request from the repository
 * @return a tuple of the mapped titles on that page and [pageNumber] itself;
 *         the Mono completes empty when the response carries no object list
 */
fun getTitlesPage(pageNumber: Int): Mono<Tuple2<List<Title>, Int>> {
    val titlesOnPage = collectionsRepository
        .getAllNewspaperTitles(pageNumber)
        .mapNotNull { response ->
            response.getObjects()?.map { record -> mapCollectionsObjectToGenericTitle(record) }
        }
    // Keep the page number next to its content so the caller can ask for the following page.
    val requestedPage = Mono.just(pageNumber)
    return Mono.zip(titlesOnPage, requestedPage)
}

/**
 * Collects every newspaper title by walking the catalogue page by page, starting at page 1.
 *
 * Paging stops when a page yields no result (empty Mono) or when a page comes
 * back with an empty title list; the latter guard prevents requesting pages
 * forever if the catalogue answers an out-of-range page with an empty list.
 *
 * @return all titles from all pages, in page order
 */
fun getAllTitles(): Mono<List<Title>> {
    return getTitlesPage(1)
        .expand { page ->
            if (page.t1.isEmpty()) {
                Mono.empty<Tuple2<List<Title>, Int>>()
            } else {
                getTitlesPage(page.t2 + 1)
            }
        }
        .flatMapIterable { it.t1 }
        .collectList()
}

fun getItemsByTitle(
Expand Down
161 changes: 161 additions & 0 deletions src/main/kotlin/no/nb/bikube/newspaper/service/TitleIndexService.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package no.nb.bikube.newspaper.service

import no.nb.bikube.core.exception.SearchIndexNotAvailableException
import no.nb.bikube.core.model.Title
import no.nb.bikube.core.util.logger
import org.apache.lucene.analysis.core.LowerCaseFilterFactory
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory
import org.apache.lucene.analysis.custom.CustomAnalyzer
import org.apache.lucene.document.Document
import org.apache.lucene.document.Field
import org.apache.lucene.document.StoredField
import org.apache.lucene.document.TextField
import org.apache.lucene.index.IndexWriter
import org.apache.lucene.index.IndexWriterConfig
import org.apache.lucene.index.Term
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.BooleanQuery
import org.apache.lucene.search.SearcherManager
import org.apache.lucene.search.WildcardQuery
import org.apache.lucene.store.FSDirectory
import org.springframework.beans.factory.annotation.Value
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Service
import java.nio.file.Paths
import java.time.LocalDate
import java.util.concurrent.atomic.AtomicInteger

/**
 * Free-text search index over newspaper titles.
 *
 * Which implementation is active is decided by the `search-index.enabled` property.
 */
interface TitleIndexService {
/** Rebuilds the index from all newspaper titles. */
fun indexAllTitles()
/** Adds a single [title] to the index. */
fun addTitle(title: Title)
/** Returns the titles matching [query]. */
fun searchTitle(query: String): List<Title>
}

/**
 * Lucene-backed [TitleIndexService], active when `search-index.enabled` is `true`.
 *
 * The index lives on disk at `search-index.path`. Title names are tokenized on
 * whitespace and lower-cased; all other title attributes are only stored so a
 * [Title] can be rebuilt from a search hit.
 */
@ConditionalOnProperty(
    prefix = "search-index",
    name = ["enabled"],
    havingValue = "true"
)
@Service
class TitleIndexServiceImpl(
    private val newspaperService: NewspaperService,
    @Value("\${search-index.path}") private val searchIndexPath: String
): TitleIndexService {
    private val titleAnalyzer = CustomAnalyzer.builder()
        .withTokenizer(WhitespaceTokenizerFactory.NAME)
        .addTokenFilter(LowerCaseFilterFactory.NAME)
        .build()

    private val indexWriter = IndexWriter(
        FSDirectory.open(Paths.get(searchIndexPath)),
        IndexWriterConfig(titleAnalyzer)
    )

    private val searcherManager = SearcherManager(indexWriter, null)

    // Holds the ordinal of the current IndexStatus.
    private val indexStatus = AtomicInteger(IndexStatus.UNINITIALIZED.ordinal)

    // Compiled once instead of on every search.
    private val whitespace = Regex("\\s+")

    /**
     * Converts [title] into a Lucene document.
     *
     * @return the document, or `null` when the title has no name (nothing to search on)
     */
    private fun makeDocument(title: Title): Document? {
        val name = title.name ?: return null
        val document = Document()
        document.add(TextField("name", name, Field.Store.YES))
        document.add(StoredField("catalogueId", title.catalogueId))
        title.startDate ?. let { document.add(StoredField("startDate", it.toString())) }
        title.endDate ?. let { document.add(StoredField("endDate", it.toString())) }
        title.publisher ?. let { document.add(StoredField("publisher", it)) }
        title.publisherPlace ?. let { document.add(StoredField("publisherPlace", it)) }
        title.language ?. let { document.add(StoredField("language", it)) }
        title.materialType ?. let { document.add(StoredField("materialType", it)) }
        return document
    }

    /**
     * Fetches all titles and replaces the whole index content with them.
     *
     * Runs on the schedule given by `search-index.initial-delay` and
     * `search-index.rebuild-index-delay`. The call returns as soon as the fetch
     * is subscribed; the index is rewritten when the fetch completes.
     * A run is skipped while a previous rewrite is still in progress.
     */
    @Scheduled(
        initialDelayString = "\${search-index.initial-delay}",
        fixedDelayString = "\${search-index.rebuild-index-delay}"
    )
    override fun indexAllTitles() {
        if (indexStatus.get() == IndexStatus.INDEXING.ordinal)
            return
        logger().debug("Start fetching all titles to index...")
        newspaperService.getAllTitles()
            .map { titles ->
                titles.mapNotNull { makeDocument(it) }
            }
            .doOnSuccess { documents ->
                indexStatus.set(IndexStatus.INDEXING.ordinal)
                try {
                    indexWriter.deleteAll()
                    indexWriter.addDocuments(documents)
                    indexWriter.commit()
                    searcherManager.maybeRefresh()
                    indexStatus.set(IndexStatus.READY.ordinal)
                    logger().info("Titles index ready")
                } catch (e: Exception) {
                    // Never stay in INDEXING after a failure: that would block every
                    // later rebuild. The index content is unknown at this point, so
                    // searches are refused until a rebuild succeeds.
                    indexStatus.set(IndexStatus.UNINITIALIZED.ordinal)
                    throw e
                }
            }
            .subscribe(
                { },
                { failure -> logger().error("Rebuilding the titles index failed", failure) }
            )
    }

    /**
     * Adds [title] to the index and makes it visible to searches.
     *
     * A title without a name cannot be indexed and is skipped.
     */
    override fun addTitle(title: Title) {
        logger().debug("Adding title ${title.name} to index")
        val document = makeDocument(title) ?: return
        indexWriter.addDocument(document)
        indexWriter.commit()
        searcherManager.maybeRefresh()
    }

    /**
     * Finds up to 50 titles whose name contains every whitespace-separated term of [query].
     *
     * Terms are lower-cased and matched as substrings of the indexed name tokens.
     *
     * @throws SearchIndexNotAvailableException when the index is not in the READY state
     */
    @Throws(SearchIndexNotAvailableException::class)
    override fun searchTitle(query: String): List<Title> {
        if (indexStatus.get() != IndexStatus.READY.ordinal)
            throw SearchIndexNotAvailableException()
        val queryBuilder = BooleanQuery.Builder()
        query.split(whitespace).forEach {
            queryBuilder.add(
                WildcardQuery(Term("name", "*${it.lowercase()}*")),
                BooleanClause.Occur.MUST
            )
        }

        val q = queryBuilder.build()
        logger().debug("Title search: {}", q)
        val indexSearcher = searcherManager.acquire()
        try {
            val storedFields = indexSearcher.storedFields()
            return indexSearcher.search(q, 50)
                .scoreDocs
                .map { storedFields.document(it.doc) }
                .map { doc ->
                    Title(
                        catalogueId = doc.get("catalogueId"),
                        name = doc.get("name"),
                        startDate = doc.get("startDate") ?. let { LocalDate.parse(it) },
                        endDate = doc.get("endDate") ?. let { LocalDate.parse(it) },
                        publisher = doc.get("publisher"),
                        publisherPlace = doc.get("publisherPlace"),
                        language = doc.get("language"),
                        materialType = doc.get("materialType")
                    )
                }
        } finally {
            // Every acquire() must be paired with a release(), otherwise the
            // underlying reader is never freed.
            searcherManager.release(indexSearcher)
        }
    }

    /** Periodically reopens the searcher, on the `search-index.searcher-refresh-delay` schedule. */
    @Scheduled(fixedDelayString = "\${search-index.searcher-refresh-delay}")
    fun refresh() {
        searcherManager.maybeRefresh()
    }
}

/**
 * No-op [TitleIndexService] used when the search index is switched off.
 *
 * Active when `search-index.enabled` is `false`, and also when the property is
 * not set at all, so that a [TitleIndexService] bean always exists for the
 * controllers that depend on it.
 */
@ConditionalOnProperty(
    prefix = "search-index",
    name = ["enabled"],
    havingValue = "false",
    matchIfMissing = true
)
@Service
class TitleIndexServiceDisabledImpl: TitleIndexService {
    /** Does nothing. */
    override fun indexAllTitles() {}

    /** Does nothing. */
    override fun addTitle(title: Title) {}

    /** Always returns an empty list. */
    override fun searchTitle(query: String): List<Title> = emptyList()
}

/**
 * Lifecycle state of the title search index.
 *
 * TitleIndexServiceImpl keeps the current state as the constant's ordinal, so
 * the declaration order below is significant.
 */
enum class IndexStatus {
// Initial state, before any index build has completed.
UNINITIALIZED,
// Set while the index content is being replaced.
INDEXING,
// Set after a rebuild has been committed; the only state in which searches are served.
READY
}
7 changes: 7 additions & 0 deletions src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,10 @@ alma:
http-proxy:
host: ""
port: 0

search-index:
enabled: true
path: index-data
initial-delay: 1000
rebuild-index-delay: 1800000
searcher-refresh-delay: 600000
Loading
Loading