Skip to content

Commit

Permalink
feat: Switch to positional postings list based FTS index (#6)
Browse files Browse the repository at this point in the history
- Add a new `Posting` class that stores index information for a token, including the document ID, property name, property length, and positional offsets
- Add IR functions to extract postings from a document
- Modify the scoring algorithm to use postings

Fixes #5
  • Loading branch information
haroldadmin authored Jan 30, 2022
1 parent 2fb32f1 commit c8e1f60
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 219 deletions.
164 changes: 71 additions & 93 deletions core/src/main/kotlin/FtsIndex.kt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package com.haroldadmin.lucilla.core

import com.haroldadmin.lucilla.core.rank.documentFrequency
import com.haroldadmin.lucilla.core.rank.termFrequency
import com.haroldadmin.lucilla.core.rank.tfIdf
import com.haroldadmin.lucilla.ir.Posting
import com.haroldadmin.lucilla.ir.extractDocumentId
import com.haroldadmin.lucilla.ir.extractPostings
import com.haroldadmin.lucilla.ir.extractProperties
import com.haroldadmin.lucilla.ir.extractTokens
import com.haroldadmin.lucilla.pipeline.Pipeline
Expand Down Expand Up @@ -32,14 +32,6 @@ public data class SearchResult(
val matchTerm: String,
)

/**
* Alias for [Map] that models a token's frequency of appearance in documents
*
* Key = Document ID
* Value = Frequency of the token in the document
*/
internal typealias DocumentFrequencies = MutableMap<Int, Int>

/**
* Alias for a token in a document
*/
Expand All @@ -58,25 +50,25 @@ internal typealias Token = String
* ```json
* {
* "<some-token>": {
* "name": {
* "<doc-id>": "<frequency>",
* },
* "author": {
* "<doc-id>": "<frequency>"
* }
* "name": [
* { "<doc-id>": "<offsets>", "<length>" }
* ],
* "author": [
* { "<doc-id>": "<offsets>", "<length>" }
* ]
* }
* }
* ```
* ```
*/
internal typealias InvertedIndex = Map<Token, Map<String, DocumentFrequencies>>
internal typealias InvertedIndex = Map<Token, List<Posting>>

/**
* Mutable variant of [InvertedIndex].
*
* Backed by a [PatriciaTrie] for efficient space utilisation.
*/
internal typealias MutableInvertedIndex = Trie<Token, MutableMap<String, DocumentFrequencies>>
internal typealias MutableInvertedIndex = Trie<Token, MutableList<Posting>>

/**
* A Full Text Search index for fast and efficient
Expand Down Expand Up @@ -112,13 +104,24 @@ public class FtsIndex<DocType : Any>(
* same score. To distinguish between the quality of match between the two,
* we divide their score by the number of tokens in each doc's properties.
*/
private val docs: MutableMap<Int, Map<String, Int>> = mutableMapOf()
private val _docs: MutableSet<Int> = mutableSetOf()

/**
* Map of the average lengths of each property in [DocType]
*/
private val averageLengths: MutableMap<String, Int> = mutableMapOf()

/**
* Public read-only view of the IDs of the documents currently in the index
*/
public val docs: Set<Int>
get() = _docs

/**
* Number of documents in the index
*/
public val size: Int
get() = docs.size
get() = _docs.size

/**
* Number of tokens in the index
Expand All @@ -137,8 +140,7 @@ public class FtsIndex<DocType : Any>(
* 2. Converts each extracted property to its value as a string
* 3. Runs the string value through the Text Processing [Pipeline] to extract tokens
* from the document.
* 4. Add each token as a key to the index, with the value set to its property
* specific document frequencies
* 4. Calculates the [Posting] for each token and appends it to that token's posting list in the index
*
* Returns without modifying the index if a document with the given ID is already
* present in the index.
Expand All @@ -148,36 +150,26 @@ public class FtsIndex<DocType : Any>(
*/
public fun add(doc: DocType): Int {
val docId = extractDocumentId(doc)
if (docId in docs) {
if (docId in _docs) {
return 0
}
_docs.add(docId)

val docProps = extractProperties(doc)
val propsToTokens = docProps.associate { prop ->
val propValue = prop.call(doc)
val propTokens = extractTokens(propValue, pipeline)
prop.name to propTokens
}
val docProperties = extractProperties(doc)
val tokensToPostings = extractPostings(doc, pipeline, docId, docProperties)

var addedTokens = 0
for ((prop, tokens) in propsToTokens) {
for (token in tokens) {
val propsForToken = _index[token] ?: mutableMapOf<String, DocumentFrequencies>().also { addedTokens++ }
val docFrequenciesForProp = propsForToken[prop] ?: mutableMapOf()
val tokenFrequency = docFrequenciesForProp.getOrDefault(docId, 0) + 1

docFrequenciesForProp[docId] = tokenFrequency
propsForToken[prop] = docFrequenciesForProp
_index[token] = propsForToken
}
for ((token, posting) in tokensToPostings) {
val postingList = _index[token] ?: mutableListOf<Posting>().also { addedTokens++ }
postingList.add(posting)
_index[token] = postingList

val property = posting.property
val totalPropLength = averageLengths.getOrDefault(property, 0) * docs.size
val averagePropLength = totalPropLength / docs.size
averageLengths[property] = averagePropLength
}

val propLengths = mutableMapOf<String, Int>()
for ((prop, tokens) in propsToTokens) {
propLengths[prop] = tokens.size
}
docs[docId] = propLengths

return addedTokens
}

Expand All @@ -196,50 +188,28 @@ public class FtsIndex<DocType : Any>(
*/
public fun remove(doc: DocType) {
val docId = extractDocumentId(doc)
if (docId !in docs) {
if (docId !in _docs) {
return
}
_docs.remove(docId)

val docProps = extractProperties(doc)
val docTokens = docProps
.map { prop ->
val propValue = prop.call(doc)
extractTokens(propValue, pipeline)
}
val docTokens = extractProperties(doc)
.map { prop -> extractTokens(prop.call(doc), pipeline) }
.flatten()

for (token in docTokens) {
val tokenProps = _index[token]
if (tokenProps == null || tokenProps.isEmpty()) {
val postingList = _index[token]
if (postingList == null || postingList.isEmpty()) {
_index.remove(token)
continue
}

val propsToRemove = mutableListOf<String>()
for ((prop, documentFrequencies) in tokenProps) {
val existingFrequency = documentFrequencies.getOrDefault(docId, 0)
val newFrequency = existingFrequency - 1
documentFrequencies[docId] = newFrequency

if (newFrequency < 1) {
documentFrequencies.remove(docId)
}

if (documentFrequencies.isEmpty()) {
propsToRemove.add(prop)
}
}

for (prop in propsToRemove) {
tokenProps.remove(prop)
}
postingList.removeIf { it.docId == docId }

if (tokenProps.isEmpty()) {
if (postingList.isEmpty()) {
_index.remove(token)
}
}

docs.remove(docId)
}

/**
Expand All @@ -260,14 +230,12 @@ public class FtsIndex<DocType : Any>(

val results = mutableListOf<SearchResult>()
for (queryToken in queryTokens) {
val matchingProps = _index[queryToken] ?: continue
for ((prop, docFrequencies) in matchingProps) {
for (docId in docFrequencies.keys) {

val score = score(queryToken, docId, prop)
val result = SearchResult(docId, score, queryToken)
results.add(result)
}
val postingList = _index[queryToken] ?: continue
val postingListToDocs = postingList.groupBy { it.docId }
for ((docId, docPostingList) in postingListToDocs) {
val score = score(docPostingList)
val result = SearchResult(docId, score, queryToken)
results.add(result)
}
}

Expand All @@ -279,24 +247,34 @@ public class FtsIndex<DocType : Any>(
* Clears all documents added to the FTS index
*/
public fun clear() {
this.docs.clear()
this._docs.clear()
this._index.clear()
}

/**
* Calculates the match score for a term in a document using the
* TF-IDF algorithm
* TF-IDF algorithm.
*
* @param term The term whose TF-IDF value must be calculated
* @param docId The document in which the term appears
* The given postings must all be for the same document.
*
* @param postingList The posting list for the term for a specific document
* @return The TF-IDF value for the term in the document, divided by a property-length normalization factor
*/
private fun score(term: String, docId: Int, prop: String): Double {
val propLength = docs[docId]?.get(prop) ?: 0
val tf = termFrequency(term, docId, propLength, _index, prop)
val df = documentFrequency(term, _index, prop)
val n = docs.size
return tfIdf(tf, df, n)
private fun score(postingList: List<Posting>): Double {
    // NOTE(review): this is the number of postings passed in (all for one document),
    // not the number of documents containing the term. Confirm this is the intended
    // `df`; computing the true document frequency would need the term's full posting list.
    val df = postingList.size
    val n = _docs.size

    // Raw term frequency: total number of positional offsets across the postings.
    val tf = postingList.sumOf { it.offsets.size }
    // Combined length of every property in which the term appears.
    val propLength = postingList.sumOf { it.propertyLength }
    // Combined average length of those same properties (0 when no average is recorded).
    val averageLength = postingList.sumOf { averageLengths.getOrDefault(it.property, 0) }

    // Nothing to normalise against; avoid dividing by zero below.
    if (propLength == 0) {
        return 0.0
    }

    // Divide as Double. Integer division would truncate to 0 whenever
    // propLength <= averageLength, turning the final division into Infinity/NaN.
    // The (1 + averageLength) denominator keeps the ratio defined when no
    // average has been recorded yet.
    val normalizationFactor = propLength.toDouble() / (1 + averageLength)
    return tfIdf(tf, df, n) / normalizationFactor
}
}

Expand Down
56 changes: 11 additions & 45 deletions core/src/main/kotlin/rank/TfIdf.kt
Original file line number Diff line number Diff line change
@@ -1,55 +1,21 @@
package com.haroldadmin.lucilla.core.rank

import com.haroldadmin.lucilla.core.InvertedIndex
import kotlin.math.ln

/**
* Calculates the frequency of the term in the document with the given ID
*
* This method uses the "scaled frequency" variant of the term-frequency metric.
* - Raw frequency is the number of times the term appears in the document.
* - Scaled frequency adjusts for the raw frequency by dividing it with the length of the
* document
*
* @param term The term to find the frequency of
* @param docId The document to find the term's frequency in
* @param docLength The length of the tokens in the document
* @param index The FTS index
* @param prop The name of the document property to fetch document frequencies for
* @return The scaled frequency of the term in the document
*/
internal fun termFrequency(
term: String,
docId: Int,
docLength: Int,
index: InvertedIndex,
prop: String,
): Double {
val propsWithTerm = index[term] ?: return 0.0
val docsWithTerm = propsWithTerm[prop] ?: return 0.0
val rawTf = docsWithTerm[docId] ?: return 0.0
return rawTf.toDouble() / docLength
}

/**
* Returns the number of documents in which a term appears
* Calculates the TF-IDF value for a term.
*
* This method finds only the documents containing the exact term. Documents
* that contain words with the given term as a prefix are not considered.
* This method uses the logarithmic IDF variant.
* The denominator is adjusted to be (1 + df) to avoid division
* by zero errors.
*
* @param term The term to find the document frequency of
* @param index The FTS index
* @param prop The prop to fetch docs for
* @return The number of documents in which the term appears
* @param tf The term frequency in a document
* @param df The number of documents in which the term appears
* @param n The number of documents in the index
* @return TF-IDF value for the term
*/
internal fun documentFrequency(
term: String,
index: InvertedIndex,
prop: String,
): Int {
val propsWithTerm = index[term] ?: return 0
val docsWithTerm = propsWithTerm[prop] ?: return 0
return docsWithTerm.keys.size
internal fun tfIdf(tf: Double, df: Int, n: Int): Double {
    // Add one to the document frequency so the ratio below never divides by zero.
    val smoothedDf = 1 + df
    // Logarithmic inverse document frequency.
    val idf = ln(n.toDouble() / smoothedDf)
    return tf * idf
}

/**
Expand All @@ -64,6 +30,6 @@ internal fun documentFrequency(
* @param n The number of documents in the index
* @return TF-IDF value for the term
*/
internal fun tfIdf(tf: Double, df: Int, n: Int): Double {
internal fun tfIdf(tf: Int, df: Int, n: Int): Double {
return tf * (ln(n.toDouble() / (1 + df)))
}
7 changes: 1 addition & 6 deletions core/src/test/kotlin/FtsIndexTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,7 @@ class FtsIndexTest : DescribeSpec({
val books = generateBooks().take(10).toList()
books.forEach { fts.add(it) }

val docIds = fts.index.values
.flatMap { propDocs -> propDocs.values }
.flatMap { docFreqs -> docFreqs.keys }
.toSet()

docIds shouldContainExactlyInAnyOrder books.map { b -> b.id }
fts.docs shouldContainExactlyInAnyOrder books.map { b -> b.id }
}

it("should throw an error when adding a document with no '@Id' annotated element") {
Expand Down
Loading

0 comments on commit c8e1f60

Please sign in to comment.