-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
54027a7
commit d0e834e
Showing
125 changed files
with
3,220 additions
and
1,341 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,80 @@ | ||
#pragma once | ||
|
||
#include <math.h> | ||
#include <algorithm> | ||
|
||
namespace reindexer { | ||
// Coefficients consumed by bm25score():
//  - kKeofBm25k1 scales the numerator as (k1 + 1) and weights the document-length term in the denominator;
//  - kKeofBm25b blends the constant part (1 - b) with the relative document length (b * wordsInDoc / avgDocLen).
// Declared constexpr so they are compile-time constants; linkage stays internal, as before.
static constexpr double kKeofBm25k1 = 2.0;
static constexpr double kKeofBm25b = 0.75;
|
||
// Inverse document frequency, normalised by log(1 + totalDocCount) and floored at 0.2.
//
// totalDocCount   - number of documents in the collection.
// matchedDocCount - number of documents containing the term.
//
// Returns log((total - matched + 1) / matched) / log(1 + total), but never less than 0.2.
// A NaN intermediate result (e.g. matched > total + 1 makes the logarithm argument negative)
// is also replaced by the 0.2 floor instead of leaking into the final score.
inline double IDF(double totalDocCount, double matchedDocCount) {
	const double f = log((totalDocCount - matchedDocCount + 1) / matchedDocCount) / log(1 + totalDocCount);
	// Saturate min to 0.2. The test is written positively on purpose: every comparison with NaN
	// is false, so `f < 0.2` would let NaN through, while `f >= 0.2` routes it to the floor.
	return f >= 0.2 ? f : 0.2;
}
|
||
// Term frequency: the raw number of occurrences of the term in the document.
// The second and third arguments (most frequent word count, document length in words)
// are accepted but do not influence the result.
inline double TF(double termCountInDoc, double /*mostFreqWordCountInDoc*/, double /*wordsInDoc*/) { return termCountInDoc; }
|
||
inline double bm25score(double termCountInDoc, double mostFreqWordCountInDoc, double wordsInDoc, double avgDocLen) { | ||
auto termFreq = TF(termCountInDoc, mostFreqWordCountInDoc, wordsInDoc); | ||
return termFreq * (kKeofBm25k1 + 1.0) / (termFreq + kKeofBm25k1 * (1.0 - kKeofBm25b + kKeofBm25b * wordsInDoc / avgDocLen)); | ||
} | ||
|
||
class Bm25Rx { | ||
public: | ||
Bm25Rx(double totalDocCount, double matchedDocCount, double k1, double b) noexcept | ||
: k1_(k1), b_(b), idf_(IDF(totalDocCount, matchedDocCount)) {} | ||
|
||
RX_ALWAYS_INLINE double Get(double termCountInDoc, double wordsInDoc, double avgDocLen) const noexcept { | ||
auto termFreq = TF(termCountInDoc, wordsInDoc); | ||
return idf_ * termFreq * (k1_ + 1.0) / (termFreq + k1_ * (1.0 - b_ + b_ * wordsInDoc / avgDocLen)); | ||
} | ||
RX_ALWAYS_INLINE double GetIDF() const noexcept { return idf_; } | ||
|
||
private: | ||
static RX_ALWAYS_INLINE double IDF(double totalDocCount, double matchedDocCount) noexcept { | ||
double f = log((totalDocCount - matchedDocCount + 1) / matchedDocCount) / log(1 + totalDocCount); | ||
// saturate min to 0.2 | ||
if (f < 0.2) f = 0.2; | ||
return f; | ||
} | ||
static RX_ALWAYS_INLINE double TF(double termCountInDoc, double wordsInDoc) noexcept { | ||
(void)wordsInDoc; | ||
return termCountInDoc; | ||
} | ||
|
||
const double k1_; | ||
const double b_; | ||
const double idf_; | ||
}; | ||
|
||
class Bm25Classic { | ||
public: | ||
Bm25Classic(double totalDocCount, double matchedDocCount, double k1, double b) noexcept | ||
: k1_(k1), b_(b), idf_(IDF(totalDocCount, matchedDocCount)) {} | ||
|
||
RX_ALWAYS_INLINE double Get(double termCountInDoc, double wordsInDoc, double avgDocLen) const { | ||
auto termFreq = TF(termCountInDoc, wordsInDoc); | ||
return idf_ * termFreq * (k1_ + 1.0) / (termFreq + k1_ * (1.0 - b_ + b_ * wordsInDoc / avgDocLen)); | ||
} | ||
RX_ALWAYS_INLINE double GetIDF() const noexcept { return idf_; } | ||
|
||
private: | ||
static RX_ALWAYS_INLINE double IDF(double totalDocCount, double matchedDocCount) noexcept { | ||
return log(totalDocCount / (matchedDocCount + 1)) + 1; | ||
} | ||
static RX_ALWAYS_INLINE double TF(double termCountInDoc, double wordsInDoc) noexcept { return termCountInDoc / wordsInDoc; } | ||
|
||
const double k1_; | ||
const double b_; | ||
const double idf_; | ||
}; | ||
|
||
class TermCount { | ||
public: | ||
TermCount(double /*totalDocCount*/, double /*matchedDocCount*/, double /*k1*/, double /*b*/) noexcept {} | ||
|
||
RX_ALWAYS_INLINE double Get(double termCountInDoc, double /*wordsInDoc*/, double /*avgDocLen*/) const noexcept { | ||
return termCountInDoc; | ||
} | ||
RX_ALWAYS_INLINE double GetIDF() const noexcept { return 0.0; } | ||
}; | ||
|
||
template <typename BM> | ||
class Bm25Calculator { | ||
public: | ||
Bm25Calculator(double totalDocCount, double matchedDocCount, double k1, double b) : bm_(totalDocCount, matchedDocCount, k1, b) {} | ||
RX_ALWAYS_INLINE double Get(double termCountInDoc, double wordsInDoc, double avgDocLen) const { | ||
return bm_.Get(termCountInDoc, wordsInDoc, avgDocLen); | ||
} | ||
RX_ALWAYS_INLINE double GetIDF() const noexcept { return bm_.GetIDF(); } | ||
|
||
private: | ||
const BM bm_; | ||
}; | ||
|
||
} // namespace reindexer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.