Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix bug with BM 25 transformer - need to fit additional parameter #18

Merged
merged 4 commits into from
Jan 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/abstract_text_transformer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,17 @@ function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
high = round(Int, transformer.max_doc_freq * n)
low = round(Int, transformer.min_doc_freq * n)
new_dtm, vocab = limit_features(dtm_matrix, high, low)
new_doc_term_mat, vocab = limit_features(dtm_matrix, high, low)
else
new_dtm = dtm_matrix.dtm
new_doc_term_mat = dtm_matrix.dtm
vocab = dtm_matrix.terms
end

# calculate IDF
idf = compute_idf(transformer.smooth_idf, new_dtm)
idf = compute_idf(transformer.smooth_idf, new_doc_term_mat)

# prepare result
fitresult = get_result(transformer, idf, vocab)
fitresult = get_result(transformer, idf, vocab, new_doc_term_mat)
cache = nothing

return fitresult, cache, NamedTuple()
Expand Down
2 changes: 0 additions & 2 deletions src/bagofwords_transformer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
BagOfWordsTransformer()

Convert a collection of raw documents to a matrix representing a bag-of-words structure.

Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
a count of every word in the document corpus/collection for every document. This is a simple
but often quite powerful way of representing documents as vectors. The resulting representation is
Expand All @@ -12,7 +11,6 @@ document.

Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
to words occurring in a maximum or minimum portion of documents.

The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
that the transformer will consider. `max_doc_freq` indicates that terms in only
up to the specified percentage of documents will be considered. For example, if
Expand Down
21 changes: 14 additions & 7 deletions src/bm25_transformer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,22 @@ end
# Fit result for `BM25Transformer`: the values learned from the training corpus
# that `_transform` passes on to `build_bm25!`.
struct BMI25TransformerResult
# terms kept as features at fit time; `_transform` rebuilds the document-term matrix over them
vocab::Vector{String}
# per-term IDF weights computed during fitting (see `compute_idf`)
idf_vector::Vector{Float64}
# mean of the column sums (`sum(dtm; dims=1)`) of the fitted document-term matrix.
# Stored at fit time so `build_bm25!` normalizes document lengths by the training
# mean rather than by the mean of whatever corpus is being transformed.
mean_words_in_docs::Float64
end

get_result(::BM25Transformer, idf::Vector{Float64}, vocab::Vector{String}) = BMI25TransformerResult(vocab, idf)
# Assemble the BM25 fit result. Besides the vocabulary and IDF vector, this records the
# mean of the column totals of the fitted document-term matrix, so that later
# transformations can reuse the same length normalization constant.
function get_result(
    ::BM25Transformer,
    idf::Vector{F},
    vocab::Vector{String},
    doc_term_mat::SparseMatrixCSC,
) where {F <: AbstractFloat}
    # column totals first, then convert to the IDF element type before averaging
    column_totals = sum(doc_term_mat; dims=1)
    avg_doc_length = mean(F.(column_totals))
    return BMI25TransformerResult(vocab, idf, avg_doc_length)
end

# BM25: Okapi Best Match 25
# Details at: https://en.wikipedia.org/wiki/Okapi_BM25
# derived from https://github.com/zgornel/StringAnalysis.jl/blob/master/src/stats.jl
function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
bm25::SparseMatrixCSC{F},
idf_vector::Vector{F};
idf_vector::Vector{F},
mean_words_in_docs::Float64;
κ::Int=2,
β::Float64=0.75) where {T <: Real, F <: AbstractFloat}
@assert size(doc_term_mat) == size(bm25)
Expand All @@ -82,7 +88,7 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},

# TF tells us what proportion of a document is defined by a term
words_in_documents = F.(sum(doc_term_mat; dims=1))
ln = words_in_documents ./ mean(words_in_documents)
ln = words_in_documents ./ mean_words_in_docs
oneval = one(F)

for i = 1:n
Expand All @@ -100,9 +106,9 @@ end
function _transform(transformer::BM25Transformer,
result::BMI25TransformerResult,
v::Corpus)
dtm_matrix = build_dtm(v, result.vocab)
bm25 = similar(dtm_matrix.dtm, eltype(result.idf_vector))
build_bm25!(dtm_matrix.dtm, bm25, result.idf_vector; κ=transformer.κ, β=transformer.β)
doc_terms = build_dtm(v, result.vocab)
bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
Expand All @@ -113,7 +119,8 @@ end
function MMI.fitted_params(::BM25Transformer, fitresult)
vocab = fitresult.vocab
idf_vector = fitresult.idf_vector
return (vocab = vocab, idf_vector = idf_vector)
mean_words_in_docs = fitresult.mean_words_in_docs
return (vocab = vocab, idf_vector = idf_vector, mean_words_in_docs = mean_words_in_docs)
end


Expand Down
9 changes: 5 additions & 4 deletions src/tfidf_transformer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ struct TfidfTransformerResult
idf_vector::Vector{Float64}
end

get_result(::TfidfTransformer, idf::Vector{Float64}, vocab::Vector{String}) = TfidfTransformerResult(vocab, idf)
# TF-IDF only needs the vocabulary and the IDF vector. The document-term matrix argument
# is accepted and ignored so this method has the same call shape as the other
# `get_result` methods.
function get_result(
    ::TfidfTransformer,
    idf::Vector{<:AbstractFloat},
    vocab::Vector{String},
    ::SparseMatrixCSC,
)
    return TfidfTransformerResult(vocab, idf)
end

function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
tfidf::SparseMatrixCSC{F},
Expand Down Expand Up @@ -89,9 +90,9 @@ end
function _transform(::TfidfTransformer,
result::TfidfTransformerResult,
v::Corpus)
dtm_matrix = build_dtm(v, result.vocab)
tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)
doc_terms = build_dtm(v, result.vocab)
tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
Expand Down
14 changes: 7 additions & 7 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
function limit_features(doc_term_matrix::DocumentTermMatrix,
function limit_features(doc_terms::DocumentTermMatrix,
high::Int,
low::Int)
doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))
doc_freqs = vec(sum(doc_terms.dtm, dims=2))

# build mask to restrict terms
mask = trues(length(doc_freqs))
Expand All @@ -12,9 +12,9 @@ function limit_features(doc_term_matrix::DocumentTermMatrix,
mask .&= (doc_freqs .>= low)
end

new_terms = doc_term_matrix.terms[mask]
new_terms = doc_terms.terms[mask]

return (doc_term_matrix.dtm[mask, :], new_terms)
return (doc_terms.dtm[mask, :], new_terms)
end

## Helper functions to build Corpus ##
Expand Down Expand Up @@ -55,11 +55,11 @@ function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
end
end
if length(rows) > 0
doc_term_matrix = sparse(rows, columns, values, m, n)
doc_term_mat = sparse(rows, columns, values, m, n)
else
doc_term_matrix = spzeros(Int, m, n)
doc_term_mat = spzeros(Int, m, n)
end
DocumentTermMatrix(doc_term_matrix, terms, row_indices)
DocumentTermMatrix(doc_term_mat, terms, row_indices)
end

## General method to calculate IDF vector ##
Expand Down
10 changes: 10 additions & 0 deletions test/abstract_text_transformer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,16 @@ using TextAnalysis
@test sum(test_doc_transform, dims=2)[2] > 0.0
@test size(test_doc_transform) == (2, 11)
end

# Fit-consistency check:
# a fitted machine must produce the same row for a given document no matter
# how many documents are passed to `transform` alongside it, i.e. nothing
# learned at fit time may be recomputed from the corpus being transformed.
for mach in test_machines
    alone = transform(mach, [test_doc2])
    duplicated = transform(mach, [test_doc2, test_doc2])
    @test alone[1, :] == duplicated[1, :]
end
end

@testset "bag of words use" begin
Expand Down