diff --git a/src/abstract_text_transformer.jl b/src/abstract_text_transformer.jl
index 0961400..5d7e704 100644
--- a/src/abstract_text_transformer.jl
+++ b/src/abstract_text_transformer.jl
@@ -34,17 +34,17 @@ function _fit(transformer::AbstractTextTransformer, verbosity::Int, X::Corpus)
     if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
         high = round(Int, transformer.max_doc_freq * n)
         low = round(Int, transformer.min_doc_freq * n)
-        new_dtm, vocab = limit_features(dtm_matrix, high, low)
+        new_doc_term_mat, vocab = limit_features(dtm_matrix, high, low)
     else
-        new_dtm = dtm_matrix.dtm
+        new_doc_term_mat = dtm_matrix.dtm
         vocab = dtm_matrix.terms
     end
 
     # calculate IDF
-    idf = compute_idf(transformer.smooth_idf, new_dtm)
+    idf = compute_idf(transformer.smooth_idf, new_doc_term_mat)
 
     # prepare result
-    fitresult = get_result(transformer, idf, vocab)
+    fitresult = get_result(transformer, idf, vocab, new_doc_term_mat)
     cache = nothing
 
     return fitresult, cache, NamedTuple()
diff --git a/src/bagofwords_transformer.jl b/src/bagofwords_transformer.jl
index 1ebaa9f..d1f6b62 100644
--- a/src/bagofwords_transformer.jl
+++ b/src/bagofwords_transformer.jl
@@ -2,7 +2,6 @@
     BagOfWordsTransformer()
 
 Convert a collection of raw documents to matrix representing a bag-of-words structure.
-
 Essentially, a bag-of-words approach to representing documents in a matrix is comprised of a count of every word in the
 document corpus/collection for every document. This is a simple but often quite powerful way of representing documents
 as vectors. The resulting representation is
@@ -12,7 +11,6 @@ document.
 
 Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
 to words occuring in a maximum or minimum portion of documents.
-
 The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary that the
 transformer will consider. `max_doc_freq` indicates that terms in only up to the
 specified percentage of documents will be considered. For example, if
diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 859ad77..80faf1c 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -57,16 +57,22 @@ end
 struct BMI25TransformerResult
     vocab::Vector{String}
     idf_vector::Vector{Float64}
+    mean_words_in_docs::Float64
 end
 
-get_result(::BM25Transformer, idf::Vector{Float64}, vocab::Vector{String}) = BMI25TransformerResult(vocab, idf)
+function get_result(::BM25Transformer, idf::Vector{F}, vocab::Vector{String}, doc_term_mat::SparseMatrixCSC) where {F <: AbstractFloat}
+    words_in_documents = F.(sum(doc_term_mat; dims=1))
+    mean_words_in_docs = mean(words_in_documents)
+    BMI25TransformerResult(vocab, idf, mean_words_in_docs)
+end
 
 # BM25: Okapi Best Match 25
 # Details at: https://en.wikipedia.org/wiki/Okapi_BM25
 # derived from https://github.com/zgornel/StringAnalysis.jl/blob/master/src/stats.jl
 function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
                      bm25::SparseMatrixCSC{F},
-                     idf_vector::Vector{F};
+                     idf_vector::Vector{F},
+                     mean_words_in_docs::Float64;
                      κ::Int=2,
                      β::Float64=0.75) where {T <: Real, F <: AbstractFloat}
     @assert size(doc_term_mat) == size(bm25)
@@ -82,7 +88,7 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
 
     # TF tells us what proportion of a document is defined by a term
     words_in_documents = F.(sum(doc_term_mat; dims=1))
-    ln = words_in_documents ./ mean(words_in_documents)
+    ln = words_in_documents ./ mean_words_in_docs
 
     oneval = one(F)
     for i = 1:n
@@ -100,9 +106,9 @@ end
 function _transform(transformer::BM25Transformer,
                     result::BMI25TransformerResult,
                     v::Corpus)
-    dtm_matrix = build_dtm(v, result.vocab)
-    bm25 = similar(dtm_matrix.dtm, eltype(result.idf_vector))
-    build_bm25!(dtm_matrix.dtm, bm25, result.idf_vector; κ=transformer.κ, β=transformer.β)
+    doc_terms = build_dtm(v, result.vocab)
+    bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
+    build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
 
     # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
@@ -113,7 +119,8 @@ end
 function MMI.fitted_params(::BM25Transformer, fitresult)
     vocab = fitresult.vocab
     idf_vector = fitresult.idf_vector
-    return (vocab = vocab, idf_vector = idf_vector)
+    mean_words_in_docs = fitresult.mean_words_in_docs
+    return (vocab = vocab, idf_vector = idf_vector, mean_words_in_docs = mean_words_in_docs)
 end
 
 
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index 59ea1a7..e8bc232 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -60,7 +60,8 @@ struct TfidfTransformerResult
     idf_vector::Vector{Float64}
 end
 
-get_result(::TfidfTransformer, idf::Vector{Float64}, vocab::Vector{String}) = TfidfTransformerResult(vocab, idf)
+get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) =
+    TfidfTransformerResult(vocab, idf)
 
 function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
                       tfidf::SparseMatrixCSC{F},
@@ -89,9 +90,9 @@ end
 function _transform(::TfidfTransformer,
                     result::TfidfTransformerResult,
                     v::Corpus)
-    dtm_matrix = build_dtm(v, result.vocab)
-    tfidf = similar(dtm_matrix.dtm, eltype(result.idf_vector))
-    build_tfidf!(dtm_matrix.dtm, tfidf, result.idf_vector)
+    doc_terms = build_dtm(v, result.vocab)
+    tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
+    build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)
 
     # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
diff --git a/src/utils.jl b/src/utils.jl
index a22e329..7f5a820 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,7 +1,7 @@
-function limit_features(doc_term_matrix::DocumentTermMatrix,
+function limit_features(doc_terms::DocumentTermMatrix,
                         high::Int,
                         low::Int)
-    doc_freqs = vec(sum(doc_term_matrix.dtm, dims=2))
+    doc_freqs = vec(sum(doc_terms.dtm, dims=2))
 
     # build mask to restrict terms
     mask = trues(length(doc_freqs))
@@ -12,9 +12,9 @@ function limit_features(doc_term_matrix::DocumentTermMatrix,
         mask .&= (doc_freqs .>= low)
     end
 
-    new_terms = doc_term_matrix.terms[mask]
+    new_terms = doc_terms.terms[mask]
 
-    return (doc_term_matrix.dtm[mask, :], new_terms)
+    return (doc_terms.dtm[mask, :], new_terms)
 end
 
 ## Helper functions to build Corpus ##
@@ -55,11 +55,11 @@ function build_dtm(docs::Corpus, terms::Vector{T}) where {T}
         end
     end
     if length(rows) > 0
-        doc_term_matrix = sparse(rows, columns, values, m, n)
+        doc_term_mat = sparse(rows, columns, values, m, n)
     else
-        doc_term_matrix = spzeros(Int, m, n)
+        doc_term_mat = spzeros(Int, m, n)
     end
-    DocumentTermMatrix(doc_term_matrix, terms, row_indices)
+    DocumentTermMatrix(doc_term_mat, terms, row_indices)
 end
 
 ## General method to calculate IDF vector ##
diff --git a/test/abstract_text_transformer.jl b/test/abstract_text_transformer.jl
index 5653bd7..af87528 100644
--- a/test/abstract_text_transformer.jl
+++ b/test/abstract_text_transformer.jl
@@ -60,6 +60,16 @@ using TextAnalysis
         @test sum(test_doc_transform, dims=2)[2] > 0.0
         @test size(test_doc_transform) == (2, 11)
     end
+
+    # test proper fit:
+    # here we are testing to make sure the size of the corpus to be
+    # transformed does not alter the transformation that the model
+    # is doing.
+    for mach = test_machines
+        single_doc_transform = transform(mach, [test_doc2])
+        multiple_doc_transform = transform(mach, [test_doc2, test_doc2])
+        @test single_doc_transform[1, :] == multiple_doc_transform[1, :]
+    end
 end
 
 @testset "bag of words use" begin
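For reference, the following is a minimal, self-contained sketch (not the package's code) of the Okapi BM25 weighting that `build_bm25!` applies, illustrating why `mean_words_in_docs` is now captured at fit time: the length-normalization term |D| / avgdl must use the average document length of the fitted corpus, otherwise a document's transform changes with the size of the batch it is transformed alongside. The helper name `bm25_weight`, the toy matrix, and the toy `idf` values are illustrative assumptions; only the `κ`/`β` defaults and the tf and `ln` definitions mirror the diff.

using SparseArrays, Statistics

function bm25_weight(doc_term_mat::SparseMatrixCSC, idf::Vector{Float64},
                     mean_words_in_docs::Float64; κ::Int=2, β::Float64=0.75)
    p, n = size(doc_term_mat)                         # terms x documents, as in build_dtm
    words_in_documents = vec(sum(doc_term_mat; dims=1))
    ln = words_in_documents ./ mean_words_in_docs     # |D| / avgdl, with avgdl fixed at fit time
    bm25 = zeros(Float64, p, n)
    for j in 1:n, i in 1:p
        # tf is the proportion of document j made up of term i
        tf = doc_term_mat[i, j] / max(words_in_documents[j], 1)
        bm25[i, j] = idf[i] * tf * (κ + 1) / (tf + κ * (1 - β + β * ln[j]))
    end
    return bm25
end

# two toy documents over a three-term vocabulary (columns are documents)
dtm = sparse([2 0; 1 1; 0 3])
idf = [log(2 / 1), log(2 / 2), log(2 / 1)] .+ 1.0     # toy smoothed-IDF values

# the mean document length is computed once, from the corpus seen at fit time ...
fitted_mean = mean(sum(dtm; dims=1))

# ... so weighting a single document or a batch gives identical values for it,
# which is exactly what the new "test proper fit" loop in the test suite checks
single = bm25_weight(dtm[:, 1:1], idf, fitted_mean)
batch  = bm25_weight(dtm, idf, fitted_mean)
@assert single[:, 1] ≈ batch[:, 1]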