From 62e04d3a434f6ad4684eab462b224646b5b2592f Mon Sep 17 00:00:00 2001 From: Christopher Alexander Date: Mon, 7 Feb 2022 12:11:34 -0500 Subject: [PATCH 1/2] change bagofwords transformer to count transformer --- README.md | 8 ++-- src/MLJText.jl | 4 +- ...ds_transformer.jl => count_transformer.jl} | 42 +++++++++---------- test/abstract_text_transformer.jl | 30 ++++++------- 4 files changed, 42 insertions(+), 42 deletions(-) rename src/{bagofwords_transformer.jl => count_transformer.jl} (64%) diff --git a/README.md b/README.md index 98291ab..303ae65 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ BM25Transformer( ``` Please see [http://ethen8181.github.io/machine-learning/search/bm25_intro.html](http://ethen8181.github.io/machine-learning/search/bm25_intro.html) for more details about how these parameters affect the matrix that is generated. -## Bag-of-Words Transformer +## Count Transformer The `MLJText` package also offers a way to represent documents using the simpler bag-of-words representation. This returns a document-term matrix (as you would get in `TextAnalysis`) that consists of the count for every word in the corpus for each document in the corpus. 
### Usage @@ -97,11 +97,11 @@ The `MLJText` package also offers a way to represent documents using the simpler using MLJ, MLJText, TextAnalysis docs = ["Hi my name is Sam.", "How are you today?"] -bagofwords_transformer = BagOfWordsTransformer() -mach = machine(bagofwords_transformer, tokenize.(docs)) +count_transformer = CountTransformer() +mach = machine(count_transformer, tokenize.(docs)) MLJ.fit!(mach) -bagofwords_mat = transform(mach, tokenize.(docs)) +count_mat = transform(mach, tokenize.(docs)) ``` The resulting matrix looks like: diff --git a/src/MLJText.jl b/src/MLJText.jl index 8f97f0d..e54bf1a 100644 --- a/src/MLJText.jl +++ b/src/MLJText.jl @@ -21,9 +21,9 @@ include("scitypes.jl") include("utils.jl") include("abstract_text_transformer.jl") include("tfidf_transformer.jl") -include("bagofwords_transformer.jl") +include("count_transformer.jl") include("bm25_transformer.jl") -export TfidfTransformer, BM25Transformer, BagOfWordsTransformer +export TfidfTransformer, BM25Transformer, CountTransformer end # module diff --git a/src/bagofwords_transformer.jl b/src/count_transformer.jl similarity index 64% rename from src/bagofwords_transformer.jl rename to src/count_transformer.jl index d1f6b62..504fa31 100644 --- a/src/bagofwords_transformer.jl +++ b/src/count_transformer.jl @@ -1,13 +1,13 @@ """ - BagOfWordsTransformer() + CountTransformer() -Convert a collection of raw documents to matrix representing a bag-of-words structure. -Essentially, a bag-of-words approach to representing documents in a matrix is comprised of -a count of every word in the document corpus/collection for every document. This is a simple -but often quite powerful way of representing documents as vectors. The resulting representation is -a matrix with rows representing every document in the corpus and columns representing every word -in the corpus. The value for each cell is the raw count of a particular word in a particular -document. 
+Convert a collection of raw documents to a matrix representing a bag-of-words structure from +word counts. Essentially, a bag-of-words approach to representing documents in a matrix is +comprised of a count of every word in the document corpus/collection for every document. +This is a simple but often quite powerful way of representing documents as vectors. The +resulting representation is a matrix with rows representing every document in the corpus +and columns representing every word in the corpus. The value for each cell is the raw count +of a particular word in a particular document. Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted to words occuring in a maximum or minimum portion of documents. @@ -19,23 +19,23 @@ will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the other direction. A value of 0.01 means that only terms that are at least in 1% of documents will be included. """ -mutable struct BagOfWordsTransformer <: AbstractTextTransformer +mutable struct CountTransformer <: AbstractTextTransformer max_doc_freq::Float64 min_doc_freq::Float64 end -function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0) - transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq) +function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0) + transformer = CountTransformer(max_doc_freq, min_doc_freq) message = MMI.clean!(transformer) isempty(message) || @warn message return transformer end -struct BagOfWordsTransformerResult +struct CountTransformerResult vocab::Vector{String} end -function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus) +function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus) # process corpus vocab update_lexicon!(X) @@ -52,14 +52,14 @@ function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus) end # prepare result - fitresult = BagOfWordsTransformerResult(vocab) + fitresult = 
CountTransformerResult(vocab) cache = nothing return fitresult, cache, NamedTuple() end -function _transform(::BagOfWordsTransformer, - result::BagOfWordsTransformerResult, +function _transform(::CountTransformer, + result::CountTransformerResult, v::Corpus) dtm_matrix = build_dtm(v, result.vocab) @@ -69,14 +69,14 @@ function _transform(::BagOfWordsTransformer, end # for returning user-friendly form of the learned parameters: -function MMI.fitted_params(::BagOfWordsTransformer, fitresult::BagOfWordsTransformerResult) +function MMI.fitted_params(::CountTransformer, fitresult::CountTransformerResult) vocab = fitresult.vocab return (vocab = vocab,) end ## META DATA -MMI.metadata_pkg(BagOfWordsTransformer, +MMI.metadata_pkg(CountTransformer, name="$PKG", uuid="7876af07-990d-54b4-ab0e-23690620f79a", url="https://github.com/JuliaAI/MLJText.jl", @@ -85,13 +85,13 @@ MMI.metadata_pkg(BagOfWordsTransformer, is_wrapper=false ) -MMI.metadata_model(BagOfWordsTransformer, +MMI.metadata_model(CountTransformer, input_scitype = Union{ AbstractVector{<:AbstractVector{STB.Textual}}, AbstractVector{<:STB.Multiset{<:ScientificNGram}}, AbstractVector{<:STB.Multiset{STB.Textual}} }, output_scitype = AbstractMatrix{STB.Continuous}, - docstring = "Build Bag-of-Words matrix for corpus of documents", - path = "MLJText.BagOfWordsTransformer" + docstring = "Build Bag-of-Words matrix from word counts for corpus of documents", + path = "MLJText.CountTransformer" ) \ No newline at end of file diff --git a/test/abstract_text_transformer.jl b/test/abstract_text_transformer.jl index af87528..676ec76 100644 --- a/test/abstract_text_transformer.jl +++ b/test/abstract_text_transformer.jl @@ -13,17 +13,17 @@ using TextAnalysis test_tfidf_machine = @test_logs machine(tfidf_transformer, ngram_vec) MLJBase.fit!(test_tfidf_machine) - # train bag_of_words transformer - bagofwords_vectorizer = MLJText.BagOfWordsTransformer() - test_bow_machine = @test_logs machine(bagofwords_vectorizer, ngram_vec) - 
MLJBase.fit!(test_bow_machine) + # train count transformer + count_transformer = MLJText.CountTransformer() + test_count_machine = @test_logs machine(count_transformer, ngram_vec) + MLJBase.fit!(test_count_machine) # train bm25 transformer bm25_transformer = MLJText.BM25Transformer() test_bm25_machine = @test_logs machine(bm25_transformer, ngram_vec) MLJBase.fit!(test_bm25_machine) - test_machines = [test_tfidf_machine, test_bow_machine, test_bm25_machine] + test_machines = [test_tfidf_machine, test_count_machine, test_bm25_machine] # test single doc test_doc1 = ngrams(NGramDocument("Another sentence ok")) @@ -91,10 +91,10 @@ end test_tfidf_machine2 = @test_logs machine(tfidf_transformer, [bag]) MLJBase.fit!(test_tfidf_machine2) - # train bag_of_words transformer - bagofwords_vectorizer = MLJText.BagOfWordsTransformer() - test_bow_machine2 = @test_logs machine(bagofwords_vectorizer, [bag]) - MLJBase.fit!(test_bow_machine2) + # train count transformer + count_transformer = MLJText.CountTransformer() + test_count_machine2 = @test_logs machine(count_transformer, [bag]) + MLJBase.fit!(test_count_machine2) # train bm25 transformer bm25_transformer = MLJText.BM25Transformer() @@ -102,7 +102,7 @@ end MLJBase.fit!(test_bm25_machine2) test_doc5 = ["How about a cat in a hat"] - for mach = [test_tfidf_machine2, test_bow_machine2, test_bm25_machine2] + for mach = [test_tfidf_machine2, test_count_machine2, test_bm25_machine2] test_doc_transform = transform(mach, test_doc5) @test sum(test_doc_transform, dims=2)[1] > 0.0 @test size(test_doc_transform) == (1, 8) @@ -126,10 +126,10 @@ end test_tfidf_machine3 = @test_logs machine(tfidf_transformer, ngram_vec) MLJBase.fit!(test_tfidf_machine3) - # train bag_of_words transformer - bagofwords_vectorizer = MLJText.BagOfWordsTransformer(max_doc_freq=0.8) - test_bow_machine3 = @test_logs machine(bagofwords_vectorizer, ngram_vec) - MLJBase.fit!(test_bow_machine3) + # train count transformer + count_transformer = 
MLJText.CountTransformer(max_doc_freq=0.8) + test_count_machine3 = @test_logs machine(count_transformer, ngram_vec) + MLJBase.fit!(test_count_machine3) # train bm25 transformer bm25_transformer = MLJText.BM25Transformer(max_doc_freq=0.8, min_doc_freq=0.2) @@ -140,7 +140,7 @@ end test_doc_transform = transform(test_tfidf_machine3, ngram_vec) @test (Vector(vec(sum(test_doc_transform, dims=2))) .> 0.2) == Bool[1, 1, 1, 1, 1, 1] - test_doc_transform = transform(test_bow_machine3, ngram_vec) + test_doc_transform = transform(test_count_machine3, ngram_vec) @test Vector(vec(sum(test_doc_transform, dims=2))) == [14, 10, 14, 9, 13, 7] test_doc_transform = transform(test_bm25_machine3, ngram_vec) From c37e01cc36b12c8ef988250caee6c2a027e20d26 Mon Sep 17 00:00:00 2001 From: Chris Alexander Date: Mon, 7 Feb 2022 20:50:12 -0500 Subject: [PATCH 2/2] bump to version 0.2.0 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index cbec6ce..ca7cd5c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJText" uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387" authors = ["Chris Alexander , Anthony D. Blaom "] -version = "0.1.3" +version = "0.2.0" [deps] CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"