From 9c67aed2a41d10df2f1b38652365210e6219131a Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 29 Aug 2022 14:15:03 -0500
Subject: [PATCH 1/7] update

---
 src/MLJText.jl           | 111 +++++++++++++++++++++++++++
 src/bm25_transformer.jl  | 154 +++++++++++++++++++++++++++++---------
 src/tfidf_transformer.jl | 157 +++++++++++++++++++++++++++------------
 3 files changed, 339 insertions(+), 83 deletions(-)

diff --git a/src/MLJText.jl b/src/MLJText.jl
index e54bf1a..5f7d4dc 100644
--- a/src/MLJText.jl
+++ b/src/MLJText.jl
@@ -26,4 +26,115 @@ include("bm25_transformer.jl")
 export TfidfTransformer, BM25Transformer, CountTransformer
 
+"""
+$(MMI.doc_header(TfidfTransformer))
+
+
+`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
+"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
+document-frequency. This is a common term weighting scheme in information retrieval that
+has also found good use in document classification. The goal of using TF-IDF instead of
+the raw frequencies of occurrence of a token in a given document is to scale down the
+impact of tokens that occur very frequently in a given corpus and that are hence
+empirically less informative than features that occur in a small fraction of the training
+corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
+document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
+
+
+# Training data
+
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
+  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
+  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
+  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
+  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
+  These `1`'s have the same effect as adding an extra document which contains every term
+  in the collection exactly once, preventing division by 0.
+
+
+# Operations
+
+
+- `transform(mach, Xnew)`: Return a transformed matrix of scitype
+  `Continuous` given new features `Xnew`.
+
+
+# Fitted parameters
+
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+- `idf_vector`: The transformer's calculated IDF vector.
+
+
+# Examples
+
+
+`TfidfTransformer` accepts a variety of inputs.
+In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+tfidf_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+tfidf_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
+
+"""
+
 end # module
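The smoothed IDF weighting quoted in the docstring above is easy to reproduce by hand.
The following is a minimal sketch in plain Julia, independent of MLJText; the helper
names (`df`, `idf`, `tfidf_score`) and the toy corpus are illustrative, not package API:

```julia
# Sketch of smoothed TF-IDF, following the formulas in the docstring:
# idf(t) = log((1 + n) / (1 + df(t))) + 1  and  tf_idf(t, d) = tf(t, d) * idf(t).
toy_corpus = [["hi", "my", "name"], ["hi", "there"], ["my", "name", "is", "sam"]]
n = length(toy_corpus)

df(t) = count(doc -> t in doc, toy_corpus)  # number of documents containing t
idf(t) = log((1 + n) / (1 + df(t))) + 1     # smoothed inverse document frequency
tf(t, doc) = count(==(t), doc)              # raw term frequency within one document
tfidf_score(t, doc) = tf(t, doc) * idf(t)

tfidf_score("hi", toy_corpus[1])   # ≈ 1.29; "hi" appears in 2 of 3 documents
tfidf_score("sam", toy_corpus[3])  # ≈ 1.69; a rarer term receives a higher weight
```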
diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 80faf1c..06be25f 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -1,37 +1,117 @@
 """
-    BM25Transformer()
-
-Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
-
-BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
-space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
-(IDF) so that, for each term in a document, its relative concentration in the document is
-scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability:
-in particular, the probability that a user will consider a search result relevant based
-on the terms in the search query and those in each document.
-
-The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
-`TfidfTransformer`. BM25 introduces two additional parameters:
-
-`κ` is the term frequency saturation characteristic. Higher values represent slower saturation.
-What we mean by saturation is the degree to which a term occurring extra times adds to the
-overall score. This defaults to 2.
-
-`β` is a parameter, bound between 0 and 1, that amplifies the particular document length
-compared to the average length. The bigger β is, the more document length is amplified in
-terms of the overall score. The default value is 0.75.
-
-For more explanations, please see:
-- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
-- https://en.wikipedia.org/wiki/Okapi_BM25
-- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
-
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are in at least 1% of
-documents will be included.
+$(MMI.doc_header(BM25Transformer))
+
+`BM25Transformer`: Convert a collection of raw documents to a matrix using the Okapi BM25
+document-word statistic. BM25 is an approach similar to that of TF-IDF in terms of
+representing documents in a vector space. The BM25 scoring function uses both term
+frequency (TF) and inverse document frequency (IDF) so that, for each term in a document,
+its relative concentration in the document is scored (like TF-IDF). However, BM25 improves
+upon TF-IDF by incorporating probability: in particular, the probability that a user will
+consider a search result relevant based on the terms in the search query and those in each
+document.
+
+
+# Training data
+
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
+  saturation. What we mean by saturation is the degree to which a term occurring extra
+  times adds to the overall score.
+- `β=0.75`: Amplifies the particular document length compared to the average length. The
+  bigger β is, the more document length is amplified in terms of the overall score. The
+  default value is 0.75, and the bounds are restricted between 0 and 1.
+- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
+  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
+  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
+  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
+  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
+  These `1`'s have the same effect as adding an extra document which contains every term
+  in the collection exactly once, preventing division by 0.
+
+
+# Operations
+
+
+- `transform(mach, Xnew)`: Return a transformed matrix of type
+  `ScientificTypesBase.Continuous` given new features `Xnew`.
+
+
+# Fitted parameters
+
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+- `idf_vector`: The transformer's calculated IDF vector.
+- `mean_words_in_docs`: The mean number of words in each document.
+
+
+# Examples
+
+
+`BM25Transformer` accepts a variety of inputs.
+In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+bm25_transformer = BM25Transformer()
+mach = machine(bm25_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+bm25_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+bm25_transformer = BM25Transformer()
+mach = machine(bm25_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+tfidf_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
+
 """
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
     κ::Int
     β::Float64
     smooth_idf::Bool
 end
 
@@ -41,13 +121,13 @@ mutable struct BM25Transformer <: AbstractTextTransformer
-function BM25Transformer(; 
+function BM25Transformer(;
         max_doc_freq::Float64 = 1.0,
         min_doc_freq::Float64 = 0.0,
         κ::Int=2,
         β::Float64=0.75,
         smooth_idf::Bool = true
-    ) 
+    )
     transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -103,14 +183,14 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
     return bm25
 end
 
-function _transform(transformer::BM25Transformer, 
+function _transform(transformer::BM25Transformer,
                     result::BM25TransformerResult,
                     v::Corpus)
     doc_terms = build_dtm(v, result.vocab)
     bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
     build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
 
-    # here we return the `adjoint` of our sparse matrix to conform to 
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(bm25)
 end
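The `κ` and `β` hyper-parameters documented above are easiest to understand with the
scoring function written out. The sketch below is the textbook Okapi BM25 term score, per
the references cited in the removed docstring; it is illustrative only and is not copied
from the package's internal `build_bm25!`:

```julia
# Textbook Okapi BM25 score for one term in one document: κ controls term
# frequency saturation and β controls document length normalization.
function bm25_score(tf, idf, doclen, mean_doclen; κ=2, β=0.75)
    tf * (κ + 1) / (tf + κ * (1 - β + β * doclen / mean_doclen)) * idf
end

# Saturation in action: extra occurrences add less and less to the score.
bm25_score(1, 1.0, 10, 10)   # = 1.0
bm25_score(10, 1.0, 10, 10)  # = 2.5, far less than 10 times the score above
```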
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index e8bc232..48c4502 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -1,46 +1,111 @@
 """
-    TfidfTransformer()
-
-The following is taken largely from scikit-learn's documentation:
-https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/feature_extraction/text.py
-
-Convert a collection of raw documents to a matrix of TF-IDF features.
-
-"TF" means term-frequency while "TF-IDF" means term-frequency times
-inverse document-frequency. This is a common term weighting scheme in
-information retrieval, that has also found good use in document
-classification.
-
-The goal of using TF-IDF instead of the raw frequencies of occurrence
-of a token in a given document is to scale down the impact of tokens
-that occur very frequently in a given corpus and that are hence
-empirically less informative than features that occur in a small
-fraction of the training corpus.
-
-The formula that is used to compute the TF-IDF for a term `t` of a
-document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
-idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`
-where `n` is the total number of documents in the document set and
-`df(t)` is the document frequency of `t`. The document frequency is
-the number of documents in the document set that contain the term
-`t`. The effect of adding “1” to the idf in the equation above is that
-terms with zero idf, i.e., terms that occur in all documents in a
-training set, will not be entirely ignored. (Note that the idf formula
-above differs from that appearing in standard texts,
-`idf(t) = log [ n / (df(t) + 1) ]`.)
-
-If `smooth_idf=true` (the default), the constant “1” is added to the
-numerator and denominator of the idf as if an extra document was seen
-containing every term in the collection exactly once, which prevents
-zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are in at least 1% of
-documents will be included.
+$(MMI.doc_header(TfidfTransformer))
+
+
+`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
+"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
+document-frequency. This is a common term weighting scheme in information retrieval that
+has also found good use in document classification. The goal of using TF-IDF instead of
+the raw frequencies of occurrence of a token in a given document is to scale down the
+impact of tokens that occur very frequently in a given corpus and that are hence
+empirically less informative than features that occur in a small fraction of the training
+corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
+document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
+
+
+# Training data
+
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
+  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
+  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
+  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
+  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
+  These `1`'s have the same effect as adding an extra document which contains every term
+  in the collection exactly once, preventing division by 0.
+
+
+# Operations
+
+
+- `transform(mach, Xnew)`: Return a transformed matrix of scitype
+  `Continuous` given new features `Xnew`.
+
+
+# Fitted parameters
+
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+- `idf_vector`: The transformer's calculated IDF vector.
+
+
+# Examples
+
+
+`TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+tfidf_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+tfidf_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
+
 """
 mutable struct TfidfTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
     smooth_idf::Bool
 end
 
@@ -48,7 +113,7 @@ mutable struct TfidfTransformer <: AbstractTextTransformer
-function TfidfTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0, smooth_idf::Bool = true) 
+function TfidfTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0, smooth_idf::Bool = true)
     transformer = TfidfTransformer(max_doc_freq, min_doc_freq, smooth_idf)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -60,7 +125,7 @@ struct TfidfTransformerResult
     vocab::Vector{String}
     idf_vector::Vector{Float64}
 end
 
-get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) = 
+get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) =
     TfidfTransformerResult(vocab, idf)
 
 function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
@@ -87,14 +152,14 @@ function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
     return tfidf
 end
 
-function _transform(::TfidfTransformer, 
+function _transform(::TfidfTransformer,
                     result::TfidfTransformerResult,
                     v::Corpus)
     doc_terms = build_dtm(v, result.vocab)
     tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
     build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)
 
-    # here we return the `adjoint` of our sparse matrix to conform to 
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(tfidf)
 end
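Patch 2 below adds an analogous docstring to `CountTransformer`. As background for that
docstring's bag-of-words discussion, here is a minimal sketch of a count matrix built by
hand in plain Julia (a dense comprehension with illustrative names; the package itself
builds a sparse matrix):

```julia
# Bag-of-words count matrix: one row per document, one column per vocabulary
# term, each entry the raw count of that term in that document.
vocab = ["hi", "my", "name", "sam"]
docs = [["hi", "my", "name"], ["hi", "hi", "sam"]]

counts = [count(==(t), doc) for doc in docs, t in vocab]
# 2×4 Matrix{Int64}:
#  1  1  1  0
#  2  0  0  1
```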
From bde10aec834d398b6090f7fb76923e34751bef6a Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 29 Aug 2022 14:18:18 -0500
Subject: [PATCH 2/7] up

---
 src/count_transformer.jl | 117 +++++++++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 24 deletions(-)

diff --git a/src/count_transformer.jl b/src/count_transformer.jl
index 504fa31..8c5c0b8 100644
--- a/src/count_transformer.jl
+++ b/src/count_transformer.jl
@@ -1,30 +1,99 @@
 """
-    CountTransformer()
-
-Convert a collection of raw documents to a matrix representing a bag-of-words structure from
-word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
-comprised of a count of every word in the document corpus/collection for every document.
-This is a simple but often quite powerful way of representing documents as vectors. The
-resulting representation is a matrix with rows representing every document in the corpus
-and columns representing every word in the corpus. The value for each cell is the raw count
-of a particular word in a particular document.
-
-Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
-to words occurring in a maximum or minimum portion of documents.
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are in at least 1% of
-documents will be included.
+$(MMI.doc_header(CountTransformer))
+
+`CountTransformer`: Convert a collection of raw documents to a matrix representing a
+bag-of-words structure from word counts. Essentially, a bag-of-words approach to
+representing documents in a matrix is comprised of a count of every word in the document
+corpus/collection for every document. This is a simple but often quite powerful way of
+representing documents as vectors. The resulting representation is a matrix with rows
+representing every document in the corpus and columns representing every word in the
+corpus. The value for each cell is the raw count of a particular word in a particular
+document.
+
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+
+# Operations
+
+- `transform(mach, Xnew)`: Return a transformed matrix of type
+  `ScientificTypesBase.Continuous` given new features `Xnew`.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+# Examples
+
+`CountTransformer` accepts a variety of inputs.
+In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+count_transformer = CountTransformer()
+mach = machine(count_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+count_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+count_transformer = CountTransformer()
+mach = machine(count_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+count_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
 """
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 end
 
-function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0) 
+function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
     transformer = CountTransformer(max_doc_freq, min_doc_freq)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -37,7 +106,7 @@ end
 function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     # process corpus vocab
-    update_lexicon!(X) 
+    update_lexicon!(X)
 
     # calculate min and max doc freq limits
     if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
@@ -58,12 +127,12 @@ function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     return fitresult, cache, NamedTuple()
 end
 
-function _transform(::CountTransformer, 
+function _transform(::CountTransformer,
                     result::CountTransformerResult,
                     v::Corpus)
     dtm_matrix = build_dtm(v, result.vocab)
 
-    # here we return the `adjoint` of our sparse matrix to conform to 
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(dtm_matrix.dtm)
 end
@@ -94,4 +163,4 @@ MMI.metadata_model(CountTransformer,
     output_scitype = AbstractMatrix{STB.Continuous},
     docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
     path = "MLJText.CountTransformer"
-    )
\ No newline at end of file
+    )
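A detail worth noting in the code above: `_transform` returns `adjoint(dtm_matrix.dtm)`.
The sketch below illustrates why, under the assumption (stated in the code comment) that
the internal document-term matrix is stored terms × documents:

```julia
# The internal sparse matrix is terms × documents; MLJ expects documents (the
# observations) as rows. A lazy adjoint gives the n × p orientation for free,
# without copying the sparse data.
using SparseArrays, LinearAlgebra

dtm = sparse([1 0 2; 0 1 1])  # 2 terms × 3 documents (toy values)
X = adjoint(dtm)              # 3 × 2 view: documents as rows, terms as columns
size(X)                       # (3, 2)
```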
From a1699d5b6fb9693eae45f829babff9dedee8176c Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 29 Aug 2022 14:21:29 -0500
Subject: [PATCH 3/7] doc export complete

---
 src/bm25_transformer.jl  | 18 +++---------------
 src/count_transformer.jl |  4 +---
 src/tfidf_transformer.jl | 16 ++--------------
 3 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 06be25f..1efb371 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -10,14 +10,11 @@ upon TF-IDF by incorporating probability: in particular, the probability that a
 consider a search result relevant based on the terms in the search query and those in each
 document.
 
-
-
 # Training data
 
-
 In MLJ or MLJBase, bind an instance `model` to data with
 
-mach = machine(model, X)
+    mach = machine(model, X)
 
 Where
 
@@ -28,10 +25,8 @@ Where
 
 Train the machine using `fit!(mach, rows=...)`.
 
-
 # Hyper-parameters
 
-
 - `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
   Terms that occur in `> max_doc_freq` documents will not be considered by the
   transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
@@ -54,27 +49,21 @@ Train the machine using `fit!(mach, rows=...)`.
   These `1`'s have the same effect as adding an extra document which contains every term
   in the collection exactly once, preventing division by 0.
 
-
 # Operations
 
-
 - `transform(mach, Xnew)`: Return a transformed matrix of type
   `ScientificTypesBase.Continuous` given new features `Xnew`.
 
-
 # Fitted parameters
 
-
 The fields of `fitted_params(mach)` are:
 
 - `vocab`: A vector containing the strings used in the transformer's vocabulary.
 - `idf_vector`: The transformer's calculated IDF vector.
 - `mean_words_in_docs`: The mean number of words in each document.
 
-
 # Examples
 
-
 `BM25Transformer` accepts a variety of inputs. In the example below, we use simple
 tokenized documents:
 
@@ -106,12 +95,11 @@ mach = machine(bm25_transformer, ngram_docs)
 MLJ.fit!(mach)
 fitted_params(mach)
 
-tfidf_mat = transform(mach, ngram_docs)
+bm25_mat = transform(mach, ngram_docs)
 ```
 
 See also
-[`GaussianNBClassifier`](@ref)
-
+[`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
 """
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64

diff --git a/src/count_transformer.jl b/src/count_transformer.jl
index 8c5c0b8..7ccde1d 100644
--- a/src/count_transformer.jl
+++ b/src/count_transformer.jl
@@ -27,7 +27,6 @@ Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-
 - `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
   Terms that occur in `> max_doc_freq` documents will not be considered by the
   transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
@@ -37,7 +36,6 @@ Train the machine using `fit!(mach, rows=...)`.
   transformer. A value of 0.01 means that only terms that are in at least 1% of the
   documents will be included.
 
-
 # Operations
 
 - `transform(mach, Xnew)`: Return a transformed matrix of type
@@ -86,7 +84,7 @@ count_mat = transform(mach, ngram_docs)
 ```
 
 See also
-[`GaussianNBClassifier`](@ref)
+[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
 """
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64

diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index 48c4502..49b600f 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -1,7 +1,6 @@
 """
 $(MMI.doc_header(TfidfTransformer))
 
-
 `TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
 "TF" means term-frequency while "TF-IDF" means term-frequency times inverse
 document-frequency. This is a common term weighting scheme in information retrieval that
@@ -12,13 +11,11 @@ informative than features that occur in a small fraction of the training corpus
 document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
 
-
 # Training data
 
-
 In MLJ or MLJBase, bind an instance `model` to data with
 
-mach = machine(model, X)
+    mach = machine(model, X)
 
 Where
 
@@ -29,10 +26,8 @@ Where
 
 Train the machine using `fit!(mach, rows=...)`.
 
-
 # Hyper-parameters
 
-
 - `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
   Terms that occur in `> max_doc_freq` documents will not be considered by the
   transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
@@ -49,26 +44,20 @@ Train the machine using `fit!(mach, rows=...)`.
   These `1`'s have the same effect as adding an extra document which contains every term
   in the collection exactly once, preventing division by 0.
 
-
 # Operations
 
-
 - `transform(mach, Xnew)`: Return a transformed matrix of scitype
   `Continuous` given new features `Xnew`.
 
-
 # Fitted parameters
 
-
 The fields of `fitted_params(mach)` are:
 
 - `vocab`: A vector containing the strings used in the transformer's vocabulary.
 - `idf_vector`: The transformer's calculated IDF vector.
 
-
 # Examples
 
-
 `TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
 tokenized documents:
 
@@ -104,8 +93,7 @@ tfidf_mat = transform(mach, ngram_docs)
 ```
 
 See also
-[`GaussianNBClassifier`](@ref)
-
+[`CountTransformer`](@ref), [`BM25Transformer`](@ref)
 """
 mutable struct TfidfTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
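Before the docstring refactor in the next two patches, note the numerical difference
between the two IDF definitions these docstrings keep restating. A minimal sketch,
assuming only the two equations quoted above:

```julia
# Unsmoothed vs. smoothed IDF, per the equations in the docstrings.
idf_unsmoothed(n, df) = log(n / df) + 1
idf_smoothed(n, df)   = log((1 + n) / (1 + df)) + 1

idf_unsmoothed(10, 10)  # = 1.0: a ubiquitous term keeps a nonzero weight
idf_smoothed(10, 10)    # = 1.0: same here
idf_smoothed(10, 0)     # = log(11) + 1 ≈ 3.4: finite even for an unseen term,
                        # whereas n / df(t) would divide by zero
```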
From 2d5f1a70ff90e34d5140b36bfe65ef756ee49743 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Mon, 5 Sep 2022 10:27:38 +1200
Subject: [PATCH 4/7] remove duplicate docstring

---
 src/MLJText.jl | 111 ------------------------------------------------
 1 file changed, 111 deletions(-)

diff --git a/src/MLJText.jl b/src/MLJText.jl
index 5f7d4dc..e54bf1a 100644
--- a/src/MLJText.jl
+++ b/src/MLJText.jl
@@ -26,115 +26,4 @@ include("bm25_transformer.jl")
 export TfidfTransformer, BM25Transformer, CountTransformer
 
-"""
-$(MMI.doc_header(TfidfTransformer))
-
-
-`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
-"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
-document-frequency. This is a common term weighting scheme in information retrieval that
-has also found good use in document classification. The goal of using TF-IDF instead of
-the raw frequencies of occurrence of a token in a given document is to scale down the
-impact of tokens that occur very frequently in a given corpus and that are hence
-empirically less informative than features that occur in a small fraction of the training
-corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
-document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
-
-
-# Training data
-
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-
-# Hyper-parameters
-
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
-  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
-  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
-  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
-  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-  These `1`'s have the same effect as adding an extra document which contains every term
-  in the collection exactly once, preventing division by 0.
-
-
-# Operations
-
-
-- `transform(mach, Xnew)`: Return a transformed matrix of scitype
-  `Continuous` given new features `Xnew`.
-
-
-# Fitted parameters
-
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-- `idf_vector`: The transformer's calculated IDF vector.
-
-
-# Examples
-
-
-`TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-tfidf_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-tfidf_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`GaussianNBClassifier`](@ref)
-
-"""
-
 end # module

From 84d75842588cfacfb9f81579cedaa99a644f7236 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Mon, 5 Sep 2022 15:31:48 +1200
Subject: [PATCH 5/7] move docstrings and refactor to reduce duplication

---
 src/MLJText.jl           |   2 +
 src/bm25_transformer.jl  | 180 +++++++++++++++++----------------------
 src/count_transformer.jl | 138 +++++++++++-------------------
 src/docstring_helpers.jl |  98 +++++++++++++++++++++
 src/tfidf_transformer.jl | 168 +++++++++++++++---------------------
 5 files changed, 295 insertions(+), 291 deletions(-)
 create mode 100644 src/docstring_helpers.jl

diff --git a/src/MLJText.jl b/src/MLJText.jl
index e54bf1a..c52a6f5 100644
--- a/src/MLJText.jl
+++ b/src/MLJText.jl
@@ -17,6 +17,7 @@ const PKG = "MLJText" # substitute model-providing package name
 const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
 const NGram{N} = NTuple{<:Any,<:AbstractString}
 
+include("docstring_helpers.jl")
 include("scitypes.jl")
 include("utils.jl")
 include("abstract_text_transformer.jl")
@@ -26,4 +27,5 @@ include("bm25_transformer.jl")
 
 export TfidfTransformer, BM25Transformer, CountTransformer
 
+
 end # module

diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 1efb371..dd1a9e6 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -1,106 +1,3 @@
-"""
-$(MMI.doc_header(BM25Transformer))
-
-`BM25Transformer`: Convert a collection of raw documents to a matrix using the Okapi BM25
-document-word statistic. BM25 is an approach similar to that of TF-IDF in terms of
-representing documents in a vector space. The BM25 scoring function uses both term
-frequency (TF) and inverse document frequency (IDF) so that, for each term in a document,
-its relative concentration in the document is scored (like TF-IDF). However, BM25 improves
-upon TF-IDF by incorporating probability: in particular, the probability that a user will
-consider a search result relevant based on the terms in the search query and those in each
-document.
-
-# Training data
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-    mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-# Hyper-parameters
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
-  saturation. What we mean by saturation is the degree to which a term occurring extra
-  times adds to the overall score.
-- `β=0.75`: Amplifies the particular document length compared to the average length. The
-  bigger β is, the more document length is amplified in terms of the overall score. The
-  default value is 0.75, and the bounds are restricted between 0 and 1.
-- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
-  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
-  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
-  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
-  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-  These `1`'s have the same effect as adding an extra document which contains every term
-  in the collection exactly once, preventing division by 0.
-
-# Operations
-
-- `transform(mach, Xnew)`: Return a transformed matrix of type
-  `ScientificTypesBase.Continuous` given new features `Xnew`.
-
-# Fitted parameters
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-- `idf_vector`: The transformer's calculated IDF vector.
-- `mean_words_in_docs`: The mean number of words in each document.
-
-# Examples
-
-`BM25Transformer` accepts a variety of inputs.
-In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-bm25_transformer = BM25Transformer()
-mach = machine(bm25_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-bm25_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-bm25_transformer = BM25Transformer()
-mach = machine(bm25_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-bm25_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
-"""
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 
+# # DOC STRING
+
+"""
+$(MMI.doc_header(BM25Transformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of [Okapi BM25 document-word
+statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
+term frequency (TF) and inverse document frequency (IDF, defined below), as in
+[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
+consider a search result relevant based on the terms in the search query and those in
+each document.
+
+$DOC_IDF
+
+References:
+
+- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
+- https://en.wikipedia.org/wiki/Okapi_BM25
+- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
+  saturation. What we mean by saturation is the degree to which a term occurring extra
+  times adds to the overall score.
+
+- `β=0.75`: Amplifies the particular document length compared to the average length. The
+  bigger β is, the more document length is amplified in terms of the overall score. The
+  default value is 0.75, and the bounds are restricted between 0 and 1.
+
+- `smooth_idf=true`: Control which definition of IDF to use (see above).
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary, IDF, and mean word counts learned in
+  training, return the matrix of BM25 scores for `Xnew`, a vector of the same form as `X`
+  above. The matrix has size `(n, p)`, where `n = length(Xnew)` and `p` the size of the
+  vocabulary.
+  Tokens/ngrams not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+- `idf_vector`: The transformer's calculated IDF vector.
+
+- `mean_words_in_docs`: The mean number of words in each document.
+
+$(doc_examples(:BM25Transformer))
+
+See also [`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
+
+"""
+BM25Transformer

diff --git a/src/count_transformer.jl b/src/count_transformer.jl
index 7ccde1d..b701e6f 100644
--- a/src/count_transformer.jl
+++ b/src/count_transformer.jl
@@ -1,91 +1,3 @@
-"""
-$(MMI.doc_header(CountTransformer))
-
-`CountTransformer`: Convert a collection of raw documents to a matrix representing a
-bag-of-words structure from word counts. Essentially, a bag-of-words approach to
-representing documents in a matrix is comprised of a count of every word in the document
-corpus/collection for every document. This is a simple but often quite powerful way of
-representing documents as vectors. The resulting representation is a matrix with rows
-representing every document in the corpus and columns representing every word in the
-corpus. The value for each cell is the raw count of a particular word in a particular
-document.
-
-
-# Training data
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-    mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-# Hyper-parameters
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-
-# Operations
-
-- `transform(mach, Xnew)`: Return a transformed matrix of type
-  `ScientificTypesBase.Continuous` given new features `Xnew`.
-
-# Fitted parameters
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-
-# Examples
-
-`CountTransformer` accepts a variety of inputs.
-In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-count_transformer = CountTransformer()
-mach = machine(count_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-count_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-count_transformer = CountTransformer()
-mach = machine(count_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-count_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
-"""
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 
+# # DOCUMENT STRING
+
+"""
+$(MMI.doc_header(CountTransformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of term counts.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary learned in training, return the matrix
+  of counts for `Xnew`, a vector of the same form as `X` above. The matrix has size `(n,
+  p)`, where `n = length(Xnew)` and `p` the size of the vocabulary. Tokens/ngrams not
+  appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+$(doc_examples(:CountTransformer))
+
+See also
+[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
+"""
+CountTransformer

diff --git a/src/docstring_helpers.jl b/src/docstring_helpers.jl
new file mode 100644
index 0000000..ff8b024
--- /dev/null
+++ b/src/docstring_helpers.jl
@@ -0,0 +1,98 @@
+const DOC_IDF =
+    """
+    In textbooks and implementations there is variation in the definition of IDF. Here two
+    IDF definitions are available. The default, smoothed option provides the IDF for a
+    term `t` as `log((1 + n)/(1 + df(t))) + 1`, where `n` is the total number of documents
+    and `df(t)` the number of documents in which `t` appears. Setting `smooth_idf = false`
+    provides an IDF of `log(n/df(t)) + 1`.
+
+    """
+
+const DOC_TRANSFORMER_INPUTS =
+    """
+    Here:
+
+    - `X` is any vector whose elements are either tokenized documents or bags of
+      words/ngrams.
+      Specifically, each element is one of the following:
+
+      - A vector of abstract strings (tokens), e.g., `["I", "like", "Sam", ".", "Sam",
+        "is", "nice", "."]` (scitype `AbstractVector{Textual}`)
+
+      - A dictionary of counts, indexed on abstract strings, e.g., `Dict("I"=>1, "Sam"=>2,
+        "Sam is"=>1)` (scitype `Multiset{Textual}`)
+
+      - A dictionary of counts, indexed on plain ngrams, e.g., `Dict(("I",)=>1,
+        ("Sam",)=>2, ("I", "Sam")=>1)` (scitype `Multiset{<:NTuple{N,Textual} where N}`);
+        here a *plain ngram* is a tuple of abstract strings.
+
+    """
+
+function doc_examples(T)
+    t = begin
+        T == :TfidfTransformer ? "tfidf_transformer" :
+        T == :BM25Transformer ? "bm25_transformer" :
+        T == :CountTransformer ? "count_transformer" :
+        error("Problem generating a document string for $T.")
+    end
+
+    """
+
+    # Examples
+
+    `$T` accepts a variety of inputs. The example below transforms tokenized documents:
+
+    ```julia
+    using MLJ
+    import TextAnalysis
+
+    $T = @load $T pkg=MLJText
+
+    docs = ["Hi my name is Sam.", "How are you today?"]
+    $t = $T()
+
+    julia> tokenized_docs = TextAnalysis.tokenize.(docs)
+    2-element Vector{Vector{String}}:
+     ["Hi", "my", "name", "is", "Sam", "."]
+     ["How", "are", "you", "today", "?"]
+
+    mach = machine($t, tokenized_docs)
+    fit!(mach)
+
+    fitted_params(mach)
+
+    tfidf_mat = transform(mach, tokenized_docs)
+    ```
+
+    Alternatively, one can provide documents pre-parsed as ngrams counts:
+
+    ```julia
+    using MLJ
+    import TextAnalysis
+
+    docs = ["Hi my name is Sam.", "How are you today?"]
+    corpus = TextAnalysis.Corpus(TextAnalysis.NGramDocument.(docs, 1, 2))
+    ngram_docs = TextAnalysis.ngrams.(corpus)
+
+    julia> ngram_docs[1]
+    Dict{AbstractString, Int64} with 11 entries:
+      "is"      => 1
+      "my"      => 1
+      "name"    => 1
+      "."       => 1
+      "Hi"      => 1
+      "Sam"     => 1
+      "my name" => 1
+      "Hi my"   => 1
+      "name is" => 1
+      "Sam ."   => 1
+      "is Sam"  => 1
+
+    $t = $T()
+    mach = machine($t, ngram_docs)
+    MLJ.fit!(mach)
+    fitted_params(mach)
+
+    tfidf_mat = transform(mach, ngram_docs)
+    ```
+    """
+end
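The helper file above relies on ordinary `$`-interpolation to splice shared fragments into
docstrings. For readers unfamiliar with the pattern, here is a minimal self-contained
sketch (toy names, unrelated to MLJText):

```julia
# A docstring is just a string literal, so shared fragments can be stored in
# constants and interpolated at definition time.
const DOC_SHARED_ARG =
    """
    - `x`: the input value
    """

"""
    double(x)

Double the input.

# Arguments

$DOC_SHARED_ARG
"""
double(x) = 2x
```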
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index 49b600f..2bfeaa4 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -1,100 +1,3 @@
-"""
-$(MMI.doc_header(TfidfTransformer))
-
-`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
-"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
-document-frequency. This is a common term weighting scheme in information retrieval that
-has also found good use in document classification. The goal of using TF-IDF instead of
-the raw frequencies of occurrence of a token in a given document is to scale down the
-impact of tokens that occur very frequently in a given corpus and that are hence
-empirically less informative than features that occur in a small fraction of the training
-corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
-document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
-
-# Training data
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-    mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-# Hyper-parameters
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
-  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
-  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
-  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
-  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-  These `1`'s have the same effect as adding an extra document which contains every term
-  in the collection exactly once, preventing division by 0.
-
-# Operations
-
-- `transform(mach, Xnew)`: Return a transformed matrix of scitype
-  `Continuous` given new features `Xnew`.
-
-# Fitted parameters
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-- `idf_vector`: The transformer's calculated IDF vector.
-
-# Examples
-
-`TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-tfidf_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-tfidf_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`CountTransformer`](@ref), [`BM25Transformer`](@ref)
-"""
 mutable struct TfidfTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
@@ -139,7 +42,6 @@ function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
     return tfidf
 end
 
-
 function _transform(::TfidfTransformer,
                     result::TfidfTransformerResult,
                     v::Corpus)
@@ -178,6 +80,72 @@ MMI.metadata_model(TfidfTransformer,
         AbstractVector{<:STB.Multiset{STB.Textual}}
     },
     output_scitype = AbstractMatrix{STB.Continuous},
-    docstring = "Build TF-IDF matrix from raw documents",
-    path = "MLJText.TfidfTransformer"
+    human_name = "TF-IDF transformer",
+    path = "MLJText.TfidfTransformer",
     )
 
+
+# # DOCUMENT STRING
+
+"""
+$(MMI.doc_header(TfidfTransformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of [TF-IDF
+scores](https://en.wikipedia.org/wiki/Tf–idf#Inverse_document_frequency_2). Here "TF"
+means term-frequency while "IDF" means inverse document frequency (defined below). The
+TF-IDF score is the product of the two. This is a common term weighting scheme in
+information retrieval that has also found good use in document classification.
+The goal of
+using TF-IDF instead of the raw frequencies of occurrence of a token in a given document
+is to scale down the impact of tokens that occur very frequently in a given corpus and
+that are hence empirically less informative than features that occur in a small fraction
+of the training corpus.
+
+$DOC_IDF
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `> max_doc_freq` documents will not be considered by the transformer. For
+  example, if `max_doc_freq` is set to 0.9, terms that are in more than 90% of the
+  documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `< min_doc_freq` documents will not be considered by the transformer. A
+  value of 0.01 means that only terms that are in at least 1% of the documents will be
+  included.
+
+- `smooth_idf=true`: Control which definition of IDF to use (see above).
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary and IDF learned in training, return the
+  matrix of TF-IDF scores for `Xnew`, a vector of the same form as `X` above. The matrix
+  has size `(n, p)`, where `n = length(Xnew)` and `p` the size of the
+  vocabulary. Tokens/ngrams not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+- `idf_vector`: The transformer's calculated IDF vector.
+
+$(doc_examples(:TfidfTransformer))
+
+See also [`CountTransformer`](@ref), [`BM25Transformer`](@ref)
+
+"""
+TfidfTransformer

From a086384b0713695ce964ea0ccc1f2e1ac5241595 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Thu, 8 Sep 2022 09:53:29 +1200
Subject: [PATCH 6/7] bump compat MLJModelInterface = "1.4"

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index ca7cd5c..9b23ca9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,7 +14,7 @@ TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
 
 [compat]
 CorpusLoaders = "0.3"
-MLJModelInterface = "1.3"
+MLJModelInterface = "1.4"
 ScientificTypes = "2.2.2, 3"
 ScientificTypesBase = "2.2.0, 3"
 TextAnalysis = "0.7.3"

From de2281eee20d3bc001cf2160bf230188ab7c3861 Mon Sep 17 00:00:00 2001
From: "Anthony D.
Blaom" Date: Thu, 8 Sep 2022 09:54:59 +1200 Subject: [PATCH 7/7] remove deprecated use of `docstring` in metadata_model declaration --- src/bm25_transformer.jl | 1 - src/count_transformer.jl | 1 - 2 files changed, 2 deletions(-) diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl index dd1a9e6..0b2013f 100644 --- a/src/bm25_transformer.jl +++ b/src/bm25_transformer.jl @@ -107,7 +107,6 @@ MMI.metadata_model(BM25Transformer, AbstractVector{<:STB.Multiset{STB.Textual}} }, output_scitype = AbstractMatrix{STB.Continuous}, - docstring = "Build BM-25 matrix from raw documents", path = "MLJText.BM25Transformer" ) diff --git a/src/count_transformer.jl b/src/count_transformer.jl index b701e6f..f99e9c6 100644 --- a/src/count_transformer.jl +++ b/src/count_transformer.jl @@ -71,7 +71,6 @@ MMI.metadata_model(CountTransformer, AbstractVector{<:STB.Multiset{STB.Textual}} }, output_scitype = AbstractMatrix{STB.Continuous}, - docstring = "Build Bag-of-Words matrix from word counts for corpus of documents", path = "MLJText.CountTransformer" )
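The `max_doc_freq`/`min_doc_freq` vocabulary restriction described throughout these
docstrings amounts to pruning terms by relative document frequency. As a closing
illustration, here is a minimal sketch in plain Julia (illustrative names, not the
package's internals):

```julia
# Keep only terms whose relative document frequency lies in
# [min_doc_freq, max_doc_freq].
function prune_vocab(docs; min_doc_freq=0.0, max_doc_freq=1.0)
    n = length(docs)
    vocab = unique(reduce(vcat, docs))
    keep(t) = min_doc_freq <= count(d -> t in d, docs) / n <= max_doc_freq
    filter(keep, vocab)
end

docs = [["a", "b"], ["a", "c"], ["a", "b", "d"]]
prune_vocab(docs; max_doc_freq=0.9)  # drops "a", which appears in every document
```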