From 9c67aed2a41d10df2f1b38652365210e6219131a Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 29 Aug 2022 14:15:03 -0500
Subject: [PATCH 1/7] update

---
 src/MLJText.jl           | 111 +++++++++++++++++++++++++++
 src/bm25_transformer.jl  | 154 +++++++++++++++++++++++++++++---------
 src/tfidf_transformer.jl | 157 +++++++++++++++++++++++++++------------
 3 files changed, 339 insertions(+), 83 deletions(-)

diff --git a/src/MLJText.jl b/src/MLJText.jl
index e54bf1a..5f7d4dc 100644
--- a/src/MLJText.jl
+++ b/src/MLJText.jl
@@ -26,4 +26,115 @@ include("bm25_transformer.jl")
 export TfidfTransformer, BM25Transformer, CountTransformer
 
+"""
+$(MMI.doc_header(TfidfTransformer))
+
+
+`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
+"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
+document-frequency. This is a common term weighting scheme in information retrieval that
+has also found good use in document classification. The goal of using TF-IDF instead of
+the raw frequencies of occurrence of a token in a given document is to scale down the
+impact of tokens that occur very frequently in a given corpus and that are hence
+empirically less informative than features that occur in a small fraction of the training
+corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
+document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
+
+
+# Training data
+
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
+  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
+  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
+  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
+  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
+  These `1`'s have the same effect as adding an extra document which contains every term
+  in the collection exactly once, preventing division by 0.
+
+
+# Operations
+
+
+- `transform(mach, Xnew)`: Return a transformed matrix of scitype
+  `Continuous` given new features `Xnew`.
+
+
+# Fitted parameters
+
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+- `idf_vector`: The transformer's calculated IDF vector.
+
+
+# Examples
+
+
+`TfidfTransformer` accepts a variety of inputs.
+In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+tfidf_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+tfidf_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
+
+"""
+
 end # module
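The smoothed IDF weighting quoted in the docstring above is easy to reproduce by hand.
The following is a minimal sketch in plain Julia, independent of MLJText; the helper
names (`df`, `idf`, `tfidf_score`) and the toy corpus are illustrative, not package API:

```julia
# Sketch of smoothed TF-IDF, following the formulas in the docstring:
# idf(t) = log((1 + n) / (1 + df(t))) + 1  and  tf_idf(t, d) = tf(t, d) * idf(t).
toy_corpus = [["hi", "my", "name"], ["hi", "there"], ["my", "name", "is", "sam"]]
n = length(toy_corpus)

df(t) = count(doc -> t in doc, toy_corpus)  # number of documents containing t
idf(t) = log((1 + n) / (1 + df(t))) + 1     # smoothed inverse document frequency
tf(t, doc) = count(==(t), doc)              # raw term frequency within one document
tfidf_score(t, doc) = tf(t, doc) * idf(t)

tfidf_score("hi", toy_corpus[1])   # ≈ 1.29; "hi" appears in 2 of 3 documents
tfidf_score("sam", toy_corpus[3])  # ≈ 1.69; a rarer term receives a higher weight
```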
diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 80faf1c..06be25f 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -1,37 +1,117 @@
 """
-    BM25Transformer()
-
-Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
-
-BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
-space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
-(IDF) so that, for each term in a document, its relative concentration in the document is
-scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability:
-in particular, the probability that a user will consider a search result relevant based
-on the terms in the search query and those in each document.
-
-The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
-`TfidfTransformer`. BM25 introduces two additional parameters:
-
-`κ` is the term frequency saturation characteristic. Higher values represent slower saturation.
-What we mean by saturation is the degree to which a term occurring extra times adds to the
-overall score. This defaults to 2.
-
-`β` is a parameter, bound between 0 and 1, that amplifies the particular document length
-compared to the average length. The bigger β is, the more document length is amplified in
-terms of the overall score. The default value is 0.75.
-
-For more explanations, please see:
-- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
-- https://en.wikipedia.org/wiki/Okapi_BM25
-- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
-
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are in at least 1% of
-documents will be included.
+$(MMI.doc_header(BM25Transformer))
+
+`BM25Transformer`: Convert a collection of raw documents to a matrix using the Okapi BM25
+document-word statistic. BM25 is an approach similar to that of TF-IDF in terms of
+representing documents in a vector space. The BM25 scoring function uses both term
+frequency (TF) and inverse document frequency (IDF) so that, for each term in a document,
+its relative concentration in the document is scored (like TF-IDF). However, BM25 improves
+upon TF-IDF by incorporating probability: in particular, the probability that a user will
+consider a search result relevant based on the terms in the search query and those in each
+document.
+
+
+# Training data
+
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
+  saturation. What we mean by saturation is the degree to which a term occurring extra
+  times adds to the overall score.
+- `β=0.75`: Amplifies the particular document length compared to the average length. The
+  bigger β is, the more document length is amplified in terms of the overall score. The
+  default value is 0.75, and the bounds are restricted between 0 and 1.
+- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
+  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
+  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
+  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
+  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
+  These `1`'s have the same effect as adding an extra document which contains every term
+  in the collection exactly once, preventing division by 0.
+
+
+# Operations
+
+
+- `transform(mach, Xnew)`: Return a transformed matrix of type
+  `ScientificTypesBase.Continuous` given new features `Xnew`.
+
+
+# Fitted parameters
+
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+- `idf_vector`: The transformer's calculated IDF vector.
+- `mean_words_in_docs`: The mean number of words in each document.
+
+
+# Examples
+
+
+`BM25Transformer` accepts a variety of inputs.
+In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+bm25_transformer = BM25Transformer()
+mach = machine(bm25_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+bm25_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+bm25_transformer = BM25Transformer()
+mach = machine(bm25_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+tfidf_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
+
 """
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
     κ::Int
     β::Float64
     smooth_idf::Bool
 end
 
@@ -41,13 +121,13 @@ mutable struct BM25Transformer <: AbstractTextTransformer
-function BM25Transformer(; 
+function BM25Transformer(;
         max_doc_freq::Float64 = 1.0,
         min_doc_freq::Float64 = 0.0,
         κ::Int=2,
         β::Float64=0.75,
         smooth_idf::Bool = true
-    ) 
+    )
     transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -103,14 +183,14 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
     return bm25
 end
 
-function _transform(transformer::BM25Transformer, 
+function _transform(transformer::BM25Transformer,
                     result::BM25TransformerResult,
                     v::Corpus)
     doc_terms = build_dtm(v, result.vocab)
     bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
     build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)
 
-    # here we return the `adjoint` of our sparse matrix to conform to 
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(bm25)
 end
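The `κ` and `β` hyper-parameters documented above are easiest to understand with the
scoring function written out. The sketch below is the textbook Okapi BM25 term score, per
the references cited in the removed docstring; it is illustrative only and is not copied
from the package's internal `build_bm25!`:

```julia
# Textbook Okapi BM25 score for one term in one document: κ controls term
# frequency saturation and β controls document length normalization.
function bm25_score(tf, idf, doclen, mean_doclen; κ=2, β=0.75)
    tf * (κ + 1) / (tf + κ * (1 - β + β * doclen / mean_doclen)) * idf
end

# Saturation in action: extra occurrences add less and less to the score.
bm25_score(1, 1.0, 10, 10)   # = 1.0
bm25_score(10, 1.0, 10, 10)  # = 2.5, far less than 10 times the score above
```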
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index e8bc232..48c4502 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -1,46 +1,111 @@
 """
-    TfidfTransformer()
-
-The following is taken largely from scikit-learn's documentation:
-https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/feature_extraction/text.py
-
-Convert a collection of raw documents to a matrix of TF-IDF features.
-
-"TF" means term-frequency while "TF-IDF" means term-frequency times
-inverse document-frequency. This is a common term weighting scheme in
-information retrieval, that has also found good use in document
-classification.
-
-The goal of using TF-IDF instead of the raw frequencies of occurrence
-of a token in a given document is to scale down the impact of tokens
-that occur very frequently in a given corpus and that are hence
-empirically less informative than features that occur in a small
-fraction of the training corpus.
-
-The formula that is used to compute the TF-IDF for a term `t` of a
-document `d` in a document set is `tf_idf(t, d) = tf(t, d) *
-idf(t)`. Assuming `smooth_idf=false`, `idf(t) = log [ n / df(t) ] + 1`
-where `n` is the total number of documents in the document set and
-`df(t)` is the document frequency of `t`. The document frequency is
-the number of documents in the document set that contain the term
-`t`. The effect of adding “1” to the idf in the equation above is that
-terms with zero idf, i.e., terms that occur in all documents in a
-training set, will not be entirely ignored. (Note that the idf formula
-above differs from that appearing in standard texts,
-`idf(t) = log [ n / (df(t) + 1) ]`.)
-
-If `smooth_idf=true` (the default), the constant “1” is added to the
-numerator and denominator of the idf as if an extra document was seen
-containing every term in the collection exactly once, which prevents
-zero divisions: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are in at least 1% of
-documents will be included.
+$(MMI.doc_header(TfidfTransformer))
+
+
+`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
+"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
+document-frequency. This is a common term weighting scheme in information retrieval that
+has also found good use in document classification. The goal of using TF-IDF instead of
+the raw frequencies of occurrence of a token in a given document is to scale down the
+impact of tokens that occur very frequently in a given corpus and that are hence
+empirically less informative than features that occur in a small fraction of the training
+corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
+document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
+
+
+# Training data
+
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
+  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
+  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
+  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
+  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
+  These `1`'s have the same effect as adding an extra document which contains every term
+  in the collection exactly once, preventing division by 0.
+
+
+# Operations
+
+
+- `transform(mach, Xnew)`: Return a transformed matrix of scitype
+  `Continuous` given new features `Xnew`.
+
+
+# Fitted parameters
+
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+- `idf_vector`: The transformer's calculated IDF vector.
+
+
+# Examples
+
+
+`TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+tfidf_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+tfidf_transformer = TfidfTransformer()
+mach = machine(tfidf_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+tfidf_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
+
 """
 mutable struct TfidfTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
     smooth_idf::Bool
 end
 
@@ -48,7 +113,7 @@ mutable struct TfidfTransformer <: AbstractTextTransformer
-function TfidfTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0, smooth_idf::Bool = true) 
+function TfidfTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0, smooth_idf::Bool = true)
     transformer = TfidfTransformer(max_doc_freq, min_doc_freq, smooth_idf)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -60,7 +125,7 @@ struct TfidfTransformerResult
     vocab::Vector{String}
     idf_vector::Vector{Float64}
 end
 
-get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) = 
+get_result(::TfidfTransformer, idf::Vector{<:AbstractFloat}, vocab::Vector{String}, ::SparseMatrixCSC) =
     TfidfTransformerResult(vocab, idf)
 
 function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
@@ -87,14 +152,14 @@ function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
     return tfidf
 end
 
-function _transform(::TfidfTransformer, 
+function _transform(::TfidfTransformer,
                     result::TfidfTransformerResult,
                     v::Corpus)
     doc_terms = build_dtm(v, result.vocab)
     tfidf = similar(doc_terms.dtm, eltype(result.idf_vector))
     build_tfidf!(doc_terms.dtm, tfidf, result.idf_vector)
 
-    # here we return the `adjoint` of our sparse matrix to conform to 
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(tfidf)
 end
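Patch 2 below adds an analogous docstring to `CountTransformer`. As background for that
docstring's bag-of-words discussion, here is a minimal sketch of a count matrix built by
hand in plain Julia (a dense comprehension with illustrative names; the package itself
builds a sparse matrix):

```julia
# Bag-of-words count matrix: one row per document, one column per vocabulary
# term, each entry the raw count of that term in that document.
vocab = ["hi", "my", "name", "sam"]
docs = [["hi", "my", "name"], ["hi", "hi", "sam"]]

counts = [count(==(t), doc) for doc in docs, t in vocab]
# 2×4 Matrix{Int64}:
#  1  1  1  0
#  2  0  0  1
```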
From bde10aec834d398b6090f7fb76923e34751bef6a Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 29 Aug 2022 14:18:18 -0500
Subject: [PATCH 2/7] up

---
 src/count_transformer.jl | 117 +++++++++++++++++++++++++++++++--------
 1 file changed, 93 insertions(+), 24 deletions(-)

diff --git a/src/count_transformer.jl b/src/count_transformer.jl
index 504fa31..8c5c0b8 100644
--- a/src/count_transformer.jl
+++ b/src/count_transformer.jl
@@ -1,30 +1,99 @@
 """
-    CountTransformer()
-
-Convert a collection of raw documents to a matrix representing a bag-of-words structure from
-word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
-comprised of a count of every word in the document corpus/collection for every document.
-This is a simple but often quite powerful way of representing documents as vectors. The
-resulting representation is a matrix with rows representing every document in the corpus
-and columns representing every word in the corpus. The value for each cell is the raw count
-of a particular word in a particular document.
-
-Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
-to words occurring in a maximum or minimum portion of documents.
-The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
-that the transformer will consider. `max_doc_freq` indicates that terms in only
-up to the specified percentage of documents will be considered. For example, if
-`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
-will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
-other direction. A value of 0.01 means that only terms that are in at least 1% of
-documents will be included.
+$(MMI.doc_header(CountTransformer))
+
+`CountTransformer`: Convert a collection of raw documents to a matrix representing a
+bag-of-words structure from word counts. Essentially, a bag-of-words approach to
+representing documents in a matrix is comprised of a count of every word in the document
+corpus/collection for every document. This is a simple but often quite powerful way of
+representing documents as vectors. The resulting representation is a matrix with rows
+representing every document in the corpus and columns representing every word in the
+corpus. The value for each cell is the raw count of a particular word in a particular
+document.
+
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+Where
+
+- `X`: is any vector of documents whose items are of scitype
+  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
+  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
+  `schema(X)`
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+
+# Operations
+
+- `transform(mach, Xnew)`: Return a transformed matrix of type
+  `ScientificTypesBase.Continuous` given new features `Xnew`.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+# Examples
+
+`CountTransformer` accepts a variety of inputs.
+In the example below, we use simple
+tokenized documents:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+count_transformer = CountTransformer()
+mach = machine(count_transformer, tokenize.(docs))
+MLJ.fit!(mach)
+
+fitted_params(mach)
+
+count_mat = transform(mach, tokenize.(docs))
+```
+
+We can also use the `TextAnalysis` package to implement functionality similar to
+scikit-learn's N-grams:
+
+```julia
+using MLJ, MLJText, TextAnalysis
+
+docs = ["Hi my name is Sam.", "How are you today?"]
+corpus = Corpus(NGramDocument.(docs, 1, 2))
+ngram_docs = ngrams.(corpus)
+
+count_transformer = CountTransformer()
+mach = machine(count_transformer, ngram_docs)
+MLJ.fit!(mach)
+fitted_params(mach)
+
+count_mat = transform(mach, ngram_docs)
+```
+
+See also
+[`GaussianNBClassifier`](@ref)
 """
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 end
 
-function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0) 
+function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
     transformer = CountTransformer(max_doc_freq, min_doc_freq)
     message = MMI.clean!(transformer)
     isempty(message) || @warn message
@@ -37,7 +106,7 @@ end
 function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     # process corpus vocab
-    update_lexicon!(X) 
+    update_lexicon!(X)
 
     # calculate min and max doc freq limits
     if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
@@ -58,12 +127,12 @@ function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
     return fitresult, cache, NamedTuple()
 end
 
-function _transform(::CountTransformer, 
+function _transform(::CountTransformer,
                     result::CountTransformerResult,
                     v::Corpus)
     dtm_matrix = build_dtm(v, result.vocab)
 
-    # here we return the `adjoint` of our sparse matrix to conform to 
+    # here we return the `adjoint` of our sparse matrix to conform to
     # the `n x p` dimensions throughout MLJ
     return adjoint(dtm_matrix.dtm)
 end
@@ -94,4 +163,4 @@ MMI.metadata_model(CountTransformer,
     output_scitype = AbstractMatrix{STB.Continuous},
     docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
     path = "MLJText.CountTransformer"
-    )
\ No newline at end of file
+    )
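A detail worth noting in the code above: `_transform` returns `adjoint(dtm_matrix.dtm)`.
The sketch below illustrates why, under the assumption (stated in the code comment) that
the internal document-term matrix is stored terms × documents:

```julia
# The internal sparse matrix is terms × documents; MLJ expects documents (the
# observations) as rows. A lazy adjoint gives the n × p orientation for free,
# without copying the sparse data.
using SparseArrays, LinearAlgebra

dtm = sparse([1 0 2; 0 1 1])  # 2 terms × 3 documents (toy values)
X = adjoint(dtm)              # 3 × 2 view: documents as rows, terms as columns
size(X)                       # (3, 2)
```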
From a1699d5b6fb9693eae45f829babff9dedee8176c Mon Sep 17 00:00:00 2001
From: josephsdavid
Date: Mon, 29 Aug 2022 14:21:29 -0500
Subject: [PATCH 3/7] doc export complete

---
 src/bm25_transformer.jl  | 18 +++---------------
 src/count_transformer.jl |  4 +---
 src/tfidf_transformer.jl | 16 ++--------------
 3 files changed, 6 insertions(+), 32 deletions(-)

diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 06be25f..1efb371 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -10,14 +10,11 @@ upon TF-IDF by incorporating probability: in particular, the probability that a
 consider a search result relevant based on the terms in the search query and those in each
 document.
 
-
-
 # Training data
 
-
 In MLJ or MLJBase, bind an instance `model` to data with
 
-mach = machine(model, X)
+    mach = machine(model, X)
 
 Where
 
@@ -28,10 +25,8 @@ Where
 
 Train the machine using `fit!(mach, rows=...)`.
 
-
 # Hyper-parameters
 
-
 - `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
   Terms that occur in `> max_doc_freq` documents will not be considered by the
   transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
@@ -54,27 +49,21 @@ Train the machine using `fit!(mach, rows=...)`.
   These `1`'s have the same effect as adding an extra document which contains every term
   in the collection exactly once, preventing division by 0.
 
-
 # Operations
 
-
 - `transform(mach, Xnew)`: Return a transformed matrix of type
   `ScientificTypesBase.Continuous` given new features `Xnew`.
 
-
 # Fitted parameters
 
-
 The fields of `fitted_params(mach)` are:
 
 - `vocab`: A vector containing the strings used in the transformer's vocabulary.
 - `idf_vector`: The transformer's calculated IDF vector.
 - `mean_words_in_docs`: The mean number of words in each document.
 
-
 # Examples
 
-
 `BM25Transformer` accepts a variety of inputs. In the example below, we use simple
 tokenized documents:
 
@@ -106,12 +95,11 @@ mach = machine(bm25_transformer, ngram_docs)
 MLJ.fit!(mach)
 fitted_params(mach)
 
-tfidf_mat = transform(mach, ngram_docs)
+bm25_mat = transform(mach, ngram_docs)
 ```
 
 See also
-[`GaussianNBClassifier`](@ref)
-
+[`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
 """
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64

diff --git a/src/count_transformer.jl b/src/count_transformer.jl
index 8c5c0b8..7ccde1d 100644
--- a/src/count_transformer.jl
+++ b/src/count_transformer.jl
@@ -27,7 +27,6 @@ Train the machine using `fit!(mach, rows=...)`.
 
 # Hyper-parameters
 
-
 - `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
   Terms that occur in `> max_doc_freq` documents will not be considered by the
   transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
@@ -37,7 +36,6 @@ Train the machine using `fit!(mach, rows=...)`.
   transformer. A value of 0.01 means that only terms that are in at least 1% of the
   documents will be included.
 
-
 # Operations
 
 - `transform(mach, Xnew)`: Return a transformed matrix of type
@@ -86,7 +84,7 @@ count_mat = transform(mach, ngram_docs)
 ```
 
 See also
-[`GaussianNBClassifier`](@ref)
+[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
 """
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64

diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index 48c4502..49b600f 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -1,7 +1,6 @@
 """
 $(MMI.doc_header(TfidfTransformer))
 
-
 `TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
 "TF" means term-frequency while "TF-IDF" means term-frequency times inverse
 document-frequency. This is a common term weighting scheme in information retrieval that
@@ -12,13 +11,11 @@ informative than features that occur in a small fraction of the training corpus
 document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
 
-
 # Training data
 
-
 In MLJ or MLJBase, bind an instance `model` to data with
 
-mach = machine(model, X)
+    mach = machine(model, X)
 
 Where
 
@@ -29,10 +26,8 @@ Where
 
 Train the machine using `fit!(mach, rows=...)`.
 
-
 # Hyper-parameters
 
-
 - `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
   Terms that occur in `> max_doc_freq` documents will not be considered by the
   transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
@@ -49,26 +44,20 @@ Train the machine using `fit!(mach, rows=...)`.
   These `1`'s have the same effect as adding an extra document which contains every term
   in the collection exactly once, preventing division by 0.
 
-
 # Operations
 
-
 - `transform(mach, Xnew)`: Return a transformed matrix of scitype
   `Continuous` given new features `Xnew`.
 
-
 # Fitted parameters
 
-
 The fields of `fitted_params(mach)` are:
 
 - `vocab`: A vector containing the strings used in the transformer's vocabulary.
 - `idf_vector`: The transformer's calculated IDF vector.
 
-
 # Examples
 
-
 `TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
 tokenized documents:
 
@@ -104,8 +93,7 @@ tfidf_mat = transform(mach, ngram_docs)
 ```
 
 See also
-[`GaussianNBClassifier`](@ref)
-
+[`CountTransformer`](@ref), [`BM25Transformer`](@ref)
 """
 mutable struct TfidfTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
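Before the docstring refactor in the next two patches, note the numerical difference
between the two IDF definitions these docstrings keep restating. A minimal sketch,
assuming only the two equations quoted above:

```julia
# Unsmoothed vs. smoothed IDF, per the equations in the docstrings.
idf_unsmoothed(n, df) = log(n / df) + 1
idf_smoothed(n, df)   = log((1 + n) / (1 + df)) + 1

idf_unsmoothed(10, 10)  # = 1.0: a ubiquitous term keeps a nonzero weight
idf_smoothed(10, 10)    # = 1.0: same here
idf_smoothed(10, 0)     # = log(11) + 1 ≈ 3.4: finite even for an unseen term,
                        # whereas n / df(t) would divide by zero
```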
From 2d5f1a70ff90e34d5140b36bfe65ef756ee49743 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Mon, 5 Sep 2022 10:27:38 +1200
Subject: [PATCH 4/7] remove duplicate docstring

---
 src/MLJText.jl | 111 ------------------------------------------------
 1 file changed, 111 deletions(-)

diff --git a/src/MLJText.jl b/src/MLJText.jl
index 5f7d4dc..e54bf1a 100644
--- a/src/MLJText.jl
+++ b/src/MLJText.jl
@@ -26,115 +26,4 @@ include("bm25_transformer.jl")
 export TfidfTransformer, BM25Transformer, CountTransformer
 
-"""
-$(MMI.doc_header(TfidfTransformer))
-
-
-`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
-"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
-document-frequency. This is a common term weighting scheme in information retrieval that
-has also found good use in document classification. The goal of using TF-IDF instead of
-the raw frequencies of occurrence of a token in a given document is to scale down the
-impact of tokens that occur very frequently in a given corpus and that are hence
-empirically less informative than features that occur in a small fraction of the training
-corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
-document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
-
-
-# Training data
-
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-
-# Hyper-parameters
-
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
-  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
-  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
-  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
-  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-  These `1`'s have the same effect as adding an extra document which contains every term
-  in the collection exactly once, preventing division by 0.
-
-
-# Operations
-
-
-- `transform(mach, Xnew)`: Return a transformed matrix of scitype
-  `Continuous` given new features `Xnew`.
-
-
-# Fitted parameters
-
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-- `idf_vector`: The transformer's calculated IDF vector.
-
-
-# Examples
-
-
-`TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-tfidf_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-tfidf_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`GaussianNBClassifier`](@ref)
-
-"""
-
 end # module

From 84d75842588cfacfb9f81579cedaa99a644f7236 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Mon, 5 Sep 2022 15:31:48 +1200
Subject: [PATCH 5/7] move docstrings and refactor to reduce duplication

---
 src/MLJText.jl           |   2 +
 src/bm25_transformer.jl  | 180 +++++++++++++++++----------------------
 src/count_transformer.jl | 138 +++++++++++-------------------
 src/docstring_helpers.jl |  98 +++++++++++++++++++++
 src/tfidf_transformer.jl | 168 +++++++++++++++---------------------
 5 files changed, 295 insertions(+), 291 deletions(-)
 create mode 100644 src/docstring_helpers.jl

diff --git a/src/MLJText.jl b/src/MLJText.jl
index e54bf1a..c52a6f5 100644
--- a/src/MLJText.jl
+++ b/src/MLJText.jl
@@ -17,6 +17,7 @@ const PKG = "MLJText" # substitute model-providing package name
 const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
 const NGram{N} = NTuple{<:Any,<:AbstractString}
 
+include("docstring_helpers.jl")
 include("scitypes.jl")
 include("utils.jl")
 include("abstract_text_transformer.jl")
@@ -26,4 +27,5 @@ include("bm25_transformer.jl")
 
 export TfidfTransformer, BM25Transformer, CountTransformer
 
+
 end # module

diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl
index 1efb371..dd1a9e6 100644
--- a/src/bm25_transformer.jl
+++ b/src/bm25_transformer.jl
@@ -1,106 +1,3 @@
-"""
-$(MMI.doc_header(BM25Transformer))
-
-`BM25Transformer`: Convert a collection of raw documents to a matrix using the Okapi BM25
-document-word statistic. BM25 is an approach similar to that of TF-IDF in terms of
-representing documents in a vector space. The BM25 scoring function uses both term
-frequency (TF) and inverse document frequency (IDF) so that, for each term in a document,
-its relative concentration in the document is scored (like TF-IDF). However, BM25 improves
-upon TF-IDF by incorporating probability: in particular, the probability that a user will
-consider a search result relevant based on the terms in the search query and those in each
-document.
-
-# Training data
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-    mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-# Hyper-parameters
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
-  saturation. What we mean by saturation is the degree to which a term occurring extra
-  times adds to the overall score.
-- `β=0.75`: Amplifies the particular document length compared to the average length. The
-  bigger β is, the more document length is amplified in terms of the overall score. The
-  default value is 0.75, and the bounds are restricted between 0 and 1.
-- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
-  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
-  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
-  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
-  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-  These `1`'s have the same effect as adding an extra document which contains every term
-  in the collection exactly once, preventing division by 0.
-
-# Operations
-
-- `transform(mach, Xnew)`: Return a transformed matrix of type
-  `ScientificTypesBase.Continuous` given new features `Xnew`.
-
-# Fitted parameters
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-- `idf_vector`: The transformer's calculated IDF vector.
-- `mean_words_in_docs`: The mean number of words in each document.
-
-# Examples
-
-`BM25Transformer` accepts a variety of inputs.
-In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-bm25_transformer = BM25Transformer()
-mach = machine(bm25_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-bm25_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-bm25_transformer = BM25Transformer()
-mach = machine(bm25_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-bm25_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
-"""
 mutable struct BM25Transformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 
+# # DOC STRING
+
+"""
+$(MMI.doc_header(BM25Transformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of [Okapi BM25 document-word
+statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
+term frequency (TF) and inverse document frequency (IDF, defined below), as in
+[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
+consider a search result relevant based on the terms in the search query and those in
+each document.
+
+$DOC_IDF
+
+References:
+
+- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
+- https://en.wikipedia.org/wiki/Okapi_BM25
+- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
+  saturation. What we mean by saturation is the degree to which a term occurring extra
+  times adds to the overall score.
+
+- `β=0.75`: Amplifies the particular document length compared to the average length. The
+  bigger β is, the more document length is amplified in terms of the overall score. The
+  default value is 0.75, and the bounds are restricted between 0 and 1.
+
+- `smooth_idf=true`: Control which definition of IDF to use (see above).
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary, IDF, and mean word counts learned in
+  training, return the matrix of BM25 scores for `Xnew`, a vector of the same form as `X`
+  above. The matrix has size `(n, p)`, where `n = length(Xnew)` and `p` the size of the
+  vocabulary.
+  Tokens/ngrams not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+- `idf_vector`: The transformer's calculated IDF vector.
+
+- `mean_words_in_docs`: The mean number of words in each document.
+
+$(doc_examples(:BM25Transformer))
+
+See also [`TfidfTransformer`](@ref), [`CountTransformer`](@ref)
+
+"""
+BM25Transformer

diff --git a/src/count_transformer.jl b/src/count_transformer.jl
index 7ccde1d..b701e6f 100644
--- a/src/count_transformer.jl
+++ b/src/count_transformer.jl
@@ -1,91 +1,3 @@
-"""
-$(MMI.doc_header(CountTransformer))
-
-`CountTransformer`: Convert a collection of raw documents to a matrix representing a
-bag-of-words structure from word counts. Essentially, a bag-of-words approach to
-representing documents in a matrix is comprised of a count of every word in the document
-corpus/collection for every document. This is a simple but often quite powerful way of
-representing documents as vectors. The resulting representation is a matrix with rows
-representing every document in the corpus and columns representing every word in the
-corpus. The value for each cell is the raw count of a particular word in a particular
-document.
-
-
-# Training data
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-    mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-# Hyper-parameters
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-
-# Operations
-
-- `transform(mach, Xnew)`: Return a transformed matrix of type
-  `ScientificTypesBase.Continuous` given new features `Xnew`.
-
-# Fitted parameters
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-
-# Examples
-
-`CountTransformer` accepts a variety of inputs.
-In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-count_transformer = CountTransformer()
-mach = machine(count_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-count_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-count_transformer = CountTransformer()
-mach = machine(count_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-count_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
-"""
 mutable struct CountTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
 
+# # DOCUMENT STRING
+
+"""
+$(MMI.doc_header(CountTransformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of term counts.
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `> max_doc_freq` documents will not be considered by the
+  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
+  90% of the documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
+  Terms that occur in `< min_doc_freq` documents will not be considered by the
+  transformer. A value of 0.01 means that only terms that are in at least 1% of the
+  documents will be included.
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary learned in training, return the matrix
+  of counts for `Xnew`, a vector of the same form as `X` above. The matrix has size `(n,
+  p)`, where `n = length(Xnew)` and `p` the size of the vocabulary. Tokens/ngrams not
+  appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+$(doc_examples(:CountTransformer))
+
+See also
+[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref)
+"""
+CountTransformer

diff --git a/src/docstring_helpers.jl b/src/docstring_helpers.jl
new file mode 100644
index 0000000..ff8b024
--- /dev/null
+++ b/src/docstring_helpers.jl
@@ -0,0 +1,98 @@
+const DOC_IDF =
+    """
+    In textbooks and implementations there is variation in the definition of IDF. Here two
+    IDF definitions are available. The default, smoothed option provides the IDF for a
+    term `t` as `log((1 + n)/(1 + df(t))) + 1`, where `n` is the total number of documents
+    and `df(t)` the number of documents in which `t` appears. Setting `smooth_idf = false`
+    provides an IDF of `log(n/df(t)) + 1`.
+
+    """
+
+const DOC_TRANSFORMER_INPUTS =
+    """
+    Here:
+
+    - `X` is any vector whose elements are either tokenized documents or bags of
+      words/ngrams.
+      Specifically, each element is one of the following:
+
+      - A vector of abstract strings (tokens), e.g., `["I", "like", "Sam", ".", "Sam",
+        "is", "nice", "."]` (scitype `AbstractVector{Textual}`)
+
+      - A dictionary of counts, indexed on abstract strings, e.g., `Dict("I"=>1, "Sam"=>2,
+        "Sam is"=>1)` (scitype `Multiset{Textual}`)
+
+      - A dictionary of counts, indexed on plain ngrams, e.g., `Dict(("I",)=>1,
+        ("Sam",)=>2, ("I", "Sam")=>1)` (scitype `Multiset{<:NTuple{N,Textual} where N}`);
+        here a *plain ngram* is a tuple of abstract strings.
+
+    """
+
+function doc_examples(T)
+    t = begin
+        T == :TfidfTransformer ? "tfidf_transformer" :
+        T == :BM25Transformer ? "bm25_transformer" :
+        T == :CountTransformer ? "count_transformer" :
+        error("Problem generating a document string for $T.")
+    end
+
+    """
+
+    # Examples
+
+    `$T` accepts a variety of inputs. The example below transforms tokenized documents:
+
+    ```julia
+    using MLJ
+    import TextAnalysis
+
+    $T = @load $T pkg=MLJText
+
+    docs = ["Hi my name is Sam.", "How are you today?"]
+    $t = $T()
+
+    julia> tokenized_docs = TextAnalysis.tokenize.(docs)
+    2-element Vector{Vector{String}}:
+     ["Hi", "my", "name", "is", "Sam", "."]
+     ["How", "are", "you", "today", "?"]
+
+    mach = machine($t, tokenized_docs)
+    fit!(mach)
+
+    fitted_params(mach)
+
+    tfidf_mat = transform(mach, tokenized_docs)
+    ```
+
+    Alternatively, one can provide documents pre-parsed as ngrams counts:
+
+    ```julia
+    using MLJ
+    import TextAnalysis
+
+    docs = ["Hi my name is Sam.", "How are you today?"]
+    corpus = TextAnalysis.Corpus(TextAnalysis.NGramDocument.(docs, 1, 2))
+    ngram_docs = TextAnalysis.ngrams.(corpus)
+
+    julia> ngram_docs[1]
+    Dict{AbstractString, Int64} with 11 entries:
+      "is"      => 1
+      "my"      => 1
+      "name"    => 1
+      "."       => 1
+      "Hi"      => 1
+      "Sam"     => 1
+      "my name" => 1
+      "Hi my"   => 1
+      "name is" => 1
+      "Sam ."   => 1
+      "is Sam"  => 1
+
+    $t = $T()
+    mach = machine($t, ngram_docs)
+    MLJ.fit!(mach)
+    fitted_params(mach)
+
+    tfidf_mat = transform(mach, ngram_docs)
+    ```
+    """
+end
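The helper file above relies on ordinary `$`-interpolation to splice shared fragments into
docstrings. For readers unfamiliar with the pattern, here is a minimal self-contained
sketch (toy names, unrelated to MLJText):

```julia
# A docstring is just a string literal, so shared fragments can be stored in
# constants and interpolated at definition time.
const DOC_SHARED_ARG =
    """
    - `x`: the input value
    """

"""
    double(x)

Double the input.

# Arguments

$DOC_SHARED_ARG
"""
double(x) = 2x
```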
diff --git a/src/tfidf_transformer.jl b/src/tfidf_transformer.jl
index 49b600f..2bfeaa4 100644
--- a/src/tfidf_transformer.jl
+++ b/src/tfidf_transformer.jl
@@ -1,100 +1,3 @@
-"""
-$(MMI.doc_header(TfidfTransformer))
-
-`TfidfTransformer`: Convert a collection of raw documents to a matrix of TF-IDF features.
-"TF" means term-frequency while "TF-IDF" means term-frequency times inverse
-document-frequency. This is a common term weighting scheme in information retrieval that
-has also found good use in document classification. The goal of using TF-IDF instead of
-the raw frequencies of occurrence of a token in a given document is to scale down the
-impact of tokens that occur very frequently in a given corpus and that are hence
-empirically less informative than features that occur in a small fraction of the training
-corpus. The formula used to compute the TF-IDF for a term `t` of a document `d` in a
-document set is `tf_idf(t, d) = tf(t, d) * idf(t)`.
-
-# Training data
-
-In MLJ or MLJBase, bind an instance `model` to data with
-
-    mach = machine(model, X)
-
-Where
-
-- `X`: is any vector of documents whose items are of scitype
-  `ScientificTypesBase.Textual`, `ScientificTypesBase.Multiset{<:ScientificNGram}`, or
-  `ScientificTypesBase.Multiset{ScientificTypesBase.Textual}`; check the scitype with
-  `schema(X)`
-
-Train the machine using `fit!(mach, rows=...)`.
-
-# Hyper-parameters
-
-- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `> max_doc_freq` documents will not be considered by the
-  transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
-  90% of the documents will be removed.
-- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
-  Terms that occur in `< min_doc_freq` documents will not be considered by the
-  transformer. A value of 0.01 means that only terms that are in at least 1% of the
-  documents will be included.
-- `smooth_idf=true`: Assuming `smooth_idf` is false, IDF is calculated using the equation
-  `idf(t) = log [ n / df(t) ] + 1`, with term `t`, `n` documents, and document frequency
-  `df(t)`. The `1` term outside of the logarithm has the effect that terms with zero idf
-  (i.e. they occur in all documents) will not be entirely ignored. If `smooth_idf` is
-  true, another `1` term is added, giving: `idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`.
-  These `1`'s have the same effect as adding an extra document which contains every term
-  in the collection exactly once, preventing division by 0.
-
-# Operations
-
-- `transform(mach, Xnew)`: Return a transformed matrix of scitype
-  `Continuous` given new features `Xnew`.
-
-# Fitted parameters
-
-The fields of `fitted_params(mach)` are:
-
-- `vocab`: A vector containing the strings used in the transformer's vocabulary.
-- `idf_vector`: The transformer's calculated IDF vector.
-
-# Examples
-
-`TfidfTransformer` accepts a variety of inputs. In the example below, we use simple
-tokenized documents:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, tokenize.(docs))
-MLJ.fit!(mach)
-
-fitted_params(mach)
-
-tfidf_mat = transform(mach, tokenize.(docs))
-```
-
-We can also use the `TextAnalysis` package to implement functionality similar to
-scikit-learn's N-grams:
-
-```julia
-using MLJ, MLJText, TextAnalysis
-
-docs = ["Hi my name is Sam.", "How are you today?"]
-corpus = Corpus(NGramDocument.(docs, 1, 2))
-ngram_docs = ngrams.(corpus)
-
-tfidf_transformer = TfidfTransformer()
-mach = machine(tfidf_transformer, ngram_docs)
-MLJ.fit!(mach)
-fitted_params(mach)
-
-tfidf_mat = transform(mach, ngram_docs)
-```
-
-See also
-[`CountTransformer`](@ref), [`BM25Transformer`](@ref)
-"""
 mutable struct TfidfTransformer <: AbstractTextTransformer
     max_doc_freq::Float64
     min_doc_freq::Float64
@@ -139,7 +42,6 @@ function build_tfidf!(doc_term_mat::SparseMatrixCSC{T},
     return tfidf
 end
 
-
 function _transform(::TfidfTransformer,
                     result::TfidfTransformerResult,
                     v::Corpus)
@@ -178,6 +80,72 @@ MMI.metadata_model(TfidfTransformer,
         AbstractVector{<:STB.Multiset{STB.Textual}}
     },
     output_scitype = AbstractMatrix{STB.Continuous},
-    docstring = "Build TF-IDF matrix from raw documents",
-    path = "MLJText.TfidfTransformer"
+    human_name = "TF-IDF transformer",
+    path = "MLJText.TfidfTransformer",
     )
 
+
+# # DOCUMENT STRING
+
+"""
+$(MMI.doc_header(TfidfTransformer))
+
+The transformer converts a collection of documents, tokenized or pre-parsed as bags of
+words/ngrams, to a matrix of [TF-IDF
+scores](https://en.wikipedia.org/wiki/Tf–idf#Inverse_document_frequency_2). Here "TF"
+means term-frequency while "IDF" means inverse document frequency (defined below). The
+TF-IDF score is the product of the two. This is a common term weighting scheme in
+information retrieval that has also found good use in document classification.
+The goal of
+using TF-IDF instead of the raw frequencies of occurrence of a token in a given document
+is to scale down the impact of tokens that occur very frequently in a given corpus and
+that are hence empirically less informative than features that occur in a small fraction
+of the training corpus.
+
+$DOC_IDF
+
+# Training data
+
+In MLJ or MLJBase, bind an instance `model` to data with
+
+    mach = machine(model, X)
+
+$DOC_TRANSFORMER_INPUTS
+
+Train the machine using `fit!(mach, rows=...)`.
+
+# Hyper-parameters
+
+- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `> max_doc_freq` documents will not be considered by the transformer. For
+  example, if `max_doc_freq` is set to 0.9, terms that are in more than 90% of the
+  documents will be removed.
+
+- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider. Terms
+  that occur in `< min_doc_freq` documents will not be considered by the transformer. A
+  value of 0.01 means that only terms that are in at least 1% of the documents will be
+  included.
+
+- `smooth_idf=true`: Control which definition of IDF to use (see above).
+
+# Operations
+
+- `transform(mach, Xnew)`: Based on the vocabulary and IDF learned in training, return the
+  matrix of TF-IDF scores for `Xnew`, a vector of the same form as `X` above. The matrix
+  has size `(n, p)`, where `n = length(Xnew)` and `p` the size of the
+  vocabulary. Tokens/ngrams not appearing in the learned vocabulary are scored zero.
+
+# Fitted parameters
+
+The fields of `fitted_params(mach)` are:
+
+- `vocab`: A vector containing the strings used in the transformer's vocabulary.
+
+- `idf_vector`: The transformer's calculated IDF vector.
+
+$(doc_examples(:TfidfTransformer))
+
+See also [`CountTransformer`](@ref), [`BM25Transformer`](@ref)
+
+"""
+TfidfTransformer

From a086384b0713695ce964ea0ccc1f2e1ac5241595 Mon Sep 17 00:00:00 2001
From: "Anthony D. Blaom"
Date: Thu, 8 Sep 2022 09:53:29 +1200
Subject: [PATCH 6/7] bump compat MLJModelInterface = "1.4"

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index ca7cd5c..9b23ca9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,7 +14,7 @@ TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
 
 [compat]
 CorpusLoaders = "0.3"
-MLJModelInterface = "1.3"
+MLJModelInterface = "1.4"
 ScientificTypes = "2.2.2, 3"
 ScientificTypesBase = "2.2.0, 3"
 TextAnalysis = "0.7.3"

From de2281eee20d3bc001cf2160bf230188ab7c3861 Mon Sep 17 00:00:00 2001
From: "Anthony D.
Blaom" Date: Thu, 8 Sep 2022 09:54:59 +1200 Subject: [PATCH 7/7] remove deprecated use of `docstring` in metadata_model declaration --- src/bm25_transformer.jl | 1 - src/count_transformer.jl | 1 - 2 files changed, 2 deletions(-) diff --git a/src/bm25_transformer.jl b/src/bm25_transformer.jl index dd1a9e6..0b2013f 100644 --- a/src/bm25_transformer.jl +++ b/src/bm25_transformer.jl @@ -107,7 +107,6 @@ MMI.metadata_model(BM25Transformer, AbstractVector{<:STB.Multiset{STB.Textual}} }, output_scitype = AbstractMatrix{STB.Continuous}, - docstring = "Build BM-25 matrix from raw documents", path = "MLJText.BM25Transformer" ) diff --git a/src/count_transformer.jl b/src/count_transformer.jl index b701e6f..f99e9c6 100644 --- a/src/count_transformer.jl +++ b/src/count_transformer.jl @@ -71,7 +71,6 @@ MMI.metadata_model(CountTransformer, AbstractVector{<:STB.Multiset{STB.Textual}} }, output_scitype = AbstractMatrix{STB.Continuous}, - docstring = "Build Bag-of-Words matrix from word counts for corpus of documents", path = "MLJText.CountTransformer" )
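The `max_doc_freq`/`min_doc_freq` vocabulary restriction described throughout these
docstrings amounts to pruning terms by relative document frequency. As a closing
illustration, here is a minimal sketch in plain Julia (illustrative names, not the
package's internals):

```julia
# Keep only terms whose relative document frequency lies in
# [min_doc_freq, max_doc_freq].
function prune_vocab(docs; min_doc_freq=0.0, max_doc_freq=1.0)
    n = length(docs)
    vocab = unique(reduce(vcat, docs))
    keep(t) = min_doc_freq <= count(d -> t in d, docs) / n <= max_doc_freq
    filter(keep, vocab)
end

docs = [["a", "b"], ["a", "c"], ["a", "b", "d"]]
prune_vocab(docs; max_doc_freq=0.9)  # drops "a", which appears in every document
```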