Commit b42db3f

Merge pull request #23 from JuliaAI/josephsdavid-dev

Add MLJ compliant doc-strings - take II

ablaom authored Oct 4, 2022
2 parents c37e01c + de2281e commit b42db3f
Showing 6 changed files with 309 additions and 119 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -14,7 +14,7 @@ TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"

[compat]
CorpusLoaders = "0.3"
MLJModelInterface = "1.3"
MLJModelInterface = "1.4"
ScientificTypes = "2.2.2, 3"
ScientificTypesBase = "2.2.0, 3"
TextAnalysis = "0.7.3"
2 changes: 2 additions & 0 deletions src/MLJText.jl
@@ -17,6 +17,7 @@ const PKG = "MLJText" # substitute model-providing package name
const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
const NGram{N} = NTuple{<:Any,<:AbstractString}

include("docstring_helpers.jl")
include("scitypes.jl")
include("utils.jl")
include("abstract_text_transformer.jl")
@@ -26,4 +27,5 @@ include("bm25_transformer.jl")

export TfidfTransformer, BM25Transformer, CountTransformer


end # module
121 changes: 81 additions & 40 deletions src/bm25_transformer.jl
@@ -1,38 +1,3 @@
"""
BM25Transformer()
Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
(IDF) so that, for each term in a document, its relative concentration in the document is
scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability, in particular
the probability that a user will consider a search result relevant based on the terms in the search query
and those in each document.
The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
`TfidfTransformer`. BM25 introduces two additional parameters:
`κ` is the term frequency saturation characteristic. Higher values represent slower saturation. What
we mean by saturation is the degree to which a term occurring extra times adds to the overall score. This defaults
to 2.
`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
For more explanations, please see:
- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
- https://en.wikipedia.org/wiki/Okapi_BM25
- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
that the transformer will consider. `max_doc_freq` indicates that terms in only
up to the specified percentage of documents will be considered. For example, if
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
other direction. A value of 0.01 means that only terms that are at least in 1% of
documents will be included.
"""
mutable struct BM25Transformer <: AbstractTextTransformer
max_doc_freq::Float64
min_doc_freq::Float64
@@ -41,13 +6,13 @@ mutable struct BM25Transformer <: AbstractTextTransformer
smooth_idf::Bool
end

function BM25Transformer(;
max_doc_freq::Float64 = 1.0,
min_doc_freq::Float64 = 0.0,
κ::Int=2,
β::Float64=0.75,
smooth_idf::Bool = true
)
transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
message = MMI.clean!(transformer)
isempty(message) || @warn message
@@ -103,14 +68,14 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
return bm25
end

function _transform(transformer::BM25Transformer,
result::BM25TransformerResult,
v::Corpus)
doc_terms = build_dtm(v, result.vocab)
bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
return adjoint(bm25)
end
@@ -142,6 +107,82 @@ MMI.metadata_model(BM25Transformer,
AbstractVector{<:STB.Multiset{STB.Textual}}
},
output_scitype = AbstractMatrix{STB.Continuous},
docstring = "Build BM-25 matrix from raw documents",
path = "MLJText.BM25Transformer"
)

# # DOC STRING

"""
$(MMI.doc_header(BM25Transformer))
The transformer converts a collection of documents, tokenized or pre-parsed as bags of
words/ngrams, to a matrix of [Okapi BM25 document-word
statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
term frequency (TF) and inverse document frequency (IDF, defined below), as in
[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
consider a search result relevant based on the terms in the search query and those in
each document.
$DOC_IDF
References:
- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
- https://en.wikipedia.org/wiki/Okapi_BM25
- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
# Training data
In MLJ or MLJBase, bind an instance `model` to data with
mach = machine(model, X)
$DOC_TRANSFORMER_INPUTS
Train the machine using `fit!(mach, rows=...)`.
# Hyper-parameters
- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
Terms that occur in `> max_doc_freq` documents will not be considered by the
transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
90% of the documents will be removed.
- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
  Terms that occur in `< min_doc_freq` documents will not be considered by the
  transformer. A value of 0.01 means that only terms that are in at least 1% of the
  documents will be included.
- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
saturation. What we mean by saturation is the degree to which a term occurring extra
times adds to the overall score.
- `β=0.75`: Amplifies the particular document length compared to the average length. The
  bigger β is, the more document length is amplified in terms of the overall score. β is
  restricted to values between 0 and 1.
- `smooth_idf=true`: Controls which definition of IDF to use (see above).
# Operations
- `transform(mach, Xnew)`: Based on the vocabulary, IDF, and mean word counts learned in
training, return the matrix of BM25 scores for `Xnew`, a vector of the same form as `X`
above. The matrix has size `(n, p)`, where `n = length(Xnew)` and `p` the size of the
vocabulary. Tokens/ngrams not appearing in the learned vocabulary are scored zero.
# Fitted parameters
The fields of `fitted_params(mach)` are:
- `vocab`: A vector containing the strings used in the transformer's vocabulary.
- `idf_vector`: The transformer's calculated IDF vector.
- `mean_words_in_docs`: The mean number of words in each document.
$(doc_examples(:BM25Transformer))
See also [`TfidfTransformer`](@ref), [`CountTransformer`](@ref).
"""
BM25Transformer
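For reference, the `κ` and `β` mechanics described in the docstring above correspond to the
standard Okapi scoring function. Below is a minimal sketch of that textbook form, for
illustration only; it is not part of this commit, and MLJText's internal `build_bm25!` may
differ in detail (e.g. in IDF smoothing):

```julia
# Sketch of the textbook Okapi BM25 weight for one term in one document.
function bm25_score(tf, idf, doc_len, mean_doc_len; κ=2, β=0.75)
    # Larger κ ⇒ slower term-frequency saturation; β ∈ [0, 1] controls how
    # strongly longer-than-average documents are penalized.
    idf * tf * (κ + 1) / (tf + κ * (1 - β + β * doc_len / mean_doc_len))
end

bm25_score(3, 1.2, 120, 100)  # weight for a term with tf = 3 and idf = 1.2
```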
82 changes: 55 additions & 27 deletions src/count_transformer.jl
@@ -1,30 +1,9 @@
"""
CountTransformer()
Convert a collection of raw documents to a matrix representing a bag-of-words structure from
word counts. Essentially, a bag-of-words approach to representing documents in a matrix
comprises a count of every word in the document corpus/collection for every document.
This is a simple but often quite powerful way of representing documents as vectors. The
resulting representation is a matrix with rows representing every document in the corpus
and columns representing every word in the corpus. The value for each cell is the raw count
of a particular word in a particular document.
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
to words occurring in a maximum or minimum portion of documents.
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
that the transformer will consider. `max_doc_freq` indicates that terms in only
up to the specified percentage of documents will be considered. For example, if
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
other direction. A value of 0.01 means that only terms that are at least in 1% of
documents will be included.
"""
mutable struct CountTransformer <: AbstractTextTransformer
max_doc_freq::Float64
min_doc_freq::Float64
end

function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
transformer = CountTransformer(max_doc_freq, min_doc_freq)
message = MMI.clean!(transformer)
isempty(message) || @warn message
@@ -37,7 +16,7 @@ end

function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
# process corpus vocab
update_lexicon!(X)

# calculate min and max doc freq limits
if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
@@ -58,12 +37,12 @@ function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
return fitresult, cache, NamedTuple()
end

function _transform(::CountTransformer,
result::CountTransformerResult,
v::Corpus)
dtm_matrix = build_dtm(v, result.vocab)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
return adjoint(dtm_matrix.dtm)
end
@@ -92,6 +71,55 @@ MMI.metadata_model(CountTransformer,
AbstractVector{<:STB.Multiset{STB.Textual}}
},
output_scitype = AbstractMatrix{STB.Continuous},
docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
path = "MLJText.CountTransformer"
)

# # DOCUMENT STRING

"""
$(MMI.doc_header(CountTransformer))
The transformer converts a collection of documents, tokenized or pre-parsed as bags of
words/ngrams, to a matrix of term counts.
# Training data
In MLJ or MLJBase, bind an instance `model` to data with
mach = machine(model, X)
$DOC_TRANSFORMER_INPUTS
Train the machine using `fit!(mach, rows=...)`.
# Hyper-parameters
- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
Terms that occur in `> max_doc_freq` documents will not be considered by the
transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
90% of the documents will be removed.
- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
  Terms that occur in `< min_doc_freq` documents will not be considered by the
  transformer. A value of 0.01 means that only terms that are in at least 1% of the
  documents will be included.
# Operations
- `transform(mach, Xnew)`: Based on the vocabulary learned in training, return the matrix
of counts for `Xnew`, a vector of the same form as `X` above. The matrix has size `(n,
p)`, where `n = length(Xnew)` and `p` the size of the vocabulary. Tokens/ngrams not
appearing in the learned vocabulary are scored zero.
# Fitted parameters
The fields of `fitted_params(mach)` are:
- `vocab`: A vector containing the strings used in the transformer's vocabulary.
$(doc_examples(:CountTransformer))
See also
[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref).
"""
CountTransformer
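The document-frequency filtering and raw count matrix described in the docstring above can
be pictured with a small self-contained sketch (toy data and hypothetical names; this is
not MLJText's internal implementation):

```julia
# Bag-of-words counting with document-frequency filtering (illustration only).
docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "end"]]
n = length(docs)

# Document frequency: the number of documents each term appears in.
df = Dict{String,Int}()
for doc in docs, term in unique(doc)
    df[term] = get(df, term, 0) + 1
end

max_doc_freq, min_doc_freq = 0.9, 0.0
vocab = sort([t for (t, c) in df if min_doc_freq <= c / n <= max_doc_freq])
# "the" appears in 100% of documents, so max_doc_freq = 0.9 drops it.

# n × p matrix of raw counts: one row per document, one column per vocabulary term.
counts = [count(==(t), doc) for doc in docs, t in vocab]
```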
98 changes: 98 additions & 0 deletions src/docstring_helpers.jl
@@ -0,0 +1,98 @@
const DOC_IDF =
"""
In textbooks and implementations there is variation in the definition of IDF. Here two
IDF definitions are available. The default, smoothed option provides the IDF for a
term `t` as `log((1 + n)/(1 + df(t))) + 1`, where `n` is the total number of documents
and `df(t)` the number of documents in which `t` appears. Setting `smooth_idf = false`
provides an IDF of `log(n/df(t)) + 1`.
"""

const DOC_TRANSFORMER_INPUTS =
"""
Here:
- `X` is any vector whose elements are either tokenized documents or bags of
words/ngrams. Specifically, each element is one of the following:
- A vector of abstract strings (tokens), e.g., `["I", "like", "Sam", ".", "Sam",
"is", "nice", "."]` (scitype `AbstractVector{Textual}`)
- A dictionary of counts, indexed on abstract strings, e.g., `Dict("I"=>1, "Sam"=>2,
"Sam is"=>1)` (scitype `Multiset{Textual}`)
- A dictionary of counts, indexed on plain ngrams, e.g., `Dict(("I",)=>1,
("Sam",)=>2, ("I", "Sam")=>1)` (scitype `Multiset{<:NTuple{N,Textual} where N}`);
here a *plain ngram* is a tuple of abstract strings.
"""

function doc_examples(T)
t = begin
T == :TfidfTransformer ? "tfidf_transformer" :
T == :BM25Transformer ? "bm25_transformer" :
T == :CountTransformer ? "count_transformer" :
error("Problem generating a document string for $T.")
end

"""
# Examples
`$T` accepts a variety of inputs. The example below transforms tokenized documents:
```julia
using MLJ
import TextAnalysis
$T = @load $T pkg=MLJText
docs = ["Hi my name is Sam.", "How are you today?"]
$t = $T()
julia> tokenized_docs = TextAnalysis.tokenize.(docs)
2-element Vector{Vector{String}}:
["Hi", "my", "name", "is", "Sam", "."]
["How", "are", "you", "today", "?"]
mach = machine($t, tokenized_docs)
fit!(mach)
fitted_params(mach)
tfidf_mat = transform(mach, tokenized_docs)
```
Alternatively, one can provide documents pre-parsed as ngrams counts:
```julia
using MLJ
import TextAnalysis
docs = ["Hi my name is Sam.", "How are you today?"]
corpus = TextAnalysis.Corpus(TextAnalysis.NGramDocument.(docs, 1, 2))
ngram_docs = TextAnalysis.ngrams.(corpus)
julia> ngram_docs[1]
Dict{AbstractString, Int64} with 11 entries:
"is" => 1
"my" => 1
"name" => 1
"." => 1
"Hi" => 1
"Sam" => 1
"my name" => 1
"Hi my" => 1
"name is" => 1
"Sam ." => 1
"is Sam" => 1
$t = $T()
mach = machine($t, ngram_docs)
MLJ.fit!(mach)
fitted_params(mach)
tfidf_mat = transform(mach, ngram_docs)
```
"""
end
