Commit b42db3f

Merge pull request #23 from JuliaAI/josephsdavid-dev

Add MLJ compliant doc-strings - take II

ablaom authored Oct 4, 2022
2 parents c37e01c + de2281e commit b42db3f
Showing 6 changed files with 309 additions and 119 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -14,7 +14,7 @@ TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"

[compat]
CorpusLoaders = "0.3"
MLJModelInterface = "1.3"
MLJModelInterface = "1.4"
ScientificTypes = "2.2.2, 3"
ScientificTypesBase = "2.2.0, 3"
TextAnalysis = "0.7.3"
2 changes: 2 additions & 0 deletions src/MLJText.jl
@@ -17,6 +17,7 @@ const PKG = "MLJText" # substitute model-providing package name
const ScientificNGram{N} = NTuple{<:Any,STB.Textual}
const NGram{N} = NTuple{<:Any,<:AbstractString}

include("docstring_helpers.jl")
include("scitypes.jl")
include("utils.jl")
include("abstract_text_transformer.jl")
@@ -26,4 +27,5 @@ include("bm25_transformer.jl")

export TfidfTransformer, BM25Transformer, CountTransformer


end # module
121 changes: 81 additions & 40 deletions src/bm25_transformer.jl
@@ -1,38 +1,3 @@
"""
BM25Transformer()
Convert a collection of raw documents to a matrix using the Okapi BM25 document-word statistic.
BM25 is an approach similar to that of TF-IDF in terms of representing documents in a vector
space. The BM25 scoring function uses both term frequency (TF) and inverse document frequency
(IDF) so that, for each term in a document, its relative concentration in the document is
scored (like TF-IDF). However, BM25 improves upon TF-IDF by incorporating probability, in particular
the probability that a user will consider a search result relevant based on the terms in the search query
and those in each document.
The parameters `max_doc_freq`, `min_doc_freq`, and `smooth_idf` all work identically to those in the
`TfidfTransformer`. BM25 introduces two additional parameters:
`κ` is the term frequency saturation characteristic. Higher values represent slower saturation. What
we mean by saturation is the degree to which a term occurring extra times adds to the overall score. This defaults
to 2.
`β` is a parameter, bound between 0 and 1, that amplifies the particular document length compared to the average length.
The bigger β is, the more document length is amplified in terms of the overall score. The default value is 0.75.
For more explanations, please see:
- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
- https://en.wikipedia.org/wiki/Okapi_BM25
- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
that the transformer will consider. `max_doc_freq` indicates that terms in only
up to the specified percentage of documents will be considered. For example, if
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
other direction. A value of 0.01 means that only terms that are at least in 1% of
documents will be included.
"""
mutable struct BM25Transformer <: AbstractTextTransformer
max_doc_freq::Float64
min_doc_freq::Float64
@@ -41,13 +6,13 @@ mutable struct BM25Transformer <: AbstractTextTransformer
smooth_idf::Bool
end

function BM25Transformer(;
max_doc_freq::Float64 = 1.0,
min_doc_freq::Float64 = 0.0,
κ::Int=2,
β::Float64=0.75,
smooth_idf::Bool = true
)
transformer = BM25Transformer(max_doc_freq, min_doc_freq, κ, β, smooth_idf)
message = MMI.clean!(transformer)
isempty(message) || @warn message
@@ -103,14 +68,14 @@ function build_bm25!(doc_term_mat::SparseMatrixCSC{T},
return bm25
end

function _transform(transformer::BM25Transformer,
result::BM25TransformerResult,
v::Corpus)
doc_terms = build_dtm(v, result.vocab)
bm25 = similar(doc_terms.dtm, eltype(result.idf_vector))
build_bm25!(doc_terms.dtm, bm25, result.idf_vector, result.mean_words_in_docs; κ=transformer.κ, β=transformer.β)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
return adjoint(bm25)
end
@@ -142,6 +107,82 @@ MMI.metadata_model(BM25Transformer,
AbstractVector{<:STB.Multiset{STB.Textual}}
},
output_scitype = AbstractMatrix{STB.Continuous},
docstring = "Build BM-25 matrix from raw documents",
path = "MLJText.BM25Transformer"
)

# # DOC STRING

"""
$(MMI.doc_header(BM25Transformer))
The transformer converts a collection of documents, tokenized or pre-parsed as bags of
words/ngrams, to a matrix of [Okapi BM25 document-word
statistics](https://en.wikipedia.org/wiki/Okapi_BM25). The BM25 scoring function uses both
term frequency (TF) and inverse document frequency (IDF, defined below), as in
[`TfidfTransformer`](@ref), but additionally adjusts for the probability that a user will
consider a search result relevant based on the terms in the search query and those in
each document.
$DOC_IDF
References:
- http://ethen8181.github.io/machine-learning/search/bm25_intro.html
- https://en.wikipedia.org/wiki/Okapi_BM25
- https://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html
# Training data
In MLJ or MLJBase, bind an instance `model` to data with
mach = machine(model, X)
$DOC_TRANSFORMER_INPUTS
Train the machine using `fit!(mach, rows=...)`.
# Hyper-parameters
- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
Terms that occur in `> max_doc_freq` documents will not be considered by the
transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
90% of the documents will be removed.
- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
  Terms that occur in `< min_doc_freq` documents will not be considered by the
  transformer. A value of 0.01 means that only terms that are in at least 1% of the
  documents will be included.
- `κ=2`: The term frequency saturation characteristic. Higher values represent slower
saturation. What we mean by saturation is the degree to which a term occurring extra
times adds to the overall score.
- `β=0.75`: Amplifies the particular document length compared to the average length. The
  bigger β is, the more document length is amplified in terms of the overall score. β is
  restricted to values between 0 and 1.
- `smooth_idf=true`: Controls which definition of IDF to use (see above).
# Operations
- `transform(mach, Xnew)`: Based on the vocabulary, IDF, and mean word counts learned in
training, return the matrix of BM25 scores for `Xnew`, a vector of the same form as `X`
above. The matrix has size `(n, p)`, where `n = length(Xnew)` and `p` the size of the
vocabulary. Tokens/ngrams not appearing in the learned vocabulary are scored zero.
# Fitted parameters
The fields of `fitted_params(mach)` are:
- `vocab`: A vector containing the strings used in the transformer's vocabulary.
- `idf_vector`: The transformer's calculated IDF vector.
- `mean_words_in_docs`: The mean number of words in each document.
$(doc_examples(:BM25Transformer))
See also [`TfidfTransformer`](@ref), [`CountTransformer`](@ref).
"""
BM25Transformer
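For reference, the `κ` and `β` mechanics described in the docstring above correspond to the
standard Okapi scoring function. Below is a minimal sketch of that textbook form, for
illustration only; it is not part of this commit, and MLJText's internal `build_bm25!` may
differ in detail (e.g. in IDF smoothing):

```julia
# Sketch of the textbook Okapi BM25 weight for one term in one document.
function bm25_score(tf, idf, doc_len, mean_doc_len; κ=2, β=0.75)
    # Larger κ ⇒ slower term-frequency saturation; β ∈ [0, 1] controls how
    # strongly longer-than-average documents are penalized.
    idf * tf * (κ + 1) / (tf + κ * (1 - β + β * doc_len / mean_doc_len))
end

bm25_score(3, 1.2, 120, 100)  # weight for a term with tf = 3 and idf = 1.2
```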
82 changes: 55 additions & 27 deletions src/count_transformer.jl
@@ -1,30 +1,9 @@
"""
CountTransformer()
Convert a collection of raw documents to a matrix representing a bag-of-words structure from
word counts. Essentially, a bag-of-words approach to representing documents in a matrix
comprises a count of every word in the document corpus/collection for every document.
This is a simple but often quite powerful way of representing documents as vectors. The
resulting representation is a matrix with rows representing every document in the corpus
and columns representing every word in the corpus. The value for each cell is the raw count
of a particular word in a particular document.
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
to words occurring in a maximum or minimum portion of documents.
The parameters `max_doc_freq` and `min_doc_freq` restrict the vocabulary
that the transformer will consider. `max_doc_freq` indicates that terms in only
up to the specified percentage of documents will be considered. For example, if
`max_doc_freq` is set to 0.9, terms that are in more than 90% of documents
will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
other direction. A value of 0.01 means that only terms that are at least in 1% of
documents will be included.
"""
mutable struct CountTransformer <: AbstractTextTransformer
max_doc_freq::Float64
min_doc_freq::Float64
end

function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
transformer = CountTransformer(max_doc_freq, min_doc_freq)
message = MMI.clean!(transformer)
isempty(message) || @warn message
@@ -37,7 +16,7 @@ end

function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
# process corpus vocab
update_lexicon!(X)

# calculate min and max doc freq limits
if transformer.max_doc_freq < 1 || transformer.min_doc_freq > 0
@@ -58,12 +37,12 @@ function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
return fitresult, cache, NamedTuple()
end

function _transform(::CountTransformer,
result::CountTransformerResult,
v::Corpus)
dtm_matrix = build_dtm(v, result.vocab)

# here we return the `adjoint` of our sparse matrix to conform to
# the `n x p` dimensions throughout MLJ
return adjoint(dtm_matrix.dtm)
end
@@ -92,6 +71,55 @@ MMI.metadata_model(CountTransformer,
AbstractVector{<:STB.Multiset{STB.Textual}}
},
output_scitype = AbstractMatrix{STB.Continuous},
docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
path = "MLJText.CountTransformer"
)

# # DOCUMENT STRING

"""
$(MMI.doc_header(CountTransformer))
The transformer converts a collection of documents, tokenized or pre-parsed as bags of
words/ngrams, to a matrix of term counts.
# Training data
In MLJ or MLJBase, bind an instance `model` to data with
mach = machine(model, X)
$DOC_TRANSFORMER_INPUTS
Train the machine using `fit!(mach, rows=...)`.
# Hyper-parameters
- `max_doc_freq=1.0`: Restricts the vocabulary that the transformer will consider.
Terms that occur in `> max_doc_freq` documents will not be considered by the
transformer. For example, if `max_doc_freq` is set to 0.9, terms that are in more than
90% of the documents will be removed.
- `min_doc_freq=0.0`: Restricts the vocabulary that the transformer will consider.
  Terms that occur in `< min_doc_freq` documents will not be considered by the
  transformer. A value of 0.01 means that only terms that are in at least 1% of the
  documents will be included.
# Operations
- `transform(mach, Xnew)`: Based on the vocabulary learned in training, return the matrix
of counts for `Xnew`, a vector of the same form as `X` above. The matrix has size `(n,
p)`, where `n = length(Xnew)` and `p` the size of the vocabulary. Tokens/ngrams not
appearing in the learned vocabulary are scored zero.
# Fitted parameters
The fields of `fitted_params(mach)` are:
- `vocab`: A vector containing the strings used in the transformer's vocabulary.
$(doc_examples(:CountTransformer))
See also
[`TfidfTransformer`](@ref), [`BM25Transformer`](@ref).
"""
CountTransformer
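The document-frequency filtering and raw count matrix described in the docstring above can
be pictured with a small self-contained sketch (toy data and hypothetical names; this is
not MLJText's internal implementation):

```julia
# Bag-of-words counting with document-frequency filtering (illustration only).
docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "end"]]
n = length(docs)

# Document frequency: the number of documents each term appears in.
df = Dict{String,Int}()
for doc in docs, term in unique(doc)
    df[term] = get(df, term, 0) + 1
end

max_doc_freq, min_doc_freq = 0.9, 0.0
vocab = sort([t for (t, c) in df if min_doc_freq <= c / n <= max_doc_freq])
# "the" appears in 100% of documents, so max_doc_freq = 0.9 drops it.

# n × p matrix of raw counts: one row per document, one column per vocabulary term.
counts = [count(==(t), doc) for doc in docs, t in vocab]
```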
98 changes: 98 additions & 0 deletions src/docstring_helpers.jl
@@ -0,0 +1,98 @@
const DOC_IDF =
"""
In textbooks and implementations there is variation in the definition of IDF. Here two
IDF definitions are available. The default, smoothed option provides the IDF for a
term `t` as `log((1 + n)/(1 + df(t))) + 1`, where `n` is the total number of documents
and `df(t)` the number of documents in which `t` appears. Setting `smooth_idf = false`
provides an IDF of `log(n/df(t)) + 1`.
"""

const DOC_TRANSFORMER_INPUTS =
"""
Here:
- `X` is any vector whose elements are either tokenized documents or bags of
words/ngrams. Specifically, each element is one of the following:
- A vector of abstract strings (tokens), e.g., `["I", "like", "Sam", ".", "Sam",
"is", "nice", "."]` (scitype `AbstractVector{Textual}`)
- A dictionary of counts, indexed on abstract strings, e.g., `Dict("I"=>1, "Sam"=>2,
"Sam is"=>1)` (scitype `Multiset{Textual}`)
- A dictionary of counts, indexed on plain ngrams, e.g., `Dict(("I",)=>1,
("Sam",)=>2, ("I", "Sam")=>1)` (scitype `Multiset{<:NTuple{N,Textual} where N}`);
here a *plain ngram* is a tuple of abstract strings.
"""

function doc_examples(T)
t = begin
T == :TfidfTransformer ? "tfidf_transformer" :
T == :BM25Transformer ? "bm25_transformer" :
T == :CountTransformer ? "count_transformer" :
error("Problem generating a document string for $T.")
end

"""
# Examples
`$T` accepts a variety of inputs. The example below transforms tokenized documents:
```julia
using MLJ
import TextAnalysis
$T = @load $T pkg=MLJText
docs = ["Hi my name is Sam.", "How are you today?"]
$t = $T()
julia> tokenized_docs = TextAnalysis.tokenize.(docs)
2-element Vector{Vector{String}}:
["Hi", "my", "name", "is", "Sam", "."]
["How", "are", "you", "today", "?"]
mach = machine($t, tokenized_docs)
fit!(mach)
fitted_params(mach)
tfidf_mat = transform(mach, tokenized_docs)
```
Alternatively, one can provide documents pre-parsed as ngrams counts:
```julia
using MLJ
import TextAnalysis
docs = ["Hi my name is Sam.", "How are you today?"]
corpus = TextAnalysis.Corpus(TextAnalysis.NGramDocument.(docs, 1, 2))
ngram_docs = TextAnalysis.ngrams.(corpus)
julia> ngram_docs[1]
Dict{AbstractString, Int64} with 11 entries:
"is" => 1
"my" => 1
"name" => 1
"." => 1
"Hi" => 1
"Sam" => 1
"my name" => 1
"Hi my" => 1
"name is" => 1
"Sam ." => 1
"is Sam" => 1
$t = $T()
mach = machine($t, ngram_docs)
MLJ.fit!(mach)
fitted_params(mach)
tfidf_mat = transform(mach, ngram_docs)
```
"""
end
