Skip to content

Commit

Permalink
Merge pull request #21 from JuliaAI/dev
Browse files Browse the repository at this point in the history
For a 0.2.0 release
  • Loading branch information
pazzo83 authored Feb 8, 2022
2 parents 2af3837 + c37e01c commit 45a2eea
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 43 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "MLJText"
uuid = "5e27fcf9-6bac-46ba-8580-b5712f3d6387"
authors = ["Chris Alexander <uvapazzo@gmail.com>", "Anthony D. Blaom <anthony.blaom@gmail.com>"]
version = "0.1.3"
version = "0.2.0"

[deps]
CorpusLoaders = "214a0ac2-f95b-54f7-a80b-442ed9c2c9e8"
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,19 +89,19 @@ BM25Transformer(
```
Please see [http://ethen8181.github.io/machine-learning/search/bm25_intro.html](http://ethen8181.github.io/machine-learning/search/bm25_intro.html) for more details about how these parameters affect the matrix that is generated.

## Bag-of-Words Transformer
## Count Transformer
The `MLJText` package also offers a way to represent documents using the simpler bag-of-words representation. This returns a document-term matrix (as you would get in `TextAnalysis`) that consists of the count for every word in the corpus for each document in the corpus.

### Usage
```julia
using MLJ, MLJText, TextAnalysis

docs = ["Hi my name is Sam.", "How are you today?"]
bagofwords_transformer = BagOfWordsTransformer()
mach = machine(bagofwords_transformer, tokenize.(docs))
count_transformer = CountTransformer()
mach = machine(count_transformer, tokenize.(docs))
MLJ.fit!(mach)

bagofwords_mat = transform(mach, tokenize.(docs))
count_mat = transform(mach, tokenize.(docs))
```

The resulting matrix looks like:
Expand Down
4 changes: 2 additions & 2 deletions src/MLJText.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ include("scitypes.jl")
include("utils.jl")
include("abstract_text_transformer.jl")
include("tfidf_transformer.jl")
include("bagofwords_transformer.jl")
include("count_transformer.jl")
include("bm25_transformer.jl")

export TfidfTransformer, BM25Transformer, BagOfWordsTransformer
export TfidfTransformer, BM25Transformer, CountTransformer

end # module
42 changes: 21 additions & 21 deletions src/bagofwords_transformer.jl → src/count_transformer.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""
BagOfWordsTransformer()
CountTransformer()
Convert a collection of raw documents to a matrix representing a bag-of-words structure.
Essentially, a bag-of-words approach to representing documents in a matrix is comprised of
a count of every word in the document corpus/collection for every document. This is a simple
but often quite powerful way of representing documents as vectors. The resulting representation is
a matrix with rows representing every document in the corpus and columns representing every word
in the corpus. The value for each cell is the raw count of a particular word in a particular
document.
Convert a collection of raw documents to a matrix representing a bag-of-words structure from
word counts. Essentially, a bag-of-words approach to representing documents in a matrix is
comprised of a count of every word in the document corpus/collection for every document.
This is a simple but often quite powerful way of representing documents as vectors. The
resulting representation is a matrix with rows representing every document in the corpus
and columns representing every word in the corpus. The value for each cell is the raw count
of a particular word in a particular document.
Similarly to the `TfidfTransformer`, the vocabulary considered can be restricted
to words occurring in a maximum or minimum portion of documents.
Expand All @@ -19,23 +19,23 @@ will be removed. Similarly, the `min_doc_freq` parameter restricts terms in the
other direction. A value of 0.01 means that only terms appearing in at least 1% of
documents will be included.
"""
mutable struct BagOfWordsTransformer <: AbstractTextTransformer
mutable struct CountTransformer <: AbstractTextTransformer
max_doc_freq::Float64
min_doc_freq::Float64
end

function BagOfWordsTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
transformer = BagOfWordsTransformer(max_doc_freq, min_doc_freq)
function CountTransformer(; max_doc_freq::Float64 = 1.0, min_doc_freq::Float64 = 0.0)
transformer = CountTransformer(max_doc_freq, min_doc_freq)
message = MMI.clean!(transformer)
isempty(message) || @warn message
return transformer
end

struct BagOfWordsTransformerResult
struct CountTransformerResult
vocab::Vector{String}
end

function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
function _fit(transformer::CountTransformer, verbosity::Int, X::Corpus)
# process corpus vocab
update_lexicon!(X)

Expand All @@ -52,14 +52,14 @@ function _fit(transformer::BagOfWordsTransformer, verbosity::Int, X::Corpus)
end

# prepare result
fitresult = BagOfWordsTransformerResult(vocab)
fitresult = CountTransformerResult(vocab)
cache = nothing

return fitresult, cache, NamedTuple()
end

function _transform(::BagOfWordsTransformer,
result::BagOfWordsTransformerResult,
function _transform(::CountTransformer,
result::CountTransformerResult,
v::Corpus)
dtm_matrix = build_dtm(v, result.vocab)

Expand All @@ -69,14 +69,14 @@ function _transform(::BagOfWordsTransformer,
end

# for returning user-friendly form of the learned parameters:
function MMI.fitted_params(::BagOfWordsTransformer, fitresult::BagOfWordsTransformerResult)
function MMI.fitted_params(::CountTransformer, fitresult::CountTransformerResult)
vocab = fitresult.vocab
return (vocab = vocab,)
end

## META DATA

MMI.metadata_pkg(BagOfWordsTransformer,
MMI.metadata_pkg(CountTransformer,
name="$PKG",
uuid="7876af07-990d-54b4-ab0e-23690620f79a",
url="https://github.com/JuliaAI/MLJText.jl",
Expand All @@ -85,13 +85,13 @@ MMI.metadata_pkg(BagOfWordsTransformer,
is_wrapper=false
)

MMI.metadata_model(BagOfWordsTransformer,
MMI.metadata_model(CountTransformer,
input_scitype = Union{
AbstractVector{<:AbstractVector{STB.Textual}},
AbstractVector{<:STB.Multiset{<:ScientificNGram}},
AbstractVector{<:STB.Multiset{STB.Textual}}
},
output_scitype = AbstractMatrix{STB.Continuous},
docstring = "Build Bag-of-Words matrix for corpus of documents",
path = "MLJText.BagOfWordsTransformer"
docstring = "Build Bag-of-Words matrix from word counts for corpus of documents",
path = "MLJText.CountTransformer"
)
30 changes: 15 additions & 15 deletions test/abstract_text_transformer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ using TextAnalysis
test_tfidf_machine = @test_logs machine(tfidf_transformer, ngram_vec)
MLJBase.fit!(test_tfidf_machine)

# train bag_of_words transformer
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
test_bow_machine = @test_logs machine(bagofwords_vectorizer, ngram_vec)
MLJBase.fit!(test_bow_machine)
# train count transformer
count_transformer = MLJText.CountTransformer()
test_count_machine = @test_logs machine(count_transformer, ngram_vec)
MLJBase.fit!(test_count_machine)

# train bm25 transformer
bm25_transformer = MLJText.BM25Transformer()
test_bm25_machine = @test_logs machine(bm25_transformer, ngram_vec)
MLJBase.fit!(test_bm25_machine)

test_machines = [test_tfidf_machine, test_bow_machine, test_bm25_machine]
test_machines = [test_tfidf_machine, test_count_machine, test_bm25_machine]

# test single doc
test_doc1 = ngrams(NGramDocument("Another sentence ok"))
Expand Down Expand Up @@ -91,18 +91,18 @@ end
test_tfidf_machine2 = @test_logs machine(tfidf_transformer, [bag])
MLJBase.fit!(test_tfidf_machine2)

# train bag_of_words transformer
bagofwords_vectorizer = MLJText.BagOfWordsTransformer()
test_bow_machine2 = @test_logs machine(bagofwords_vectorizer, [bag])
MLJBase.fit!(test_bow_machine2)
# train count transformer
count_transformer = MLJText.CountTransformer()
test_count_machine2 = @test_logs machine(count_transformer, [bag])
MLJBase.fit!(test_count_machine2)

# train bm25 transformer
bm25_transformer = MLJText.BM25Transformer()
test_bm25_machine2 = @test_logs machine(bm25_transformer, [bag])
MLJBase.fit!(test_bm25_machine2)

test_doc5 = ["How about a cat in a hat"]
for mach = [test_tfidf_machine2, test_bow_machine2, test_bm25_machine2]
for mach = [test_tfidf_machine2, test_count_machine2, test_bm25_machine2]
test_doc_transform = transform(mach, test_doc5)
@test sum(test_doc_transform, dims=2)[1] > 0.0
@test size(test_doc_transform) == (1, 8)
Expand All @@ -126,10 +126,10 @@ end
test_tfidf_machine3 = @test_logs machine(tfidf_transformer, ngram_vec)
MLJBase.fit!(test_tfidf_machine3)

# train bag_of_words transformer
bagofwords_vectorizer = MLJText.BagOfWordsTransformer(max_doc_freq=0.8)
test_bow_machine3 = @test_logs machine(bagofwords_vectorizer, ngram_vec)
MLJBase.fit!(test_bow_machine3)
# train count transformer
count_transformer = MLJText.CountTransformer(max_doc_freq=0.8)
test_count_machine3 = @test_logs machine(count_transformer, ngram_vec)
MLJBase.fit!(test_count_machine3)

# train bm25 transformer
bm25_transformer = MLJText.BM25Transformer(max_doc_freq=0.8, min_doc_freq=0.2)
Expand All @@ -140,7 +140,7 @@ end
test_doc_transform = transform(test_tfidf_machine3, ngram_vec)
@test (Vector(vec(sum(test_doc_transform, dims=2))) .> 0.2) == Bool[1, 1, 1, 1, 1, 1]

test_doc_transform = transform(test_bow_machine3, ngram_vec)
test_doc_transform = transform(test_count_machine3, ngram_vec)
@test Vector(vec(sum(test_doc_transform, dims=2))) == [14, 10, 14, 9, 13, 7]

test_doc_transform = transform(test_bm25_machine3, ngram_vec)
Expand Down

0 comments on commit 45a2eea

Please sign in to comment.