Commit
Merge pull request #1 from TidierOrg/package-cleanup
Cleaned up README and docstrings, removed antijoin, bumped version to…
Karandeep Singh authored Nov 22, 2023
2 parents e3d47a0 + f593086 commit f3aae23
Showing 6 changed files with 173 additions and 70 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
/Manifest.toml
.vscode
8 changes: 6 additions & 2 deletions Project.toml
@@ -1,17 +1,21 @@
name = "TidierText"
uuid = "8f0b679f-44a1-4a38-8011-253e3a78fd39"
authors = ["Daniel Rizk"]
version = "0.0.1-DEV"
version = "0.1.0"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
DataFrames = "1.5"
Languages = "0.4"
MacroTools = "0.5"
Reexport = "0.2, 1"
StatsBase = "0.34, 1"
julia = "1.6"

[extras]
113 changes: 106 additions & 7 deletions README.md
@@ -1,14 +1,21 @@
# TidierText
# TidierText.jl

[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/TidierOrg/TidierData.jl/blob/main/LICENSE)
[![Build Status](https://github.com/TidierOrg/TidierText.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/TidierOrg/TidierText.jl/actions/workflows/CI.yml?query=branch%3Amain)

<img src="https://raw.githubusercontent.com/TidierOrg/Tidier.jl/main/docs/src/assets/Tidier_jl_logo.png" align="right" style="padding-left:10px;" width="150"/>

## What is TidierText.jl
`TidierText.jl` is a 100% Julia implementation of the R tidytext package.

To better understand how to leverage `TidierText.jl`, please explore Text Mining with R by Julia Silge et al.
TidierText.jl is a 100% Julia implementation of the R tidytext package. The purpose of the package is to make it easy to analyze text data using DataFrames.

An extensive guide to tidy text analysis by Julia Silge and David Robinson is available here: [https://www.tidytextmining.com/](https://www.tidytextmining.com).

## Installation

For the development version:

```
```julia
using Pkg
Pkg.add(url="https://github.com/TidierOrg/TidierText.jl")
```
@@ -17,12 +24,104 @@ To better understand how to leverage `TidierText.jl`, please explore Text Mining

## What functions does TidierText.jl support?

- `get_stopwords`
- `@antijoin`
- `@bind_tf_idf`
- `@unnest_tokens`
- `@unnest_regex()`
- `@unnest_characters()`
- `@unnest_ngrams()`
- `get_stopwords()`
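
The macros not covered in the walkthrough below follow the same pattern as `@unnest_tokens`. Here is a minimal sketch based on the docstring signatures included in this commit; the two-row `df` is made up for illustration:

```julia
using TidierText
using DataFrames

df = DataFrame(doc = [1, 2],
               text = ["The quick brown fox jumps.", "One column and the one row?"])

@unnest_ngrams(df, term, text, 2)       # one row per consecutive word pair (bigram)
@unnest_regex(df, chunk, text, "the")   # split `text` on a regex pattern
@unnest_characters(df, char, text)      # one row per character
get_stopwords()                         # DataFrame with a single `word` column
```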

## How does the package work?

### Let's load the package and read in the UCLA Fall 2018 course dataset.

```julia
using TidierData
using TidierText

using CSV

courses = CSV.read(download("https://vincentarelbundock.github.io/Rdatasets/csv/openintro/ucla_f18.csv"), DataFrame)
```

### What are the course names?

```julia
@chain courses begin
@select(id = rownames, course)
@slice(1:10)
end
```

This package is a nascent version that will continue to grow.
```
10×2 DataFrame
Row │ id course
│ Int64 String
─────┼──────────────────────────────────────────
1 │ 1 Leadership Laboratory
2 │ 2 Heritage and Values
3 │ 3 Team and Leadership Fundamentals
4 │ 4 Air Force Leadership Studies
5 │ 5 National Security Affairs/Prepar…
6 │ 6 Introduction to Black Studies
7 │ 7 African American Musical Heritage
8 │ 8 UCLA Centennial Initiative: Arth…
9 │ 9 UCLA Centennial Initiative: Soci…
10 │ 10 Student Research Program
```

### Let's tokenize the course names and convert them to lowercase.

```julia
tokens = @chain courses begin
@select(id = rownames, course)
@slice(1:10)
@unnest_tokens(word, course, to_lower = true)
end;

@chain tokens @slice(1:10)
```

```
10×2 DataFrame
Row │ id word
│ Int64 SubStrin…
─────┼─────────────────────
1 │ 1 leadership
2 │ 1 laboratory
3 │ 2 heritage
4 │ 2 and
5 │ 2 values
6 │ 3 team
7 │ 3 and
8 │ 3 leadership
9 │ 3 fundamentals
10 │ 4 air
```
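
### Removing stopwords (optional sketch)

Common words like "and" carry little information. They can be dropped by anti-joining the tokens against `get_stopwords()`. This is a sketch rather than part of the original walkthrough, and it assumes the `@anti_join` macro from TidierData.jl (TidierText's own `@antijoin` is removed in this commit):

```julia
# Sketch: drop English stopwords by anti-joining on the shared `word` column.
# Assumes TidierData.jl's @anti_join; get_stopwords() returns a DataFrame
# with a single `word` column, so `word` is the inferred join key.
stop_words = get_stopwords()

tokens_clean = @chain tokens begin
    @anti_join(stop_words)   # keep rows whose `word` is not a stopword
end
```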

### Let's add the term frequency, inverse document frequency, and the tf-idf.

```julia
@chain tokens begin
@count(id, word)
@bind_tf_idf(word, id, n)
@slice(1:10)
end
```

```
10×6 DataFrame
Row │ id word n tf idf tf_idf
│ Int64 SubStrin… Int64 Float64 Float64 Float64
─────┼──────────────────────────────────────────────────────────
1 │ 1 leadership 1 0.5 1.20397 0.601986
2 │ 1 laboratory 1 0.5 2.30259 1.15129
3 │ 2 heritage 1 0.333333 1.60944 0.536479
4 │ 2 and 1 0.333333 0.916291 0.30543
5 │ 2 values 1 0.333333 2.30259 0.767528
6 │ 3 team 1 0.25 2.30259 0.575646
7 │ 3 and 1 0.25 0.916291 0.229073
8 │ 3 leadership 1 0.25 1.20397 0.300993
9 │ 3 fundamentals 1 0.25 2.30259 0.575646
10 │ 4 air 1 0.25 2.30259 0.575646
```
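
### Which words have the highest tf-idf? (optional sketch)

High tf-idf values mark words that are frequent in one course name but rare across the rest. As a closing sketch (not part of this commit's README), the highest-scoring word/course pairs can be listed with TidierData.jl's `@arrange` and `desc`:

```julia
# Sketch: rank word/course pairs by tf_idf, assuming TidierData.jl's
# @arrange with desc() for descending sorts.
@chain tokens begin
    @count(id, word)
    @bind_tf_idf(word, id, n)
    @arrange(desc(tf_idf))
    @slice(1:10)   # ten highest-scoring word/course pairs
end
```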
13 changes: 2 additions & 11 deletions src/TidierText.jl
@@ -8,8 +8,9 @@ using Reexport

include("docstrings.jl")

@reexport using DataFrames: DataFrame

export get_stopwords, @bind_tf_idf, @unnest_characters, @unnest_ngrams, @unnest_regex, @unnest_tokens, @antijoin
export get_stopwords, @bind_tf_idf, @unnest_characters, @unnest_ngrams, @unnest_regex, @unnest_tokens

"""
$docstring_get_stopwords
@@ -135,16 +136,6 @@ end


### Macros
"""
$docstring_antijoin
"""
macro antijoin(df1, df2)
by = :(intersect(names($(esc(df1))), names($(esc(df2)))))

return quote
antijoin(DataFrame($(esc(df1))), DataFrame($(esc(df2))); on = $(by))
end
end

"""
$docstring_bind_tf_idf
99 changes: 49 additions & 50 deletions src/docstrings.jl
@@ -4,45 +4,26 @@ const docstring_get_stopwords =
Returns a DataFrame containing English stopwords.
The stopwords come from the `Languages.jl` package: https://github.com/JuliaText/Languages.jl/blob/master/data/stopwords/English.txt.
# Returns
- `DataFrame` with a single column `word`, each row containing a stopword.
# Examples
```jldoctest
julia> get_stopwords();
julia> first(get_stopwords(), 5)
5×1 DataFrame
Row │ word
│ String
─────┼────────
1 │ a
2 │ about
3 │ above
4 │ across
5 │ after
```
"""

const docstring_antijoin =
"""
@antijoin(df1, df2)
Performs an anti-join operation on `df1` and `df2`, returning rows from `df1` that do not have matching rows in `df2`.
# Arguments
- `df1`: The left DataFrame.
- `df2`: The right DataFrame.
# Returns
- A new DataFrame containing the result of the anti-join operation.
# Examples
```jldoctest
julia> using DataFrames;
df1 = DataFrame(ID = [1, 2, 3, 4, 5], Name = ["A", "B", "C", "D", "E"]);
df2 = DataFrame(ID = [3, 4, 5, 6, 7], Test = ["C", "D", "E", "F", "G"]);
julia> @antijoin(df1, df2)
2×2 DataFrame
Row │ ID Name
│ Int64 String
─────┼───────────────
1 │ 1 A
2 │ 2 B
```
"""
const docstring_bind_tf_idf =
"""
@bind_tf_idf(df, term_col, document_col, n)
@@ -61,7 +42,11 @@ Calculates TF-IDF values for the specified columns of `df`.
# Examples
```jldoctest
julia> using DataFrames;
df = DataFrame(doc_id = [1, 1, 2, 2, 3, 3], term = ["apple", "banana", "apple", "cherry", "banana", "date"], n = [1, 4, 6, 4, 9, 8]);
df = DataFrame(
doc_id = [1, 1, 2, 2, 3, 3],
term = ["apple", "banana", "apple", "cherry", "banana", "date"],
n = [1, 4, 6, 4, 9, 8]
);
julia> @bind_tf_idf(df, doc_id, term, n)
6×6 DataFrame
@@ -79,7 +64,7 @@ julia> @bind_tf_idf(df, doc_id, term, n)

const docstring_unnest_tokens =
"""
@unnest_tokens(df, output_col, input_col, to_lower=false)
@unnest_tokens(df, output_col, input_col, to_lower = false)
Tokenizes the text in `input_col` of `df` into separate words, outputting the result to `output_col`.
@@ -94,10 +79,14 @@ Tokenizes the text in `input_col` of `df` into separate words, outputting the re
# Examples
```jldoctest
julia> using DataFrames;
df = DataFrame(text = ["The quick brown fox jumps.", "One column and the one row?"], doc = [1, 2]);
julia> using DataFrames;
df = DataFrame(
text = ["The quick brown fox jumps.",
"One column and the one row?"],
doc = [1, 2]
);
julia> @unnest_tokens(df, word, text)
julia> @unnest_tokens(df, word, text)
11×2 DataFrame
Row │ doc word
│ Int64 SubStrin…
@@ -114,7 +103,7 @@ julia> @unnest_tokens(df, word, text)
10 │ 2 one
11 │ 2 row?
julia> @unnest_tokens(df, word, text, to_lower = true)
julia> @unnest_tokens(df, word, text, to_lower = true)
11×2 DataFrame
Row │ doc word
│ Int64 SubStrin…
@@ -135,7 +124,7 @@ julia> @unnest_tokens(df, word, text, to_lower = true)

const docstring_unnest_regex =
"""
@unnest_regex(df, output_col, input_col, pattern="\\s+", to_lower=false)
@unnest_regex(df, output_col, input_col, pattern = "\\s+", to_lower = false)
Splits the text in `input_col` of `df` based on a regex `pattern`, outputting the result to `output_col`.
@@ -151,10 +140,14 @@ Splits the text in `input_col` of `df` based on a regex `pattern`, outputting th
# Examples
```jldoctest
julia> using DataFrames;
df = DataFrame(text = ["The quick brown fox jumps.", "One column and the one row?"], doc = [1, 2]);
julia> using DataFrames;
df = DataFrame(
text = ["The quick brown fox jumps.",
"One column and the one row?"],
doc = [1, 2]
);
julia> @unnest_regex(df, word, text, "the")
julia> @unnest_regex(df, word, text, "the")
3×2 DataFrame
Row │ doc word
│ Int64 SubStrin…
@@ -163,7 +156,7 @@ julia> @unnest_regex(df, word, text, "the")
2 │ 2 One column and
3 │ 2 one row?
julia> @unnest_regex(df, word, text, "the", to_lower = true)
julia> @unnest_regex(df, word, text, "the", to_lower = true)
3×2 DataFrame
Row │ doc word
│ Int64 SubStrin…
@@ -176,7 +169,7 @@ julia> @unnest_regex(df, word, text, "the", to_lower = true)

const docstring_unnest_ngrams =
"""
@unnest_ngrams(df, output_col, input_col, n, to_lower=false)
@unnest_ngrams(df, output_col, input_col, n, to_lower = false)
Creates n-grams from the text in `input_col` of `df`, outputting the result to `output_col`.
@@ -192,10 +185,14 @@ Creates n-grams from the text in `input_col` of `df`, outputting the result to `
# Examples
```jldoctest
julia> using DataFrames;
df = DataFrame(text = ["The quick brown fox jumps.", "The sun rises in the east."], doc = [1, 2]);
julia> using DataFrames;
df = DataFrame(
text = ["The quick brown fox jumps.",
"The sun rises in the east."],
doc = [1, 2]
);
julia> @unnest_ngrams(df, term, text, 2, to_lower = false)
julia> @unnest_ngrams(df, term, text, 2, to_lower = false)
9×2 DataFrame
Row │ doc term
│ Int64 String
@@ -229,7 +226,7 @@ julia> @unnest_ngrams(df, term, text, 2)

const docstring_unnest_characters =
"""
@unnest_characters(df, output_col, input_col, to_lower=false, strip_non_alphanum = false)
@unnest_characters(df, output_col, input_col, to_lower = false, strip_non_alphanum = false)
Splits the text in `input_col` of `df` into separate characters, outputting the result to `output_col`.
@@ -245,10 +242,12 @@ Splits the text in `input_col` of `df` into separate characters, outputting the
# Examples
```jldoctest
julia> using DataFrames;
df = DataFrame( text = [ "The quick.", "Nice."], doc = [1, 2]);
julia> using DataFrames;
df = DataFrame(
text = ["The quick.", "Nice."],
doc = [1, 2]);
julia> @unnest_characters(df, term, text, to_lower = false)
julia> @unnest_characters(df, term, text, to_lower = false)
15×2 DataFrame
Row │ doc term
│ Int64 Char
9 changes: 9 additions & 0 deletions test/Project.toml
@@ -0,0 +1,9 @@
[deps]
TidierText = "8f0b679f-44a1-4a38-8011-253e3a78fd39"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"

2 comments on commit f3aae23

@kdpsingh
Member

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/95823

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

```
@JuliaRegistrator register

Release notes:

## Breaking changes

- blah
```

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

```
git tag -a v0.1.0 -m "<description of version>" f3aae23fe08e12da09a4ba4d1114a118970b543a
git push origin v0.1.0
```
