From 4db391ae9acd05ecac8a79f17b2e899fd66fa4e8 Mon Sep 17 00:00:00 2001
From: David Arenburg
Date: Wed, 30 Jan 2019 14:46:59 +0200
Subject: [PATCH 1/2] Update exploration.Rmd

---
 vignettes/exploration.Rmd | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/vignettes/exploration.Rmd b/vignettes/exploration.Rmd
index fd056b8..f184a62 100644
--- a/vignettes/exploration.Rmd
+++ b/vignettes/exploration.Rmd
@@ -64,9 +64,7 @@ demo_vectors %>% closest_to("bad")
 The tildes are necessary syntax where things get interesting--you can do **math** on these vectors. So if we want to find the words that are closest to the *combination* of "good" and "bad" (which is to say, words that get used in evaluation) we can write (see where the tilde is?):
 
 ```{r}
-
 demo_vectors %>% closest_to(~"good"+"bad")
-
 # The same thing could be written as:
 # demo_vectors %>% closest_to(demo_vectors[["good"]]+demo_vectors[["bad"]])
 ```
@@ -84,7 +82,6 @@ demo_vectors %>% closest_to(~"good" - "bad")
 > a vector that describes the difference between positive and negative.
 > Similarity to this vector means, technically, the portion of a words vectors whose
 > whose multidimensional path lies largely along the direction between the two words.
-
 Again, you can easily switch the order to the opposite: here are a bunch of bad words:
 
 ```{r}
@@ -127,11 +124,9 @@ demo_vectors %>% closest_to(~ "guy" + ("she" - "he"))
 
 Principal components can let you plot a subset of these vectors to see how they relate. You can imagine an arrow from "he" to "she", from "guy" to "lady", and from "man" to "woman"; all run in roughly the same direction.
 
-```{r}
-
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 demo_vectors[[c("lady","woman","man","he","she","guy","man"), average=F]] %>% plot(method="pca")
-
 ```
 
 These lists of ten words at a time are useful for interactive exploration, but sometimes we might want to say 'n=Inf' to return the full list. For instance, we can combine these two methods to look at positive and negative words used to evaluate teachers.
@@ -141,10 +136,8 @@ These lists of ten words at a time are useful for interactive exploration, but s
 First we build up three data_frames: first, a list of the 50 top evaluative words, and then complete lists of similarity to `"good" -"bad"` and `"she" - "he"`.
 
 ```{r}
 top_evaluative_words = demo_vectors %>% closest_to(~ "good"+"bad",n=75)
-
 goodness = demo_vectors %>% closest_to(~ "good"-"bad",n=Inf)
-
 femininity = demo_vectors %>% closest_to(~ "she" - "he", n=Inf)
 ```
@@ -153,16 +146,14 @@ Then we can use tidyverse packages to join and plot these.
 An `inner_join` restricts us down to just those top 50 words, and ggplot
 can array the words on axes.
 
-```{r}
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 library(ggplot2)
 library(dplyr)
-
 top_evaluative_words %>%
   inner_join(goodness) %>%
   inner_join(femininity) %>%
   ggplot() +
   geom_text(aes(x=`similarity to "she" - "he"`,
                 y=`similarity to "good" - "bad"`,
                 label=word))
 ```
-
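The vector arithmetic exercised by the chunks in this first patch is easy to sanity-check at the console. A minimal sketch, assuming the `demo_vectors` sample object that ships with wordVectors and is used throughout the vignette:

```{r}
library(wordVectors)
library(magrittr)  # for the %>% pipe used in the vignette

# The tilde formula quotes the arithmetic so closest_to() can look up each
# quoted word in demo_vectors before combining the vectors...
demo_vectors %>% closest_to(~ "good" + "bad")

# ...which is equivalent to doing the addition on the raw row vectors:
demo_vectors %>% closest_to(demo_vectors[["good"]] + demo_vectors[["bad"]])

# Offsets compose the same way: shift "guy" along the "she" - "he" direction.
demo_vectors %>% closest_to(~ "guy" + ("she" - "he"))
```

The first two calls should return the same ten-row similarity table; the formula interface is only sugar for the explicit indexing form.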
From 680089a47bfdecfaa07b545c839fde35835b2ed4 Mon Sep 17 00:00:00 2001
From: David Arenburg
Date: Wed, 30 Jan 2019 14:48:29 +0200
Subject: [PATCH 2/2] Update introduction.Rmd

---
 vignettes/introduction.Rmd | 24 +++++------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd
index 6f1185b..a464d08 100644
--- a/vignettes/introduction.Rmd
+++ b/vignettes/introduction.Rmd
@@ -24,8 +24,6 @@ if (!require(wordVectors)) {
   }
   devtools::install_github("bmschmidt/wordVectors")
 }
-
-
 ```
 
 # Building test data
@@ -66,7 +64,6 @@ To train a word2vec model, use the function `train_word2vec`. This actually buil
 
 ```{r}
 if (!file.exists("cookbook_vectors.bin")) {model = train_word2vec("cookbooks.txt","cookbook_vectors.bin",vectors=200,threads=4,window=12,iter=5,negative_samples=0)} else model = read.vectors("cookbook_vectors.bin")
-
 ```
 
 A few notes:
@@ -104,7 +101,7 @@ Now we have a pretty expansive list of potential fish-related words from old coo
 Or we can just arrange them somehow. In this case, it doesn't look like much of anything.
 
-```{r}
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 some_fish = closest_to(model,model[[c("fish","salmon","trout","shad","flounder","carp","roe","eels")]],150)
 fishy = model[[some_fish$word,average=F]]
 plot(fishy,method="pca")
 ```
@@ -133,22 +130,19 @@ These can be useful for figuring out, at a glance, what some of the overall comm
 
 Clusters need not be derived at the level of the full model. We can take, for instance, the 20 words closest to each of four different kinds of words.
 
-```{r}
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 ingredients = c("madeira","beef","saucepan","carrots")
 term_set = lapply(ingredients,
        function(ingredient) {
          nearest_words = model %>% closest_to(model[[ingredient]],20)
          nearest_words$word
        }) %>% unlist
-
 subset = model[[term_set,average=F]]
-
 subset %>%
   cosineDist(subset) %>%
   as.dist %>%
   hclust %>%
   plot
-
 ```
@@ -160,21 +154,17 @@ One of the basic strategies you can take is to try to project the high-dimension
 
 For instance, we can take the words "sweet" and "sour," find the twenty words most similar to either of them, and plot those in a sweet-salty plane.
 
-```{r}
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 tastes = model[[c("sweet","salty"),average=F]]
-
 # model[1:3000,] here restricts to the 3000 most common words in the set.
 sweet_and_saltiness = model[1:3000,] %>% cosineSimilarity(tastes)
-
 # Filter to the top 20 sweet or salty.
 sweet_and_saltiness = sweet_and_saltiness[
   rank(-sweet_and_saltiness[,1])<20 |
   rank(-sweet_and_saltiness[,2])<20,
   ]
-
 plot(sweet_and_saltiness,type='n')
 text(sweet_and_saltiness,labels=rownames(sweet_and_saltiness))
-
 ```
@@ -183,21 +173,17 @@ There's no limit to how complicated this can get. For instance, there are really
 
 Rather than use a base matrix of the whole set, we can shrink down to just five dimensions: how similar every word in our set is to each of these five. (I'm using cosine similarity here, so the closer a number is to one, the more similar it is.)
 
 ```{r}
-
 tastes = model[[c("sweet","salty","savory","bitter","sour"),average=F]]
-
 # model[1:3000,] here restricts to the 3000 most common words in the set.
 common_similarities_tastes = model[1:3000,] %>% cosineSimilarity(tastes)
-
 common_similarities_tastes[20:30,]
 ```
 
 Now we can filter down to the 50 words that are closest to *any* of these (that's what the apply-max function below does), and use a PCA biplot to look at just 50 words in a flavor plane.
 
-```{r}
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 high_similarities_to_tastes = common_similarities_tastes[rank(-apply(common_similarities_tastes,1,max)) < 75,]
-
 high_similarities_to_tastes %>%
   prcomp %>%
   biplot(main="Fifty words in a\nprojection of flavor space")
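The sweet-salty projection in the hunk above generalizes to any pair of anchor words. A minimal sketch of a reusable version, assuming a `model` trained or read in as in the earlier chunks; `plot_anchor_plane` is a hypothetical helper written for illustration, not part of the wordVectors API:

```{r}
# Hypothetical helper: plot the words ranking highest against either anchor
# in the plane of cosine similarities to the two anchor words.
plot_anchor_plane = function(model, anchor1, anchor2, n_words = 3000, top = 20) {
  anchors = model[[c(anchor1, anchor2), average = FALSE]]
  # Restrict to the n_words most common words, as the vignette does.
  sims = cosineSimilarity(model[1:n_words, ], anchors)
  # Keep words in the top `top` by similarity to either anchor.
  sims = sims[rank(-sims[, 1]) <= top | rank(-sims[, 2]) <= top, ]
  plot(sims, type = "n", xlab = anchor1, ylab = anchor2)
  text(sims, labels = rownames(sims))
  invisible(sims)
}

# The sweet-salty plane from the hunk above, in one call:
# plot_anchor_plane(model, "sweet", "salty")
```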
@@ -218,7 +204,7 @@ Just calling "plot" will display the equivalent of a word cloud with individual
 word labels.
 
 "Perplexity" is the optimal number of neighbors for each word. By default it's 50; smaller numbers may cause clusters to appear more dramatically at the cost of overall coherence.
 
-```{r}
+```{r, fig.height = 7, fig.width = 7, fig.align = "center"}
 plot(model,perplexity=50)
 ```
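Since t-SNE layouts are stochastic and sensitive to this parameter, it can be worth rendering a few perplexity values and comparing them by eye. A minimal sketch, reusing the same `plot` call as the chunk above (each call re-runs the embedding, so this can be slow on a large model):

```{r}
# Smaller perplexities emphasize tight local clusters; larger ones keep
# more of the global arrangement, as the text above notes.
plot(model, perplexity = 10)
plot(model, perplexity = 25)
plot(model, perplexity = 50)
```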