rename vectors_norm everywhere, update tests, regen docs
piskvorky committed Jul 30, 2020
1 parent 172e37f commit 3919b68
Showing 18 changed files with 168 additions and 145 deletions.
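The change is mechanical but touches many files: every access to the old `wv.vectors_norm` attribute (previously populated by `init_sims()`) becomes a call to the new `wv.get_normed_vectors()` accessor. Below is a minimal, self-contained sketch of the before/after pattern, assuming a gensim version that already ships `get_normed_vectors()`; the toy corpus and training parameters are illustrative only and are not part of this commit.

    from gensim.models import Word2Vec

    # Tiny illustrative corpus -- just enough to produce a trained model.
    sentences = [
        ["human", "interface", "computer"],
        ["survey", "user", "computer", "system", "response", "time"],
    ]
    model = Word2Vec(sentences, vector_size=10, min_count=1, seed=42)

    # Old pattern, as on the removed lines of this diff:
    #     model.init_sims()
    #     vector = model.wv.vectors_norm[0]

    # New pattern, as on the added lines:
    normed_vectors = model.wv.get_normed_vectors()  # rows are unit-length word vectors
    vector = normed_vectors[0]
    print(vector.shape)  # e.g. (10,)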
4 changes: 2 additions & 2 deletions docs/src/auto_examples/tutorials/run_annoy.ipynb
@@ -112,7 +112,7 @@
},
"outputs": [],
"source": [
"# Set up the model and vector that we are using in the comparison\nmodel.init_sims()\nannoy_index = AnnoyIndexer(model, 100)\n\n# Dry run to make sure both indexes are fully in RAM\nvector = model.wv.vectors_norm[0]\nmodel.wv.most_similar([vector], topn=5, indexer=annoy_index)\nmodel.wv.most_similar([vector], topn=5)\n\nimport time\nimport numpy as np\n\ndef avg_query_time(annoy_index=None, queries=1000):\n \"\"\"Average query time of a most_similar method over 1000 random queries.\"\"\"\n total_time = 0\n for _ in range(queries):\n rand_vec = model.wv.vectors_norm[np.random.randint(0, len(model.wv))]\n start_time = time.process_time()\n model.wv.most_similar([rand_vec], topn=5, indexer=annoy_index)\n total_time += time.process_time() - start_time\n return total_time / queries\n\nqueries = 1000\n\ngensim_time = avg_query_time(queries=queries)\nannoy_time = avg_query_time(annoy_index, queries=queries)\nprint(\"Gensim (s/query):\\t{0:.5f}\".format(gensim_time))\nprint(\"Annoy (s/query):\\t{0:.5f}\".format(annoy_time))\nspeed_improvement = gensim_time / annoy_time\nprint (\"\\nAnnoy is {0:.2f} times faster on average on this particular run\".format(speed_improvement))"
"# Set up the model and vector that we are using in the comparison\nmodel.init_sims()\nannoy_index = AnnoyIndexer(model, 100)\n\n# Dry run to make sure both indexes are fully in RAM\nnormed_vectors = model.wv.get_normed_vectors()\nvector = normed_vectors[0]\nmodel.wv.most_similar([vector], topn=5, indexer=annoy_index)\nmodel.wv.most_similar([vector], topn=5)\n\nimport time\nimport numpy as np\n\ndef avg_query_time(annoy_index=None, queries=1000):\n \"\"\"Average query time of a most_similar method over 1000 random queries.\"\"\"\n total_time = 0\n for _ in range(queries):\n rand_vec = normed_vectors[np.random.randint(0, len(model.wv))]\n start_time = time.process_time()\n model.wv.most_similar([rand_vec], topn=5, indexer=annoy_index)\n total_time += time.process_time() - start_time\n return total_time / queries\n\nqueries = 1000\n\ngensim_time = avg_query_time(queries=queries)\nannoy_time = avg_query_time(annoy_index, queries=queries)\nprint(\"Gensim (s/query):\\t{0:.5f}\".format(gensim_time))\nprint(\"Annoy (s/query):\\t{0:.5f}\".format(annoy_time))\nspeed_improvement = gensim_time / annoy_time\nprint (\"\\nAnnoy is {0:.2f} times faster on average on this particular run\".format(speed_improvement))"
]
},
{
@@ -234,7 +234,7 @@
},
"outputs": [],
"source": [
"exact_results = [element[0] for element in model.wv.most_similar([model.wv.vectors_norm[0]], topn=100)]\n\nx_values = []\ny_values_init = []\ny_values_accuracy = []\n\nfor x in range(1, 300, 10):\n x_values.append(x)\n start_time = time.time()\n annoy_index = AnnoyIndexer(model, x)\n y_values_init.append(time.time() - start_time)\n approximate_results = model.wv.most_similar([model.wv.vectors_norm[0]], topn=100, indexer=annoy_index)\n top_words = [result[0] for result in approximate_results]\n y_values_accuracy.append(len(set(top_words).intersection(exact_results)))"
"exact_results = [element[0] for element in model.wv.most_similar([normed_vectors[0]], topn=100)]\n\nx_values = []\ny_values_init = []\ny_values_accuracy = []\n\nfor x in range(1, 300, 10):\n x_values.append(x)\n start_time = time.time()\n annoy_index = AnnoyIndexer(model, x)\n y_values_init.append(time.time() - start_time)\n approximate_results = model.wv.most_similar([normed_vectors[0]], topn=100, indexer=annoy_index)\n top_words = [result[0] for result in approximate_results]\n y_values_accuracy.append(len(set(top_words).intersection(exact_results)))"
]
},
{
9 changes: 5 additions & 4 deletions docs/src/auto_examples/tutorials/run_annoy.py
@@ -120,7 +120,8 @@
annoy_index = AnnoyIndexer(model, 100)

# Dry run to make sure both indexes are fully in RAM
-vector = model.wv.vectors_norm[0]
+normed_vectors = model.wv.get_normed_vectors()
+vector = normed_vectors[0]
model.wv.most_similar([vector], topn=5, indexer=annoy_index)
model.wv.most_similar([vector], topn=5)

@@ -131,7 +132,7 @@ def avg_query_time(annoy_index=None, queries=1000):
"""Average query time of a most_similar method over 1000 random queries."""
total_time = 0
for _ in range(queries):
rand_vec = model.wv.vectors_norm[np.random.randint(0, len(model.wv))]
rand_vec = normed_vectors[np.random.randint(0, len(model.wv))]
start_time = time.process_time()
model.wv.most_similar([rand_vec], topn=5, indexer=annoy_index)
total_time += time.process_time() - start_time
@@ -286,7 +287,7 @@ def f(process_id):
# Build dataset of Initialization times and accuracy measures:
#

-exact_results = [element[0] for element in model.wv.most_similar([model.wv.vectors_norm[0]], topn=100)]
+exact_results = [element[0] for element in model.wv.most_similar([normed_vectors[0]], topn=100)]

x_values = []
y_values_init = []
@@ -297,7 +298,7 @@
    start_time = time.time()
    annoy_index = AnnoyIndexer(model, x)
    y_values_init.append(time.time() - start_time)
-    approximate_results = model.wv.most_similar([model.wv.vectors_norm[0]], topn=100, indexer=annoy_index)
+    approximate_results = model.wv.most_similar([normed_vectors[0]], topn=100, indexer=annoy_index)
    top_words = [result[0] for result in approximate_results]
    y_values_accuracy.append(len(set(top_words).intersection(exact_results)))

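For intuition about what the tutorial ends up indexing: the "normed" vectors are simply the word vectors rescaled to unit L2 length, so dot products equal cosine similarities. A NumPy sketch of that idea follows; it is an illustration, not gensim's internal code, and the random matrix stands in for `model.wv.vectors`.

    import numpy as np

    vectors = np.random.rand(5, 10).astype(np.float32)      # stand-in for model.wv.vectors
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)   # per-row L2 norms
    normed = vectors / norms                                  # unit-length rows

    # With unit-length rows, the cosine similarity of word 0 against every word
    # is a single matrix-vector product -- essentially what most_similar computes.
    cosine_sims = normed @ normed[0]
    print(cosine_sims.round(3))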
2 changes: 1 addition & 1 deletion docs/src/auto_examples/tutorials/run_annoy.py.md5
@@ -1 +1 @@
-a18f2e2cf524dea755eb70bb385bf7fe
+c6cd2a0225bbe49d97dc66c96d2b7f1c
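The `.md5` file above is a checksum stored next to the regenerated tutorial, presumably so the docs build can tell whether the source script changed and needs re-running (sphinx-gallery keeps such checksums for its examples). A hedged sketch of that staleness check; the source path below is hypothetical and does not appear in this commit.

    import hashlib
    from pathlib import Path

    def file_md5(path):
        """Return the hex MD5 digest of a file's bytes."""
        return hashlib.md5(Path(path).read_bytes()).hexdigest()

    # Hypothetical paths for illustration; only the .md5 file is part of this diff.
    source = Path("docs/src/gallery/tutorials/run_annoy.py")
    stored = Path("docs/src/auto_examples/tutorials/run_annoy.py.md5")

    needs_rebuild = file_md5(source) != stored.read_text().strip()
    print("regenerate docs" if needs_rebuild else "up to date")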
