Skip to content

Commit

Permalink
word embedding generating code added
Browse files Browse the repository at this point in the history
  • Loading branch information
sleepingcat4 authored Apr 20, 2024
1 parent f22cb3c commit 97af4f8
Show file tree
Hide file tree
Showing 2 changed files with 176,038 additions and 0 deletions.
29 changes: 29 additions & 0 deletions word_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import concurrent.futures

# Shared encoder instance, built once when the module is loaded and used by
# generate_embedding for every word.
model = SentenceTransformer("all-MiniLM-L6-v2")

def generate_embedding(word):
    """Encode ``word`` with the module-level ``model``.

    The encoder is asked for a tensor (``convert_to_tensor=True``) and the
    tensor's ``tolist()`` result is what gets returned, so the caller
    receives the converted value rather than the tensor object itself.

    Returns:
        A ``(word, vector)`` tuple: the input unchanged, paired with the
        encoded tensor converted via ``tolist()``.
    """
    vector = model.encode(word, convert_to_tensor=True).tolist()
    return (word, vector)

# Load the vocabulary: one entry per line of the input file.
# UTF-8 is requested explicitly so decoding does not depend on the
# platform's default locale encoding.
word_file = "word_processed.txt"
with open(word_file, "r", encoding="utf-8") as f:
    words = f.read().splitlines()

# Submit one task per word to a thread pool; each task returns
# (word, embedding) and results are collected as they complete.
embeddings_dict = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map every future back to its word so a failure can be reported by name.
    future_to_word = {executor.submit(generate_embedding, word): word for word in words}
    for future in tqdm(concurrent.futures.as_completed(future_to_word), total=len(words), desc="Generating embeddings"):
        word = future_to_word[future]
        try:
            word, embedding = future.result()
        except Exception as exc:
            # Best effort: report the failing word and keep processing the rest.
            print(f"Error generating embedding for {word}: {exc}")
        else:
            embeddings_dict[word] = embedding

# Persist the word -> embedding mapping as a single JSON object.
output_file = "embeddings_dict.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(embeddings_dict, f)
Loading

0 comments on commit 97af4f8

Please sign in to comment.