Skip to content

Commit

Permalink
refactor: move experiments into experiments folder.
Browse files Browse the repository at this point in the history
Also make translation more robust
  • Loading branch information
nsantacruz committed Apr 16, 2024
1 parent 47a5959 commit 529132b
Show file tree
Hide file tree
Showing 37 changed files with 131 additions and 63 deletions.
27 changes: 18 additions & 9 deletions app/basic_langchain/chat_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List
from abc import ABC, abstractmethod
from anthropic import Anthropic
from anthropic import Anthropic, InternalServerError
from time import sleep
from openai import OpenAI
from basic_langchain.schema import AIMessage, AbstractMessage, LLMCompany
from basic_langchain.cache import sqlite_cache
Expand Down Expand Up @@ -51,18 +52,26 @@ def __init__(self, model, temperature, max_tokens=4096):
self.max_tokens = max_tokens

@sqlite_cache('chat')
def __call__(self, messages: List[AbstractMessage]) -> AIMessage:
def __call__(self, messages: list[AbstractMessage]) -> AIMessage:
system = "You are a helpful AI."
if len(messages) > 0 and messages[0].role == "system":
# claude wants system messages as a kwarg
system = messages[0].content
messages.pop(0)
response = self.client.messages.create(
model=self.model,
system=system,
temperature=self.temperature,
max_tokens=self.max_tokens,
messages=self._serialize_messages(messages)
)
response = self._api_call(system, messages)
text = response.content[0].text
return AIMessage(text)

def _api_call(self, system, messages: list[AbstractMessage], max_retries=None, retry_delay=5):
    """
    Call ``self.client.messages.create`` and retry whenever it raises
    ``InternalServerError``.

    Retries are done in a loop rather than by recursion, so a long run of
    server errors cannot exhaust the interpreter's recursion limit.

    :param system: system prompt, passed through as the ``system`` kwarg
    :param messages: conversation messages; serialized with
        ``self._serialize_messages`` on every attempt
    :param max_retries: maximum number of retries after the first failed
        attempt. ``None`` (the default) retries indefinitely.
    :param retry_delay: seconds to sleep between attempts (default 5)
    :return: whatever ``self.client.messages.create`` returns
    :raises InternalServerError: only when ``max_retries`` is set and has
        been exceeded; the last error is re-raised unchanged
    """
    failures = 0
    while True:
        try:
            return self.client.messages.create(
                model=self.model,
                system=system,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                messages=self._serialize_messages(messages)
            )
        except InternalServerError:
            print("Internal Server Error")
            failures += 1
            if max_retries is not None and failures > max_retries:
                # Caller asked for a bounded number of retries; give up.
                raise
            # Give the server a moment before trying again.
            sleep(retry_delay)
7 changes: 6 additions & 1 deletion app/translation/poc.py → app/translation/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from basic_langchain.chat_models import ChatAnthropic
from basic_langchain.schema import HumanMessage, SystemMessage
from anthropic import BadRequestError

random.seed(26)

Expand All @@ -24,7 +25,11 @@ def translate_text(text: str, context: str = None):
task_prompt = f"<context>{context}</context>{task_prompt}"
task_message = HumanMessage(content=task_prompt)
llm = ChatAnthropic(model="claude-3-opus-20240229", temperature=0)
response_message = llm([identity_message, task_message])
try:
response_message = llm([identity_message, task_message])
except BadRequestError:
print(f"BadRequestError\n{task_message.content}")
return ""
translation = get_by_xml_tag(response_message.content, 'translation')
if translation is None:
print("TRANSLATION FAILED")
Expand Down
2 changes: 1 addition & 1 deletion app/util/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def get_source_text_with_fallback(source: TopicPromptSource, lang: str, auto_tra
text = source.text.get(lang, "")
if len(text) == 0:
if auto_translate and lang == "en":
from translation.poc import translate_text
from translation.translation import translate_text
text = translate_text(text)
else:
other_lang = "en" if lang == "he" else "he"
Expand Down
2 changes: 1 addition & 1 deletion app/util/sefaria_specific.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_ref_text_with_fallback(oref: Ref, lang: str, auto_translate=False) -> st
raw_text = get_raw_ref_text(oref, lang)
if len(raw_text) == 0:
if auto_translate and lang == "en":
from translation.poc import translate_segment
from translation.translation import translate_segment
raw_text = translate_segment(oref.normal())
else:
other_lang = "en" if lang == "he" else "he"
Expand Down
3 changes: 3 additions & 0 deletions experiments/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Experiments

This directory is for code that experiments with LLMs but is not (yet) meant to be deployed to the LLM service. The only code that is deployed is in `/app`.
File renamed without changes.
File renamed without changes.
47 changes: 47 additions & 0 deletions experiments/topic_source_curation/exploration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Functions to explore existing topic curations
"""
from topic_source_curation.common import get_datasets
from collections import defaultdict
import django
django.setup()
from sefaria.model import *


if __name__ == '__main__':
    # Count, over every source of every example in the second dataset returned
    # by get_datasets(), which corpus / author / title it belongs to, then
    # print the buckets from most to least frequent.
    _, good_examples = get_datasets()
    tally = defaultdict(int)
    for curated in good_examples:
        for src in curated.sources:
            idx = Ref(src.ref).index
            # Bucket priority: primary corpus, else a sole author, else the title.
            if idx.get_primary_corpus():
                bucket = idx.get_primary_corpus()
            elif len(getattr(idx, 'authors', [])) == 1:
                bucket = idx.authors[0]
            else:
                bucket = idx.title
            tally[bucket] += 1
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    for bucket, hits in ranked:
        print(bucket, hits)

"""
quick takeaways:
- sources
- Tanakh, Mishnah, Midrash Rabbah, Bavli, Yerushalmi, MT, SA, Siddurim, Rashi, Zohar
- Pirkei DeRabbi Eliezer, Tanchuma, Ibn Ezra, Ramban, Mekhilta
- Acharonim that give major insights
- maharl
- ramchal
- Shnei Luchot Habrit
- Nachman of Breslov
- levi-yitzchak-of-berditchev
- tzadok-hakohen-of-lublin
- Modern english
- erica brown
- Peninei Halacha
- Eliezer Berkovitz
This is a fairly comprehensive list of everything that comes up 3 times or more
"""


Original file line number Diff line number Diff line change
@@ -1,49 +1,49 @@
slug,category,issue,what should it have
avishai,biblical figures,5 first sources are from Shmuel,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary "
birds,nature,has Chullin too many times,"first source should be from Genesis, Chullin should appear once"
songs,art,feels like a search page,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary"
tsaddik,n/a,feels like a search page,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary, Chassidut"
flood,stories,first three sources about the story but not exact psukim ,sources should give an overview of the story
building-of-the-second-temple,n/a,has Chagai too many times ,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary, Chassidut"
garden-of-eden,stories,Bereshit Rabbah and Genesis appear too many times ,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary, Chassidut"
faith,values,Too many Exodus refs ,
religion,beliefs,Too many Sefer Haikarim refs,
avodat-hashem,values,Too many deuteronomy refs,
bread,food,feels like a search page,
the-spies,biblical figures,Too many Numbers refs,
canaan,places,has no sources but should,
slaves,social issues,feels like a search page,
wars,social issues,Too many deuteronomy refs,
parah-adumah,ritual objetcs,Too many Numbers refs,
leadership,social issues,Too many Numbers refs,
kings,n/a,Too many deuteronomy refs,
elijah,biblical figures,Kings appear too many times,
prophecy,beliefs,feels like a search page,
women,social issues,feels like a search page,
daughters-of-zelophehad,biblical figures,Too many Numbers refs,
night,nature,doesn't have sources from Torah,
exile,history,Doesn't start with Abraham and other known texts,
music,art,feels like a search page,
miriam,biblical figures,Too many Exodus and Numbers refs,
rabbis,n/a,"only one source, should have more ",
money,social issues,Too many Exodus refs,
high-priest,biblical figures,feels like a search page,
free-will,philosophy,Too many deuteronomy refs,
minhag,Halachic Principles,feels like a search page,
bal-tashchit,values,feels like a search page,
shekhinah,beliefs,Too many Exodus refs,
parah-adumah,ritual objetcs,Too many Numbers refs,
love,values,feels like a search page,
golems,supernatural,mostly talmudic sources ,
revelation,beliefs,Too many Exodus refs,
yetzer-hara,beliefs,feels like a search page,
birth,life cycle,feels like a search page,
visiting-the-sick,values,mostly talmudic sources ,
mitzvot,beliefs,feels like a search page,
leadership,social issues,Too many Numbers refs,
shalom-bayit,values,can have more sources ,
wine,food,feels like a search page,
angels,supernatural,feels like a search page,
shivah-asar-betammuz,jewish calendar,first three sources give the same information ,
korach,biblical figures,Too many Numbers refs,
slug,category,issue,what should it have
avishai,biblical figures,5 first sources are from Shmuel,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary "
birds,nature,has Chullin too many times,"first source should be from Genesis, Chullin should appear once"
songs,art,feels like a search page,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary"
tsaddik,n/a,feels like a search page,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary, Chassidut"
flood,stories,first three sources about the story but not exact psukim ,sources should give an overview of the story
building-of-the-second-temple,n/a,has Chagai too many times ,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary, Chassidut"
garden-of-eden,stories,Bereshit Rabbah and Genesis appear too many times ,"assortment of sources that can be found from search such as Midrash, Talmud, Commentary, Chassidut"
faith,values,Too many Exodus refs ,
religion,beliefs,Too many Sefer Haikarim refs,
avodat-hashem,values,Too many deuteronomy refs,
bread,food,feels like a search page,
the-spies,biblical figures,Too many Numbers refs,
canaan,places,has no sources but should,
slaves,social issues,feels like a search page,
wars,social issues,Too many deuteronomy refs,
parah-adumah,ritual objetcs,Too many Numbers refs,
leadership,social issues,Too many Numbers refs,
kings,n/a,Too many deuteronomy refs,
elijah,biblical figures,Kings appear too many times,
prophecy,beliefs,feels like a search page,
women,social issues,feels like a search page,
daughters-of-zelophehad,biblical figures,Too many Numbers refs,
night,nature,doesn't have sources from Torah,
exile,history,Doesn't start with Abraham and other known texts,
music,art,feels like a search page,
miriam,biblical figures,Too many Exodus and Numbers refs,
rabbis,n/a,"only one source, should have more ",
money,social issues,Too many Exodus refs,
high-priest,biblical figures,feels like a search page,
free-will,philosophy,Too many deuteronomy refs,
minhag,Halachic Principles,feels like a search page,
bal-tashchit,values,feels like a search page,
shekhinah,beliefs,Too many Exodus refs,
parah-adumah,ritual objetcs,Too many Numbers refs,
love,values,feels like a search page,
golems,supernatural,mostly talmudic sources ,
revelation,beliefs,Too many Exodus refs,
yetzer-hara,beliefs,feels like a search page,
birth,life cycle,feels like a search page,
visiting-the-sick,values,mostly talmudic sources ,
mitzvot,beliefs,feels like a search page,
leadership,social issues,Too many Numbers refs,
shalom-bayit,values,can have more sources ,
wine,food,feels like a search page,
angels,supernatural,feels like a search page,
shivah-asar-betammuz,jewish calendar,first three sources give the same information ,
korach,biblical figures,Too many Numbers refs,
medicine,health,feels like a search page,
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import django
django.setup()
from sefaria.model import *
from translation.poc import translate_segment
from translation.translation import translate_segment
from util.sefaria_specific import get_normalized_ref_text, normalizer
import random
from tqdm import tqdm
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import django
django.setup()
from sefaria.model import *
from translation.poc import translate_text
from translation.translation import translate_text
from util.sefaria_specific import get_normalized_ref_text
from tqdm import tqdm
import csv
Expand All @@ -18,6 +18,10 @@ def translate_book(title):
cout.writerow({"ref": segment_oref.normal(), "hebrew": segment_text, "english": translation})
fout.close()

def translate_books(titles):
    """
    Run ``translate_book`` on every title in ``titles``, in iteration order.

    :param titles: iterable of book titles
    :return: None
    """
    for book_title in titles:
        translate_book(book_title)


if __name__ == '__main__':
translate_book("Shenei Luchot HaBerit")
translate_books(["Noam Elimelech", "Tanna Debei Eliyahu Rabbah", "Tanna debei Eliyahu Zuta", "Ba'al Shem Tov"])

0 comments on commit 529132b

Please sign in to comment.