Skip to content

Commit

Permalink
fix: improve ingestion of curated links
Browse files (browse the repository at this point in the history)
  • Loading branch information
nsantacruz committed Jun 20, 2024
1 parent 3fd1bae commit c5f5571
Showing 1 changed file with 46 additions and 21 deletions.
67 changes: 46 additions & 21 deletions experiments/topic_source_curation/scripts/make_ref_topic_links.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,7 @@
from experiments.topic_source_curation.curator import get_topics_to_curate
import django

from sefaria.model import RefTopicLink
from sefaria.model import RefTopicLink, Ref
from sefaria.model import Topic as SefariaTopic
from sefaria.helper.llm.topic_prompt import make_llm_topic
django.setup()
Expand All @@ -13,7 +13,7 @@
def _make_ref_topic_link(topic, tref, context, i):
return {
"toTopic": topic.slug,
"ref": tref,
"ref": Ref(tref).normal(),
"linkType": "about",
"class": "refTopic",
"dataSource": "learning-team",
Expand All @@ -25,7 +25,7 @@ def _make_ref_topic_link(topic, tref, context, i):
},
"descriptions": {
"en": {
"ai_context": context,
"ai_context": "N/A",
"published": False,
"review_state": "not reviewed",
}
Expand All @@ -35,28 +35,55 @@ def _make_ref_topic_link(topic, tref, context, i):

def save_ref_topic_links():
import json
with open("data/private/ref_topic_links.json", "r") as fin:
with open("scripts/ref_topic_links.json", "r") as fin:
links = json.load(fin)
len(links)
for link in links:
try:
existing_links = RefTopicLinkSet({
"toTopic": link["toTopic"],
"ref": link["ref"],
"linkType": link["linkType"],
"dataSource": link["dataSource"],
})
except Exception as e:
continue
print(e)
if existing_links:
existing_links.delete()
RefTopicLink(link).save()

def _generate_all_prompts():
from tqdm import tqdm
slugs_to_generate = {l.toTopic for l in RefTopicLinkSet({"generatedBy": "auto-curator"})}
slugs_to_generate = [
'balaam',
'caleb',
'parents',
'parah-adumah',
'hunger',
'disability',
'aarons-death',
'josephs-dream',
'empathy',
'leviathan',
'memory',
"naaman",
"nephilim",
"naftali",
"sisera",
"sennacherib",
"serah-the-daughter-of-asher",
"iddo",
"obadiah",
"og",
"uzziah",
"ezra",
"achan",
"the-sons-of-eli",
"eli",
"amos",
"amram",
"amasa",
"efron",
"er-(firstborn-son-of-judah)",
"esau",
"potiphar",
"the-concubine-of-givah",
"pharaoh",
"zelophehad",
"keturah",
"cain",
]

for slug in tqdm(slugs_to_generate):
_generate_prompts_for_slug(slug)

Expand All @@ -72,12 +99,10 @@ def _generate_prompts_for_slug(slug):

if __name__ == '__main__':
links = []
# topics = random.sample(get_topics_to_curate(), 50)
topics = [make_llm_topic(SefariaTopic.init(slug)) for slug in [
'creation-of-man'
]]
# topics = get_topics_to_curate()[94:120]
topics = [make_llm_topic(SefariaTopic.init('abel'))]
for topic in topics:
print(topic.slug)
print(f'"{topic.slug}",')
with open(f"output/curation_{topic.slug}.json", "r") as fin:
curation = json.load(fin)
for i, entry in enumerate(curation):
Expand Down

0 comments on commit c5f5571

Please sign in to comment.