Skip to content

Commit

Permalink
fix: remove dependency on sefaria project from uniqueness_of_source.py
Browse files Browse the repository at this point in the history
  • Loading branch information
nsantacruz committed Jun 17, 2024
1 parent 4c08add commit 67c64c1
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
3 changes: 1 addition & 2 deletions app/topic_prompt/uniqueness_of_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from functools import reduce
from typing import List
from util.general import get_source_text_with_fallback, get_by_xml_tag
from util.topic import get_or_generate_topic_description
from sefaria_llm_interface.topic_prompt import TopicPromptSource
from sefaria_llm_interface import Topic

Expand All @@ -17,7 +16,7 @@

def _get_prompt_inputs(source, other_sources: List[TopicPromptSource], topic: Topic):
topic_title = topic.title['en']
topic_description = get_or_generate_topic_description(topic, verbose=False)
topic_description = topic.description.get('en', 'N/A')
comparison_sources_list = []
max_len = 7000
for other_source in other_sources:
Expand Down
29 changes: 26 additions & 3 deletions experiments/topic_source_curation/curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from sefaria_llm_interface.topic_prompt import TopicPromptSource
from util.pipeline import Artifact
from util.general import run_parallel
from util.topic import get_or_generate_topic_description
import csv
import random
import json
Expand Down Expand Up @@ -44,6 +45,7 @@ def get_topics_to_curate():

def save_curation(data, topic: Topic) -> list[SummarizedSource]:
sources, clusters = data
topic.description['en'] = get_or_generate_topic_description(topic, verbose=False)
contexts = run_parallel(sources, partial(get_context_for_source, topic=topic, clusters=clusters), max_workers=20, desc="Get source context")
out = [{
"ref": source.source.ref,
Expand All @@ -67,13 +69,34 @@ def curate_topic(topic: Topic) -> list[TopicPromptSource]:
)


def get_topics_that_havent_been_curated_yet() -> list[Topic]:
    """
    Return the topics from ``get_topics_to_curate()`` that have no curation file yet.

    A topic counts as already curated when the ``output`` folder contains a
    file named exactly ``curation_<slug>.json`` for its slug. The slug of every
    topic that still needs curation is printed before the list is returned.

    :return: topics whose slug has no matching curation file, in the order
        returned by ``get_topics_to_curate()``
    """
    from os import listdir
    from os.path import isfile, join
    import re

    # Raw string: "\." must reach the regex engine as an escaped dot instead of
    # being an invalid string escape (a SyntaxWarning on recent Python versions).
    curation_file_pattern = re.compile(r"curation_(.*)\.json")

    slugs_curated = set()
    for filename in listdir("output"):
        # fullmatch requires the whole name to fit the pattern, so leftovers
        # such as "curation_x.json.bak" no longer produce a bogus slug "x.bak".
        matched = curation_file_pattern.fullmatch(filename)
        if matched and isfile(join("output", filename)):
            slugs_curated.add(matched.group(1))

    topics_not_yet_curated = [
        topic for topic in get_topics_to_curate() if topic.slug not in slugs_curated
    ]
    for topic in topics_not_yet_curated:
        print(topic.slug)
    return topics_not_yet_curated


if __name__ == '__main__':
library.rebuild_toc()
topics = get_topics_to_curate()
topics = get_topics_that_havent_been_curated_yet()
print(len(topics))
for t in topics[356:]:
for t in topics:
print("CURATING", t.slug)
try:
curated_sources = curate_topic(t)
except:
except Exception as e:
print(f"FAILED", t.slug)
print(e)

0 comments on commit 67c64c1

Please sign in to comment.