Merge branch 'main' into style-guide
nsantacruz committed Jul 16, 2024
2 parents b870f91 + 3014d10 commit 05ca7eb
Showing 16 changed files with 1,611 additions and 1,072 deletions.
9 changes: 5 additions & 4 deletions app/requirements.txt
@@ -1,16 +1,17 @@
langchain[llms]==0.1.2
anthropic~=0.25.1
langchain[llms]~=0.2.1
anthropic~=0.26.1
stanza~=1.5.0
openai~=1.12.0
openai~=1.30.0
typer~=0.4.1
pydantic~=1.10.8
pydantic~=2.7.1
loguru~=0.7.2
tqdm~=4.66.1
celery[redis]~=5.2.7
diff-match-patch
dnspython~=2.5.0
tiktoken~=0.4.0
readability_lxml
tenacity==8.3.0
requests
numpy
git+https://github.com/Sefaria/LLM@v1.0.3#egg=sefaria_llm_interface&subdirectory=app/llm_interface
5 changes: 3 additions & 2 deletions app/topic_prompt/topic_prompt_generator.py
@@ -8,6 +8,7 @@
from topic_prompt.toprompt_llm_prompt import TopromptLLMPrompt, get_output_parser
from topic_prompt.toprompt import Toprompt, TopromptOptions
from topic_prompt.differentiate_writing import repeated_phrase
from util.general import escape_json_inner_quotes

from langchain.prompts import PromptTemplate
from basic_langchain.chat_models import ChatOpenAI
@@ -32,7 +33,7 @@ def _get_toprompt_options(lang: str, topic: Topic, source: TopicPromptSource, ot
responses += [HumanMessage(content=secondary_prompt.format())]

output_parser = get_output_parser()
parsed_output = output_parser.parse(curr_response.content)
parsed_output = output_parser.parse(escape_json_inner_quotes(curr_response.content))
parsed_output.title = _remove_colon_from_title_with_validation(responses, parsed_output.title)

topic_prompts += [Toprompt(topic, source, parsed_output.why, parsed_output.what, parsed_output.title)]
@@ -46,7 +47,7 @@ def _get_toprompt_options(lang: str, topic: Topic, source: TopicPromptSource, ot
partial_variables={"phrase": phrase_to_avoid, "format_instructions": get_output_parser().get_format_instructions()})
curr_response = llm([human_message] + responses + [HumanMessage(content=avoid_prompt.format())])
output_parser = get_output_parser()
parsed_output = output_parser.parse(curr_response.content)
parsed_output = output_parser.parse(escape_json_inner_quotes(curr_response.content))
parsed_output.title = _remove_colon_from_title_with_validation(responses + [curr_response], parsed_output.title)
topic_prompts[-1] = Toprompt(topic, source, parsed_output.why, parsed_output.what, parsed_output.title)

19 changes: 8 additions & 11 deletions app/util/cluster.py
@@ -2,9 +2,8 @@
from typing import Any, Callable, Union, TypeVar
from functools import reduce, partial
from tqdm import tqdm
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation
import random
from dataclasses import dataclass
from numpy import ndarray
@@ -122,7 +121,7 @@ def _build_clusters_from_cluster_results(labels, embeddings, items):
return clusters, noise_items, noise_embeddings


def _guess_optimal_kmeans_clustering(embeddings, verbose=True):
def _guess_optimal_n_clusters(embeddings, get_model, verbose=True):
if len(embeddings) <= 1:
return len(embeddings)

@@ -136,20 +135,19 @@ def _guess_optimal_kmeans_clustering(embeddings, verbose=True):
n_cluster_end = n_cluster_start + 1
n_clusters = range(n_cluster_start, n_cluster_end)
for n_cluster in tqdm(n_clusters, total=len(n_clusters), desc='guess optimal clustering', disable=not verbose):
kmeans = KMeans(n_clusters=n_cluster, n_init='auto', random_state=RANDOM_SEED).fit(embeddings)
labels = kmeans.labels_
sil_coeff = silhouette_score(embeddings, labels, metric='cosine', random_state=RANDOM_SEED)
model = get_model(n_cluster).fit(embeddings)
sil_coeff = silhouette_score(embeddings, model.labels_, metric='cosine', random_state=RANDOM_SEED)
if sil_coeff > best_sil_coeff:
best_sil_coeff = sil_coeff
best_num_clusters = n_cluster
if verbose:
print("Best silhouette score", round(best_sil_coeff, 4))
print("Best N", best_num_clusters, "Best silhouette score", round(best_sil_coeff, 4))
return best_num_clusters


def make_kmeans_algo_with_optimal_silhouette_score(embeddings: list[np.ndarray]):
n_clusters = _guess_optimal_kmeans_clustering(embeddings)
return KMeans(n_clusters=n_clusters, n_init='auto', random_state=RANDOM_SEED)
def get_agglomerative_clustering_labels_with_optimal_silhouette_score(embeddings: list[np.ndarray]):
n_clusters = _guess_optimal_n_clusters(embeddings, lambda n: AgglomerativeClustering(n_clusters=n, linkage='average', metric='cosine'))
return AgglomerativeClustering(n_clusters=n_clusters, linkage='average', metric='cosine').fit(embeddings).labels_


class SklearnClusterer(AbstractClusterer):
@@ -167,7 +165,6 @@ def __init__(self, embedding_fn: Callable[[str], ndarray],
self._get_cluster_labels = get_cluster_labels
self._breakup_large_clusters = breakup_large_clusters


def clone(self, **kwargs) -> 'SklearnClusterer':
"""
Return new object with all the same data except modifications specified in kwargs
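
Note on the clustering refactor above: `_guess_optimal_kmeans_clustering` becomes the model-agnostic `_guess_optimal_n_clusters`, which takes a `get_model` factory and picks the cluster count with the best cosine silhouette score, and the KMeans/HDBSCAN imports give way to `AgglomerativeClustering`. A minimal self-contained sketch of that selection loop, using synthetic embeddings and an illustrative search range (the real helper's bounds and seed handling may differ):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def guess_optimal_n_clusters(embeddings, get_model, verbose=True):
    # pick the n_clusters whose labels maximize the cosine silhouette coefficient
    best_n, best_score = 0, -1.0
    for n in range(2, min(len(embeddings), 10)):  # illustrative search range
        labels = get_model(n).fit(embeddings).labels_
        score = silhouette_score(embeddings, labels, metric='cosine')
        if score > best_score:
            best_n, best_score = n, score
    if verbose:
        print("Best N", best_n, "Best silhouette score", round(best_score, 4))
    return best_n

embeddings = np.random.rand(20, 8)  # stand-in for real sentence embeddings
n = guess_optimal_n_clusters(embeddings, lambda k: AgglomerativeClustering(n_clusters=k, linkage='average', metric='cosine'))
labels = AgglomerativeClustering(n_clusters=n, linkage='average', metric='cosine').fit(embeddings).labels_
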
17 changes: 17 additions & 0 deletions app/util/general.py
@@ -8,6 +8,23 @@
from basic_langchain.schema import SystemMessage, HumanMessage


def escape_json_inner_quotes(json_string):
"""
Given a JSON string, escape all double quotes that appear inside values to avoid invalid JSON.
Assumes the JSON is pretty-printed so that each value's closing quote is followed by a comma, closing brace, or newline.
:param json_string: JSON string whose values may contain unescaped double quotes
:return: the same JSON string with inner double quotes escaped
"""
pattern = r'(:\s*")(.*?)(?="[,}\n])'

def escape_quotes(match):
# Escape quotes within the matched group
return match.group(1) + match.group(2).replace('"', '\\"')

return re.sub(pattern, escape_quotes, json_string)


def get_source_text_with_fallback(source: TopicPromptSource, lang: str, auto_translate=False) -> str:
text = source.text.get(lang, "")
other_lang = "en" if lang == "he" else "he"
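
For context on the new helper above: LLM responses sometimes leave raw double quotes inside JSON string values, which breaks the topic-prompt output parser, so `escape_json_inner_quotes` escapes them before parsing. A quick self-contained illustration (the sample payload is invented):

import json
import re

def escape_json_inner_quotes(json_string):
    # mirrors the helper above: escape double quotes that sit inside pretty-printed JSON values
    pattern = r'(:\s*")(.*?)(?="[,}\n])'
    return re.sub(pattern, lambda m: m.group(1) + m.group(2).replace('"', '\\"'), json_string)

raw = '''{
    "title": "The "Why" of Tzedakah",
    "why": "Explains the obligation to give"
}'''

print(json.loads(escape_json_inner_quotes(raw))["title"])  # The "Why" of Tzedakah
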
38 changes: 36 additions & 2 deletions app/util/sentencizer.py
@@ -40,10 +40,44 @@ def print_completion(sentences):
print(sentence)


def claude_sentencizer(text, max_sentences=None):
text_left = text[:]
sentences = []
while len(text_left) > 0 and (max_sentences is None or len(sentences) < max_sentences):
next_sentence = claude_sentencizer_first_sentence(text_left)
if next_sentence is None:
break
istart = text_left.index(next_sentence)
text_left = text_left[istart+len(next_sentence):]
sentences.append(next_sentence)
return sentences


def claude_sentencizer_first_sentence(text):
from basic_langchain.chat_models import ChatAnthropic
from basic_langchain.schema import SystemMessage, HumanMessage
from util.general import get_by_xml_tag
system = SystemMessage(content="Given a text discussing Torah topics with little to no punctuation, "
"output the first sentence. Input is in <input> tags. The first sentence "
"should be output verbatim as it appears in <input> wrapped in "
"<first_sentence> tags. Since the input text has no punctuation, use your judgement as to where the first sentence ends. Prefer smaller sentences.")
human = HumanMessage(content=f"<input>{text}</input>")
llm = ChatAnthropic("claude-3-5-sonnet-20240620", temperature=0)
response = llm([system, human])
return get_by_xml_tag(response.content, "first_sentence")


if __name__ == '__main__':
import django
django.setup()
from sefaria.model import *
yo = """[26] See R. Joseph B. Soloveitchik, \"Tzedakah: Brotherhood and Fellowship,\" in Halakhic Morality: Essays on Ethics and Masorah, 126-127."""
sents = sentencize(yo)
print_completion(sents)
sup = Ref("Sifra, Vayikra Dibbura DeNedavah, Section 4:1").text('he').text
# sents = sentencize(yo)
# print_completion(sents)
sents = claude_sentencizer(sup, 5)
for sent in sents:
print(sent)
pass


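`claude_sentencizer` above leans on `util.general.get_by_xml_tag` (imported but not shown in this diff) to pull the model's answer out of the <first_sentence> tags. A minimal stand-in with the same observable behavior, offered as an assumption about how that helper works rather than its actual implementation:

import re

def get_by_xml_tag(text, tag_name):
    # return the content of the first <tag_name>...</tag_name> span, or None if the tag is absent
    match = re.search(rf"<{tag_name}>(.*?)</{tag_name}>", text, re.DOTALL)
    return match.group(1) if match else None

print(get_by_xml_tag("<first_sentence>In the beginning God created</first_sentence>", "first_sentence"))
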
2 changes: 1 addition & 1 deletion app/util/topic.py
@@ -74,7 +74,7 @@ def get_topic_description_from_top_sources(topic: Topic, verbose=True):


def get_top_trefs_from_slug(slug, top_n=10) -> list[str]:
out = get_topic(True, slug, with_refs=True, ref_link_type_filters=['about', 'popular-writing-of'])
out = get_topic(True, slug, 'english', with_refs=True, ref_link_type_filters=['about', 'popular-writing-of'])
try:
trefs = [d['ref'] for d in out['refs']['about']['refs'] if not d['is_sheet']]
trefs = filter_invalid_refs(trefs[:top_n])
4 changes: 2 additions & 2 deletions chart/Chart.yaml
@@ -20,9 +20,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.1.6
version: 1.1.7
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "v1.0.4"
appVersion: "v1.1.3"
2 changes: 1 addition & 1 deletion chart/values.yaml
@@ -11,7 +11,7 @@ deployment:
memory: 500Mi
limits:
cpu: 2000m
memory: 1Gi
memory: 5Gi
anthropicSecretRef: anthropic-api-key
openaiSecretRef: openai-api-key
redis:
84 changes: 77 additions & 7 deletions experiments/topic_source_curation/choose.py
@@ -10,8 +10,9 @@
import django
django.setup()
from sefaria.pagesheetrank import pagerank_rank_ref_list
from sefaria.model.text import Ref
from sefaria.model.text import Ref, library
from sefaria.client.wrapper import get_links
from sefaria.helper.llm.topic_prompt import make_topic_prompt_source
from sefaria.recommendation_engine import RecommendationEngine
import voyageai
from tqdm import tqdm
@@ -83,15 +84,80 @@ def choose(clusters: list[Cluster], topic: Topic) -> (list[SummarizedSource], li
link_pairs = _get_link_pairs_to_avoid(sorted_items)
chosen_sources, chosen_penalties, not_interesting_trefs = solve_clusters_iteratively(clusters, topic, sorted_items, primary_sources_trefs, link_pairs)
chosen_sources = (Artifact(chosen_sources)
.pipe(_remove_known_bad_sources)
.pipe(_switch_daf_shevui_for_talmud)
.pipe(_sort_sources_by_gpt_instruction, topic)
.pipe(_put_primary_sources_first, primary_sources_trefs)
.pipe(_remove_not_interesting_sources, not_interesting_trefs)
.pipe(_remove_duplicate_books).data
)
if len(chosen_sources) <= 5:
# try to pick more
clusters = _break_up_clusters(topic, clusters)
sorted_clusters = clusters
chosen_sources, chosen_penalties, not_interesting_trefs = solve_clusters_iteratively(clusters, topic, sorted_items, primary_sources_trefs, link_pairs)
chosen_sources = (Artifact(chosen_sources)
.pipe(_remove_known_bad_sources)
.pipe(_switch_daf_shevui_for_talmud)
.pipe(_sort_sources_by_gpt_instruction, topic)
.pipe(_put_primary_sources_first, primary_sources_trefs)
.pipe(_remove_not_interesting_sources, not_interesting_trefs)
.pipe(_remove_duplicate_books).data
)
save_clusters_and_chosen_sources_to_html(topic, sorted_clusters, chosen_sources, chosen_penalties, primary_sources_trefs, not_interesting_trefs)
return chosen_sources, clusters


def _break_up_clusters(topic: Topic, clusters: list[Cluster]):
new_clusters = clusters
counter = 0
while counter < 5:
temp_clusters = _break_up_largest_cluster(topic, new_clusters)
if temp_clusters is None:
break
new_clusters = temp_clusters
counter += 1
return new_clusters


def _break_up_largest_cluster(topic: Topic, clusters: list[Cluster]):
from util.cluster import SklearnClusterer
from experiments.topic_source_curation.cluster import embed_text_openai, get_cluster_summary_based_on_topic
from sklearn.cluster import AgglomerativeClustering
topic_desc = get_topic_str_for_prompts(topic, verbose=False)
get_cluster_summary = partial(get_cluster_summary_based_on_topic, topic_desc)
largest_cluster = max(clusters, key=len)
if len(largest_cluster) >= 6:
get_cluster_labels = lambda x: AgglomerativeClustering(n_clusters=2).fit(x).labels_
clusterer = SklearnClusterer(embed_text_openai, get_cluster_labels, get_cluster_summary, breakup_large_clusters=False)
new_clusters = [c for c in clusters if c != largest_cluster]
new_clusters += clusterer.cluster_items(largest_cluster.items)
return new_clusters
return clusters


def _remove_known_bad_sources(chosen_sources: list[SummarizedSource]) -> list[SummarizedSource]:
known_bad_titles = set(library.get_indexes_in_category_path(["Reference", "Dictionary"]))
return filter(lambda source: source.source.book_title['en'] not in known_bad_titles, chosen_sources)


def _switch_daf_shevui_for_talmud(chosen_sources: list[SummarizedSource]) -> list[SummarizedSource]:
import re
new_sources = []
for source in chosen_sources:
if source.source.ref.startswith('Daf Shevui'):
talmud_amud = re.search(r'Daf Shevui to (.*):\d+$', source.source.ref).group(1)
links = list(filter(lambda link: talmud_amud in link['ref'], get_links(source.source.ref, with_text=False)))
if len(links) == 1:
topic_prompt_source = make_topic_prompt_source(Ref(links[0]['ref']), '', False)
new_sources.append(SummarizedSource(topic_prompt_source, source.summary, source.embedding))
else:
print("LEN DAF SHEVUI LINKS", len(links), source.source.ref)
else:
new_sources.append(source)
return new_sources


def _remove_not_interesting_sources(chosen_sources: list[SummarizedSource], not_interesting_trefs: list[str]) -> list[SummarizedSource]:
return filter(lambda source: source.source.ref not in not_interesting_trefs, chosen_sources)

@@ -244,13 +310,17 @@ def choose_primary_sources(clusters: list[Cluster]) -> list[str]:
"""
orefs = reduce(lambda x, y: x + [Ref(item.source.ref) for item in y.items], clusters, [])
refs, pageranks = zip(*pagerank_rank_ref_list(orefs))
max_ref = refs[0].normal()
thresh = mean(pageranks) + 2 * stdev(pageranks)
is_primary = pageranks[0] > thresh
print(max_ref, "IS PRIMARY:", is_primary, round(pageranks[0], 3), round(thresh, 3))
if is_primary:
return [max_ref]
return []
max_refs = []
for ref, pr in zip(refs[:3], pageranks[:3]):
max_ref = ref.normal()
is_primary = pr > thresh
print(max_ref, "IS PRIMARY:", is_primary, round(pr, 3), round(thresh, 3))
if is_primary:
max_refs.append(max_ref)
else:
break
return max_refs


def choose_ideal_clusters(clusters: list[Cluster], max_clusters: int) -> list[Cluster]:
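
The rewrite of `choose_primary_sources` above moves from promoting a single top ref to promoting up to three: a ref counts as primary when its pagerank exceeds the mean by more than two standard deviations, and the scan stops at the first ref that falls below the threshold. A toy sketch of the same thresholding with made-up scores:

from statistics import mean, stdev

refs = [f"Ref {i}" for i in range(10)]           # stand-ins for Ref(...).normal() values
pageranks = [0.90] + [0.05] * 9                  # hypothetical scores, sorted descending

thresh = mean(pageranks) + 2 * stdev(pageranks)  # ~0.67 for these numbers
primary = []
for ref, pr in zip(refs[:3], pageranks[:3]):
    if pr > thresh:
        primary.append(ref)
    else:
        break  # scores are sorted, so nothing later can pass

print(primary)  # ['Ref 0'] -- only the clear outlier is treated as primary
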
14 changes: 9 additions & 5 deletions experiments/topic_source_curation/cluster.py
@@ -11,7 +11,7 @@
from basic_langchain.embeddings import VoyageAIEmbeddings, OpenAIEmbeddings
from util.pipeline import Artifact
from util.general import get_by_xml_tag, run_parallel, get_by_xml_list
from util.cluster import Cluster, OptimizingClusterer, SklearnClusterer, AbstractClusterItem
from util.cluster import Cluster, OptimizingClusterer, SklearnClusterer, AbstractClusterItem, get_agglomerative_clustering_labels_with_optimal_silhouette_score
from experiments.topic_source_curation.common import get_topic_str_for_prompts
from experiments.topic_source_curation.summarized_source import SummarizedSource
import numpy as np
@@ -54,7 +54,7 @@ def embed_text_voyageai(text):
return np.array(VoyageAIEmbeddings(model="voyage-large-2-instruct").embed_query(text))


def _get_cluster_summary_based_on_topic(topic_desc, strs_to_summarize):
def get_cluster_summary_based_on_topic(topic_desc, strs_to_summarize):
llm = ChatOpenAI("gpt-4o", 0)
system = SystemMessage(content="You are a Jewish scholar familiar with Torah. Given a few ideas (wrapped in <idea> "
"XML tags) about a given topic (wrapped in <topic> XML tags) output a summary of the "
@@ -126,11 +126,15 @@ def _cluster_sources(sources: list[SummarizedSource], topic) -> list[Cluster]:
for i in range(len(HDBSCAN_PARAM_OPTS['min_samples'])):
hdbscan_params = _get_ith_hdbscan_params(i)
temp_clusterer = SklearnClusterer(embed_text_openai,
lambda x: HDBSCAN(**hdbscan_params).fit(x).labels_,
partial(_get_cluster_summary_based_on_topic, topic_desc), verbose=False)
lambda x: HDBSCAN(**hdbscan_params).fit(x).labels_,
partial(get_cluster_summary_based_on_topic, topic_desc), verbose=False)
clusterers.append(temp_clusterer)
temp_clusterer = SklearnClusterer(embed_text_openai, lambda x: AffinityPropagation(damping=0.7, max_iter=1000, convergence_iter=100).fit(x).predict(x), partial(_get_cluster_summary_based_on_topic, topic_desc), verbose=False)
temp_clusterer = SklearnClusterer(embed_text_openai, lambda x: AffinityPropagation(damping=0.7, max_iter=1000, convergence_iter=100).fit(x).predict(x), partial(get_cluster_summary_based_on_topic, topic_desc), verbose=False)
clusterers.append(temp_clusterer)
# temp_clusterer = SklearnClusterer(embed_text_openai,
# get_agglomerative_clustering_labels_with_optimal_silhouette_score,
#                                   partial(get_cluster_summary_based_on_topic, topic_desc), verbose=False)
# clusterers = [temp_clusterer]

clusterer_optimizer = OptimizingClusterer(embed_text_openai, clusterers, verbose=False)
clusters = clusterer_optimizer.cluster_and_summarize(sources)
8 changes: 5 additions & 3 deletions experiments/topic_source_curation/curator.py
@@ -27,12 +27,12 @@
TOPICS_TO_CURATE_CSV_PATH = 'input/Topic project plan - 1000 topics pages product - list of all topic slugs.csv'


def get_topics_to_curate():
def get_topics_to_curate() -> list[Topic]:
topics = []
with open(TOPICS_TO_CURATE_CSV_PATH, "r") as fin:
cin = csv.DictReader(fin)
for row in cin:
if len(row['curated']) > 0:
if len(row['curated'].strip()) > 0:
continue
slug = row['slug'].strip()
try:
Expand All @@ -46,7 +46,9 @@ def get_topics_to_curate():
def save_curation(data, topic: Topic) -> list[SummarizedSource]:
sources, clusters = data
topic.description['en'] = get_or_generate_topic_description(topic, verbose=False)
contexts = run_parallel(sources, partial(get_context_for_source, topic=topic, clusters=clusters), max_workers=20, desc="Get source context")
# contexts = run_parallel(sources, partial(get_context_for_source, topic=topic, clusters=clusters), max_workers=20, desc="Get source context")
# not finding context helpful
contexts = ['']*len(sources)
out = [{
"ref": source.source.ref,
"context": contexts[isource]