Merge branch 'main' into style-guide
nsantacruz committed Jul 16, 2024
2 parents b870f91 + 3014d10 commit 05ca7eb
Showing 16 changed files with 1,611 additions and 1,072 deletions.
9 changes: 5 additions & 4 deletions app/requirements.txt
@@ -1,16 +1,17 @@
langchain[llms]==0.1.2
anthropic~=0.25.1
langchain[llms]~=0.2.1
anthropic~=0.26.1
stanza~=1.5.0
openai~=1.12.0
openai~=1.30.0
typer~=0.4.1
pydantic~=1.10.8
pydantic~=2.7.1
loguru~=0.7.2
tqdm~=4.66.1
celery[redis]~=5.2.7
diff-match-patch
dnspython~=2.5.0
tiktoken~=0.4.0
readability_lxml
tenacity==8.3.0
requests
numpy
git+https://github.com/Sefaria/LLM@v1.0.3#egg=sefaria_llm_interface&subdirectory=app/llm_interface
5 changes: 3 additions & 2 deletions app/topic_prompt/topic_prompt_generator.py
@@ -8,6 +8,7 @@
from topic_prompt.toprompt_llm_prompt import TopromptLLMPrompt, get_output_parser
from topic_prompt.toprompt import Toprompt, TopromptOptions
from topic_prompt.differentiate_writing import repeated_phrase
from util.general import escape_json_inner_quotes

from langchain.prompts import PromptTemplate
from basic_langchain.chat_models import ChatOpenAI
@@ -32,7 +33,7 @@ def _get_toprompt_options(lang: str, topic: Topic, source: TopicPromptSource, ot
responses += [HumanMessage(content=secondary_prompt.format())]

output_parser = get_output_parser()
parsed_output = output_parser.parse(curr_response.content)
parsed_output = output_parser.parse(escape_json_inner_quotes(curr_response.content))
parsed_output.title = _remove_colon_from_title_with_validation(responses, parsed_output.title)

topic_prompts += [Toprompt(topic, source, parsed_output.why, parsed_output.what, parsed_output.title)]
@@ -46,7 +47,7 @@ def _get_toprompt_options(lang: str, topic: Topic, source: TopicPromptSource, ot
partial_variables={"phrase": phrase_to_avoid, "format_instructions": get_output_parser().get_format_instructions()})
curr_response = llm([human_message] + responses + [HumanMessage(content=avoid_prompt.format())])
output_parser = get_output_parser()
parsed_output = output_parser.parse(curr_response.content)
parsed_output = output_parser.parse(escape_json_inner_quotes(curr_response.content))
parsed_output.title = _remove_colon_from_title_with_validation(responses + [curr_response], parsed_output.title)
topic_prompts[-1] = Toprompt(topic, source, parsed_output.why, parsed_output.what, parsed_output.title)

19 changes: 8 additions & 11 deletions app/util/cluster.py
@@ -2,9 +2,8 @@
from typing import Any, Callable, Union, TypeVar
from functools import reduce, partial
from tqdm import tqdm
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation
import random
from dataclasses import dataclass
from numpy import ndarray
@@ -122,7 +121,7 @@ def _build_clusters_from_cluster_results(labels, embeddings, items):
return clusters, noise_items, noise_embeddings


def _guess_optimal_kmeans_clustering(embeddings, verbose=True):
def _guess_optimal_n_clusters(embeddings, get_model, verbose=True):
if len(embeddings) <= 1:
return len(embeddings)

@@ -136,20 +135,19 @@ def _guess_optimal_kmeans_clustering(embeddings, verbose=True):
n_cluster_end = n_cluster_start + 1
n_clusters = range(n_cluster_start, n_cluster_end)
for n_cluster in tqdm(n_clusters, total=len(n_clusters), desc='guess optimal clustering', disable=not verbose):
kmeans = KMeans(n_clusters=n_cluster, n_init='auto', random_state=RANDOM_SEED).fit(embeddings)
labels = kmeans.labels_
sil_coeff = silhouette_score(embeddings, labels, metric='cosine', random_state=RANDOM_SEED)
model = get_model(n_cluster).fit(embeddings)
sil_coeff = silhouette_score(embeddings, model.labels_, metric='cosine', random_state=RANDOM_SEED)
if sil_coeff > best_sil_coeff:
best_sil_coeff = sil_coeff
best_num_clusters = n_cluster
if verbose:
print("Best silhouette score", round(best_sil_coeff, 4))
print("Best N", best_num_clusters, "Best silhouette score", round(best_sil_coeff, 4))
return best_num_clusters


def make_kmeans_algo_with_optimal_silhouette_score(embeddings: list[np.ndarray]):
n_clusters = _guess_optimal_kmeans_clustering(embeddings)
return KMeans(n_clusters=n_clusters, n_init='auto', random_state=RANDOM_SEED)
def get_agglomerative_clustering_labels_with_optimal_silhouette_score(embeddings: list[np.ndarray]):
n_clusters = _guess_optimal_n_clusters(embeddings, lambda n: AgglomerativeClustering(n_clusters=n, linkage='average', metric='cosine'))
return AgglomerativeClustering(n_clusters=n_clusters, linkage='average', metric='cosine').fit(embeddings).labels_


class SklearnClusterer(AbstractClusterer):
@@ -167,7 +165,6 @@ def __init__(self, embedding_fn: Callable[[str], ndarray],
self._get_cluster_labels = get_cluster_labels
self._breakup_large_clusters = breakup_large_clusters


def clone(self, **kwargs) -> 'SklearnClusterer':
"""
Return new object with all the same data except modifications specified in kwargs
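
Note on the clustering refactor above: `_guess_optimal_kmeans_clustering` becomes the model-agnostic `_guess_optimal_n_clusters`, which takes a `get_model` factory and picks the cluster count with the best cosine silhouette score, and the KMeans/HDBSCAN imports give way to `AgglomerativeClustering`. A minimal self-contained sketch of that selection loop, using synthetic embeddings and an illustrative search range (the real helper's bounds and seed handling may differ):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def guess_optimal_n_clusters(embeddings, get_model, verbose=True):
    # pick the n_clusters whose labels maximize the cosine silhouette coefficient
    best_n, best_score = 0, -1.0
    for n in range(2, min(len(embeddings), 10)):  # illustrative search range
        labels = get_model(n).fit(embeddings).labels_
        score = silhouette_score(embeddings, labels, metric='cosine')
        if score > best_score:
            best_n, best_score = n, score
    if verbose:
        print("Best N", best_n, "Best silhouette score", round(best_score, 4))
    return best_n

embeddings = np.random.rand(20, 8)  # stand-in for real sentence embeddings
n = guess_optimal_n_clusters(embeddings, lambda k: AgglomerativeClustering(n_clusters=k, linkage='average', metric='cosine'))
labels = AgglomerativeClustering(n_clusters=n, linkage='average', metric='cosine').fit(embeddings).labels_
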
17 changes: 17 additions & 0 deletions app/util/general.py
@@ -8,6 +8,23 @@
from basic_langchain.schema import SystemMessage, HumanMessage


def escape_json_inner_quotes(json_string):
"""
Given a JSON string, escape all double quotes that appear inside values to avoid invalid JSON.
Assumes the JSON is pretty-printed so that each value's closing quote is followed by a comma, closing brace, or newline.
:param json_string: JSON string whose values may contain unescaped double quotes
:return: the same JSON string with inner double quotes escaped
"""
pattern = r'(:\s*")(.*?)(?="[,}\n])'

def escape_quotes(match):
# Escape quotes within the matched group
return match.group(1) + match.group(2).replace('"', '\\"')

return re.sub(pattern, escape_quotes, json_string)


def get_source_text_with_fallback(source: TopicPromptSource, lang: str, auto_translate=False) -> str:
text = source.text.get(lang, "")
other_lang = "en" if lang == "he" else "he"
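
For context on the new helper above: LLM responses sometimes leave raw double quotes inside JSON string values, which breaks the topic-prompt output parser, so `escape_json_inner_quotes` escapes them before parsing. A quick self-contained illustration (the sample payload is invented):

import json
import re

def escape_json_inner_quotes(json_string):
    # mirrors the helper above: escape double quotes that sit inside pretty-printed JSON values
    pattern = r'(:\s*")(.*?)(?="[,}\n])'
    return re.sub(pattern, lambda m: m.group(1) + m.group(2).replace('"', '\\"'), json_string)

raw = '''{
    "title": "The "Why" of Tzedakah",
    "why": "Explains the obligation to give"
}'''

print(json.loads(escape_json_inner_quotes(raw))["title"])  # The "Why" of Tzedakah
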
38 changes: 36 additions & 2 deletions app/util/sentencizer.py
@@ -40,10 +40,44 @@ def print_completion(sentences):
print(sentence)


def claude_sentencizer(text, max_sentences=None):
text_left = text[:]
sentences = []
while len(text_left) > 0 and (max_sentences is None or len(sentences) < max_sentences):
next_sentence = claude_sentencizer_first_sentence(text_left)
if next_sentence is None:
break
istart = text_left.index(next_sentence)
text_left = text_left[istart+len(next_sentence):]
sentences.append(next_sentence)
return sentences


def claude_sentencizer_first_sentence(text):
from basic_langchain.chat_models import ChatAnthropic
from basic_langchain.schema import SystemMessage, HumanMessage
from util.general import get_by_xml_tag
system = SystemMessage(content="Given a text discussing Torah topics with little to no punctuation, "
"output the first sentence. Input is in <input> tags. The first sentence "
"should be output verbatim as it appears in <input> wrapped in "
"<first_sentence> tags. Since the input text has no punctuation, use your judgement as to where the first sentence ends. Prefer smaller sentences.")
human = HumanMessage(content=f"<input>{text}</input>")
llm = ChatAnthropic("claude-3-5-sonnet-20240620", temperature=0)
response = llm([system, human])
return get_by_xml_tag(response.content, "first_sentence")


if __name__ == '__main__':
import django
django.setup()
from sefaria.model import *
yo = """[26] See R. Joseph B. Soloveitchik, \"Tzedakah: Brotherhood and Fellowship,\" in Halakhic Morality: Essays on Ethics and Masorah, 126-127."""
sents = sentencize(yo)
print_completion(sents)
sup = Ref("Sifra, Vayikra Dibbura DeNedavah, Section 4:1").text('he').text
# sents = sentencize(yo)
# print_completion(sents)
sents = claude_sentencizer(sup, 5)
for sent in sents:
print(sent)
pass


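`claude_sentencizer` above leans on `util.general.get_by_xml_tag` (imported but not shown in this diff) to pull the model's answer out of the <first_sentence> tags. A minimal stand-in with the same observable behavior, offered as an assumption about how that helper works rather than its actual implementation:

import re

def get_by_xml_tag(text, tag_name):
    # return the content of the first <tag_name>...</tag_name> span, or None if the tag is absent
    match = re.search(rf"<{tag_name}>(.*?)</{tag_name}>", text, re.DOTALL)
    return match.group(1) if match else None

print(get_by_xml_tag("<first_sentence>In the beginning God created</first_sentence>", "first_sentence"))
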
2 changes: 1 addition & 1 deletion app/util/topic.py
@@ -74,7 +74,7 @@ def get_topic_description_from_top_sources(topic: Topic, verbose=True):


def get_top_trefs_from_slug(slug, top_n=10) -> list[str]:
out = get_topic(True, slug, with_refs=True, ref_link_type_filters=['about', 'popular-writing-of'])
out = get_topic(True, slug, 'english', with_refs=True, ref_link_type_filters=['about', 'popular-writing-of'])
try:
trefs = [d['ref'] for d in out['refs']['about']['refs'] if not d['is_sheet']]
trefs = filter_invalid_refs(trefs[:top_n])
4 changes: 2 additions & 2 deletions chart/Chart.yaml
@@ -20,9 +20,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 1.1.6
version: 1.1.7
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "v1.0.4"
appVersion: "v1.1.3"
2 changes: 1 addition & 1 deletion chart/values.yaml
@@ -11,7 +11,7 @@ deployment:
memory: 500Mi
limits:
cpu: 2000m
memory: 1Gi
memory: 5Gi
anthropicSecretRef: anthropic-api-key
openaiSecretRef: openai-api-key
redis:
84 changes: 77 additions & 7 deletions experiments/topic_source_curation/choose.py
@@ -10,8 +10,9 @@
import django
django.setup()
from sefaria.pagesheetrank import pagerank_rank_ref_list
from sefaria.model.text import Ref
from sefaria.model.text import Ref, library
from sefaria.client.wrapper import get_links
from sefaria.helper.llm.topic_prompt import make_topic_prompt_source
from sefaria.recommendation_engine import RecommendationEngine
import voyageai
from tqdm import tqdm
@@ -83,15 +84,80 @@ def choose(clusters: list[Cluster], topic: Topic) -> (list[SummarizedSource], li
link_pairs = _get_link_pairs_to_avoid(sorted_items)
chosen_sources, chosen_penalties, not_interesting_trefs = solve_clusters_iteratively(clusters, topic, sorted_items, primary_sources_trefs, link_pairs)
chosen_sources = (Artifact(chosen_sources)
.pipe(_remove_known_bad_sources)
.pipe(_switch_daf_shevui_for_talmud)
.pipe(_sort_sources_by_gpt_instruction, topic)
.pipe(_put_primary_sources_first, primary_sources_trefs)
.pipe(_remove_not_interesting_sources, not_interesting_trefs)
.pipe(_remove_duplicate_books).data
)
if len(chosen_sources) <= 5:
# try to pick more
clusters = _break_up_clusters(topic, clusters)
sorted_clusters = clusters
chosen_sources, chosen_penalties, not_interesting_trefs = solve_clusters_iteratively(clusters, topic, sorted_items, primary_sources_trefs, link_pairs)
chosen_sources = (Artifact(chosen_sources)
.pipe(_remove_known_bad_sources)
.pipe(_switch_daf_shevui_for_talmud)
.pipe(_sort_sources_by_gpt_instruction, topic)
.pipe(_put_primary_sources_first, primary_sources_trefs)
.pipe(_remove_not_interesting_sources, not_interesting_trefs)
.pipe(_remove_duplicate_books).data
)
save_clusters_and_chosen_sources_to_html(topic, sorted_clusters, chosen_sources, chosen_penalties, primary_sources_trefs, not_interesting_trefs)
return chosen_sources, clusters


def _break_up_clusters(topic: Topic, clusters: list[Cluster]):
new_clusters = clusters
counter = 0
while counter < 5:
temp_clusters = _break_up_largest_cluster(topic, new_clusters)
if temp_clusters is None:
break
new_clusters = temp_clusters
counter += 1
return new_clusters


def _break_up_largest_cluster(topic: Topic, clusters: list[Cluster]):
from util.cluster import SklearnClusterer
from experiments.topic_source_curation.cluster import embed_text_openai, get_cluster_summary_based_on_topic
from sklearn.cluster import AgglomerativeClustering
topic_desc = get_topic_str_for_prompts(topic, verbose=False)
get_cluster_summary = partial(get_cluster_summary_based_on_topic, topic_desc)
largest_cluster = max(clusters, key=len)
if len(largest_cluster) >= 6:
get_cluster_labels = lambda x: AgglomerativeClustering(n_clusters=2).fit(x).labels_
clusterer = SklearnClusterer(embed_text_openai, get_cluster_labels, get_cluster_summary, breakup_large_clusters=False)
new_clusters = [c for c in clusters if c != largest_cluster]
new_clusters += clusterer.cluster_items(largest_cluster.items)
return new_clusters
return clusters


def _remove_known_bad_sources(chosen_sources: list[SummarizedSource]) -> list[SummarizedSource]:
known_bad_titles = set(library.get_indexes_in_category_path(["Reference", "Dictionary"]))
return filter(lambda source: source.source.book_title['en'] not in known_bad_titles, chosen_sources)


def _switch_daf_shevui_for_talmud(chosen_sources: list[SummarizedSource]) -> list[SummarizedSource]:
import re
new_sources = []
for source in chosen_sources:
if source.source.ref.startswith('Daf Shevui'):
talmud_amud = re.search(r'Daf Shevui to (.*):\d+$', source.source.ref).group(1)
links = list(filter(lambda link: talmud_amud in link['ref'], get_links(source.source.ref, with_text=False)))
if len(links) == 1:
topic_prompt_source = make_topic_prompt_source(Ref(links[0]['ref']), '', False)
new_sources.append(SummarizedSource(topic_prompt_source, source.summary, source.embedding))
else:
print("LEN DAF SHEVUI LINKS", len(links), source.source.ref)
else:
new_sources.append(source)
return new_sources


def _remove_not_interesting_sources(chosen_sources: list[SummarizedSource], not_interesting_trefs: list[str]) -> list[SummarizedSource]:
return filter(lambda source: source.source.ref not in not_interesting_trefs, chosen_sources)

@@ -244,13 +310,17 @@ def choose_primary_sources(clusters: list[Cluster]) -> list[str]:
"""
orefs = reduce(lambda x, y: x + [Ref(item.source.ref) for item in y.items], clusters, [])
refs, pageranks = zip(*pagerank_rank_ref_list(orefs))
max_ref = refs[0].normal()
thresh = mean(pageranks) + 2 * stdev(pageranks)
is_primary = pageranks[0] > thresh
print(max_ref, "IS PRIMARY:", is_primary, round(pageranks[0], 3), round(thresh, 3))
if is_primary:
return [max_ref]
return []
max_refs = []
for ref, pr in zip(refs[:3], pageranks[:3]):
max_ref = ref.normal()
is_primary = pr > thresh
print(max_ref, "IS PRIMARY:", is_primary, round(pr, 3), round(thresh, 3))
if is_primary:
max_refs.append(max_ref)
else:
break
return max_refs


def choose_ideal_clusters(clusters: list[Cluster], max_clusters: int) -> list[Cluster]:
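
The rewrite of `choose_primary_sources` above moves from promoting a single top ref to promoting up to three: a ref counts as primary when its pagerank exceeds the mean by more than two standard deviations, and the scan stops at the first ref that falls below the threshold. A toy sketch of the same thresholding with made-up scores:

from statistics import mean, stdev

refs = [f"Ref {i}" for i in range(10)]           # stand-ins for Ref(...).normal() values
pageranks = [0.90] + [0.05] * 9                  # hypothetical scores, sorted descending

thresh = mean(pageranks) + 2 * stdev(pageranks)  # ~0.67 for these numbers
primary = []
for ref, pr in zip(refs[:3], pageranks[:3]):
    if pr > thresh:
        primary.append(ref)
    else:
        break  # scores are sorted, so nothing later can pass

print(primary)  # ['Ref 0'] -- only the clear outlier is treated as primary
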
14 changes: 9 additions & 5 deletions experiments/topic_source_curation/cluster.py
@@ -11,7 +11,7 @@
from basic_langchain.embeddings import VoyageAIEmbeddings, OpenAIEmbeddings
from util.pipeline import Artifact
from util.general import get_by_xml_tag, run_parallel, get_by_xml_list
from util.cluster import Cluster, OptimizingClusterer, SklearnClusterer, AbstractClusterItem
from util.cluster import Cluster, OptimizingClusterer, SklearnClusterer, AbstractClusterItem, get_agglomerative_clustering_labels_with_optimal_silhouette_score
from experiments.topic_source_curation.common import get_topic_str_for_prompts
from experiments.topic_source_curation.summarized_source import SummarizedSource
import numpy as np
@@ -54,7 +54,7 @@ def embed_text_voyageai(text):
return np.array(VoyageAIEmbeddings(model="voyage-large-2-instruct").embed_query(text))


def _get_cluster_summary_based_on_topic(topic_desc, strs_to_summarize):
def get_cluster_summary_based_on_topic(topic_desc, strs_to_summarize):
llm = ChatOpenAI("gpt-4o", 0)
system = SystemMessage(content="You are a Jewish scholar familiar with Torah. Given a few ideas (wrapped in <idea> "
"XML tags) about a given topic (wrapped in <topic> XML tags) output a summary of the "
@@ -126,11 +126,15 @@ def _cluster_sources(sources: list[SummarizedSource], topic) -> list[Cluster]:
for i in range(len(HDBSCAN_PARAM_OPTS['min_samples'])):
hdbscan_params = _get_ith_hdbscan_params(i)
temp_clusterer = SklearnClusterer(embed_text_openai,
lambda x: HDBSCAN(**hdbscan_params).fit(x).labels_,
partial(_get_cluster_summary_based_on_topic, topic_desc), verbose=False)
lambda x: HDBSCAN(**hdbscan_params).fit(x).labels_,
partial(get_cluster_summary_based_on_topic, topic_desc), verbose=False)
clusterers.append(temp_clusterer)
temp_clusterer = SklearnClusterer(embed_text_openai, lambda x: AffinityPropagation(damping=0.7, max_iter=1000, convergence_iter=100).fit(x).predict(x), partial(_get_cluster_summary_based_on_topic, topic_desc), verbose=False)
temp_clusterer = SklearnClusterer(embed_text_openai, lambda x: AffinityPropagation(damping=0.7, max_iter=1000, convergence_iter=100).fit(x).predict(x), partial(get_cluster_summary_based_on_topic, topic_desc), verbose=False)
clusterers.append(temp_clusterer)
# temp_clusterer = SklearnClusterer(embed_text_openai,
# get_agglomerative_clustering_labels_with_optimal_silhouette_score,
#                                   partial(get_cluster_summary_based_on_topic, topic_desc), verbose=False)
# clusterers = [temp_clusterer]

clusterer_optimizer = OptimizingClusterer(embed_text_openai, clusterers, verbose=False)
clusters = clusterer_optimizer.cluster_and_summarize(sources)
8 changes: 5 additions & 3 deletions experiments/topic_source_curation/curator.py
@@ -27,12 +27,12 @@
TOPICS_TO_CURATE_CSV_PATH = 'input/Topic project plan - 1000 topics pages product - list of all topic slugs.csv'


def get_topics_to_curate():
def get_topics_to_curate() -> list[Topic]:
topics = []
with open(TOPICS_TO_CURATE_CSV_PATH, "r") as fin:
cin = csv.DictReader(fin)
for row in cin:
if len(row['curated']) > 0:
if len(row['curated'].strip()) > 0:
continue
slug = row['slug'].strip()
try:
Expand All @@ -46,7 +46,9 @@ def get_topics_to_curate():
def save_curation(data, topic: Topic) -> list[SummarizedSource]:
sources, clusters = data
topic.description['en'] = get_or_generate_topic_description(topic, verbose=False)
contexts = run_parallel(sources, partial(get_context_for_source, topic=topic, clusters=clusters), max_workers=20, desc="Get source context")
# contexts = run_parallel(sources, partial(get_context_for_source, topic=topic, clusters=clusters), max_workers=20, desc="Get source context")
# not finding context helpful
contexts = ['']*len(sources)
out = [{
"ref": source.source.ref,
"context": contexts[isource]