Skip to content

Commit

Permalink
feat: add skeleton of chooser
Browse files Browse the repository at this point in the history
  • Loading branch information
nsantacruz committed May 2, 2024
1 parent a8c3c7e commit e77dca5
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 5 deletions.
51 changes: 47 additions & 4 deletions experiments/topic_source_curation_v2/choose.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,49 @@
"""
def choose_ideal_clusters(source_clusters: List[SourceCluster], max_clusters: int) -> List[SourceCluster]
choose some interesting, some fundamental
Given clusters tries to
1) Choose the "best" clusters such that each cluster represents an idea that should be part of the final topic page curation
2) For each chosen cluster, choose the "best" source to be curated. Best here needs to fulfill a few criteria including
- Category quota: sources should be from diverse categories in Sefaria
- Fundamental sources: fundamental sources from Tanakh and Talmud etc. should be chosen. These should be few, maybe 2-3
- Interesting sources: the rest of the sources should represent interesting ideas for a newcomer to Sefaria
"""
from experiments.topic_source_curation_v2.cluster import Cluster
from sefaria_llm_interface.topic_prompt import TopicPromptSource
from sklearn.metrics import pairwise_distances
from util.pipeline import Artifact
import numpy as np


def choose_ideal_sources_for_clusters(
    clusters: list[Cluster],
    max_clusters: int = 20,
) -> list[TopicPromptSource]:
    """Choose the clusters worth curating, then the sources that represent them.

    Runs a two-stage pipeline over ``clusters``:

    1. ``choose_ideal_clusters`` narrows the input down to ``max_clusters``.
    2. ``choose_ideal_sources`` selects sources from the surviving clusters.

    :param clusters: candidate clusters for a topic.
    :param max_clusters: cap passed to ``choose_ideal_clusters``. Defaults to
        20, the value that was previously hard-coded.
    :return: whatever ``choose_ideal_sources`` produces for the chosen clusters.
    """
    # The second stage must be `choose_ideal_sources`. Piping back into this
    # function (as the code previously did) would recurse without end,
    # assuming `Artifact.pipe` invokes the callable it is given.
    return (
        Artifact(clusters)
        .pipe(choose_ideal_clusters, max_clusters)
        .pipe(choose_ideal_sources)
        .data
    )


def choose_ideal_clusters(clusters: list[Cluster], max_clusters: int) -> list[Cluster]:
    """Return at most ``max_clusters`` clusters, ordered from outliers to central.

    Clusters are sorted in descending order of average pairwise distance, so
    outliers come first and central clusters last.

    The intended policy is to choose a few central clusters and a few
    outliers, and possibly to use a custom GPT sort to find the "best"
    clusters based on various criteria. That policy is not implemented yet;
    for now the sorted list is simply truncated.

    :param clusters: candidate clusters.
    :param max_clusters: upper bound on the number of clusters returned.
        Values below zero are treated as zero.
    :return: a new list containing the selected clusters.
    """
    if not clusters:
        # Nothing to rank; avoid handing an empty matrix to the distance code.
        return []
    sorted_clusters = _sort_by_highest_avg_pairwise_distance(clusters)
    # TODO: mix central clusters and outliers instead of only keeping the head
    # of the list (which currently favours outliers).
    return sorted_clusters[:max(max_clusters, 0)]

def choose_ideal_sources(source_clusters: list[Cluster]) -> list[TopicPromptSource]:
    """Placeholder for picking the source(s) to curate from each cluster.

    Not implemented yet: the function currently does nothing and returns
    ``None``. Candidate selection criteria under consideration:

    - Pagerank based on the link graph for the topic page. Higher means more
      relevant.
    - Pagerank delta == Global PR - Local PR. What counts as a good value is
      still to be decided.
    - Highest average pairwise cosine distance. Higher means more unique.
    - Category quota: choose sources from different categories.
    """
    # TODO: implement source selection using the criteria listed above.


def _get_highest_avg_pairwise_distance_indices(embeddings: np.ndarray) -> np.ndarray:
distances = pairwise_distances(embeddings, metric='cosine')
sum_distances = np.sum(distances, axis=1)
avg_distances = sum_distances / (len(embeddings) - 1)
sorted_indices = np.argsort(avg_distances)[::-1] # Sort in descending order
return sorted_indices

def choose_ideal_sources(source_clusters: List[SourceCluster]) -> List[TopicPromptSource]
"""
def _sort_by_highest_avg_pairwise_distance(clusters: list[Cluster]) -> list[Cluster]:
    """Return ``clusters`` reordered by the ranking of their embeddings.

    Each cluster's ``embedding`` attribute is collected into one array, which
    is ranked by ``_get_highest_avg_pairwise_distance_indices``. The clusters
    are returned in that ranked order as a new list.
    """
    embedding_matrix = np.array([cluster.embedding for cluster in clusters])
    ranking = _get_highest_avg_pairwise_distance_indices(embedding_matrix)
    reordered = []
    for position in ranking:
        reordered.append(clusters[position])
    return reordered
3 changes: 2 additions & 1 deletion experiments/topic_source_curation_v2/curator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
from experiments.topic_source_curation_v2.gather.source_gatherer import gather_sources_about_topic
from experiments.topic_source_curation_v2.cluster import get_clustered_sources
from experiments.topic_source_curation_v2.choose import choose_ideal_sources_for_clusters
from sefaria.helper.llm.topic_prompt import _make_llm_topic
from sefaria_llm_interface.common.topic import Topic
from sefaria_llm_interface.topic_prompt import TopicPromptSource
Expand All @@ -12,7 +13,7 @@ def curate_topic(topic: Topic) -> list[TopicPromptSource]:
return (Artifact(topic)
.pipe(gather_sources_about_topic)
.pipe(get_clustered_sources, topic)
.pipe().data)
.pipe(choose_ideal_sources_for_clusters).data)

if __name__ == '__main__':
slug = "stars"
Expand Down

0 comments on commit e77dca5

Please sign in to comment.