diff --git a/frontend/app/layout.tsx b/frontend/app/layout.tsx
index 3314e47..718ca7f 100644
--- a/frontend/app/layout.tsx
+++ b/frontend/app/layout.tsx
@@ -5,8 +5,8 @@ import "./globals.css";
 const inter = Inter({ subsets: ["latin"] });
 
 export const metadata: Metadata = {
-  title: "Create Next App",
-  description: "Generated by create next app",
+  title: "PyPI LLM Search",
+  description: "Find PyPI packages with natural language using LLMs",
 };
 
 export default function RootLayout({
diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx
index f1a406a..f6cc69c 100644
--- a/frontend/app/page.tsx
+++ b/frontend/app/page.tsx
@@ -12,6 +12,7 @@ export default function Home() {
   const [sortDirection, setSortDirection] = useState("desc");
   const [loading, setLoading] = useState(false);
   const [error, setError] = useState("");
+  const [infoBoxVisible, setInfoBoxVisible] = useState(false);
 
   const handleSearch = async () => {
     setLoading(true);
@@ -28,10 +29,8 @@ export default function Home() {
         },
       },
     );
-      const sortedResults = response.data.matches.sort(
-        (a, b) => b.weekly_downloads - a.weekly_downloads,
-      );
-      setResults(sortedResults);
+      const fetchedResults = response.data.matches;
+      setResults(sortResults(fetchedResults, sortField, sortDirection));
     } catch (error) {
       setError("Error fetching search results.");
       console.error("Error fetching search results:", error);
@@ -40,17 +39,20 @@ export default function Home() {
     }
   };
 
-  const sortResults = (field) => {
-    const direction =
-      sortField === field && sortDirection === "asc" ? "desc" : "asc";
-    const sorted = [...results].sort((a, b) => {
+  const sortResults = (data, field, direction) => {
+    return [...data].sort((a, b) => {
       if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
       if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
       return 0;
     });
-    setResults(sorted);
+  };
+
+  const handleSort = (field) => {
+    const direction =
+      sortField === field && sortDirection === "asc" ? "desc" : "asc";
     setSortField(field);
     setSortDirection(direction);
+    setResults(sortResults(results, field, direction));
   };
 
   return (
@@ -80,17 +82,38 @@ export default function Home() {
       {error && (
         <div>
           {error}
         </div>
       )}
+      <div>
+        <button onClick={() => setInfoBoxVisible(!infoBoxVisible)}>
+          How does this work?
+        </button>
+      </div>
+      {infoBoxVisible && (
+        <div>
+          <p>
+            This application allows you to search for Python packages on PyPI
+            using natural language. An example query would be "a package that
+            creates plots and beautiful visualizations". Once you click search,
+            your query will be matched against the summary and the first part of
+            the description of all PyPI packages with more than 50 weekly
+            downloads, and the 50 most similar results will be displayed in a
+            table below.
+          </p>
+        </div>
+      )}
       {results.length > 0 && (
         <div>
-          <p>
-            Displaying the {results.length} most similar results:
-          </p>
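The info box added above describes the search flow: a natural-language query is matched against package summaries and descriptions, and the most similar packages are returned. As a client-side illustration of that flow, the sketch below (not part of the diff) posts a query to the /search/ endpoint that is updated later in this patch. The base URL is an assumption, so adjust it to wherever the FastAPI app runs, and the exact fields of each match come from the Match model, which this diff does not show in full.

# Sketch only, not part of the diff. Assumes the FastAPI app from
# pypi_llm/api/main.py is reachable at the URL below.
import requests

API_URL = "http://localhost:8000"  # assumed host/port

payload = {
    "query": "a package that creates plots and beautiful visualizations",
    "top_k": 30,  # optional; the endpoint defaults to 30
}

response = requests.post(f"{API_URL}/search/", json=payload, timeout=30)
response.raise_for_status()

for match in response.json()["matches"]:
    # "similarity" and "weekly_downloads" are the fields the frontend sorts on;
    # .get() is used because the full Match model is not visible in this diff.
    print(match["name"], match.get("similarity"), match.get("weekly_downloads"))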
diff --git a/notebooks/main.ipynb b/notebooks/main.ipynb
index 5e870e7..88a98ec 100644
--- a/notebooks/main.ipynb
+++ b/notebooks/main.ipynb
@@ -34,14 +34,23 @@
     "from pypi_llm.config import Config\n",
     "from pypi_llm.data.description_cleaner import DescriptionCleaner, CLEANING_FAILED\n",
     "from pypi_llm.data.reader import DataReader\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from pypi_llm.vector_database import VectorDatabaseInterface\n",
     "\n",
     "load_dotenv()\n",
     "config = Config()\n",
     "\n",
-    "df = DataReader(config.DATA_DIR).read()\n",
-    "df = DescriptionCleaner().clean(df, \"description\", \"description_cleaned\")\n",
-    "df = df.filter(~pl.col(\"description_cleaned\").is_null())\n",
-    "df = df.filter(pl.col(\"description_cleaned\")!=CLEANING_FAILED)"
+    "# Load dataset and model\n",
+    "df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)\n",
+    "model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)\n",
+    "\n",
+    "# Initialize vector database interface\n",
+    "vector_database_interface = VectorDatabaseInterface(\n",
+    "    pinecone_token=config.PINECONE_TOKEN,\n",
+    "    pinecone_index_name=config.PINECONE_INDEX_NAME,\n",
+    "    embeddings_model=model,\n",
+    "    pinecone_namespace=config.PINECONE_NAMESPACE,\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "with pl.Config(fmt_str_lengths=1000):\n",
+    "with pl.Config(fmt_str_lengths=100):\n",
     "    display(df.head(10))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "053c9cf1-9f79-4b98-bcc9-85b6b676da84",
+   "id": "bf393f0c-92c6-4d4a-bd97-d3ea7ebf2b80",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from sentence_transformers import SentenceTransformer\n",
-    "model = SentenceTransformer(config.EMBEDDINGS_MODEL)\n",
-    "embeddings = model.encode(query)\n",
-    "\n",
-    "from pinecone import Pinecone, Index\n",
-    "pc = Pinecone(api_key=config.PINECONE_TOKEN)\n",
-    "index = pc.Index(config.PINECONE_INDEX_NAME)\n",
-    "\n",
-    "matches = index.query(\n",
-    "    namespace=\"ns1\",\n",
-    "    vector=embeddings.tolist(),\n",
-    "    top_k=50,\n",
-    "    include_values=False\n",
+    "query = \"find unused packages\""
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "07ebd4fd-a0b9-4958-8325-bdff4be45a66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_matches = vector_database_interface.find_similar(query, top_k=100)\n",
+    "df_matches = df_matches.join(df, how=\"left\", on=\"name\")\n",
+    "df_matches = df_matches.sort(\"similarity\", descending=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa071203-a3cd-4e80-a7b7-0ac7562bef8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b7b28e7-495c-44db-a939-dfa3e2c45159",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rank the columns\n",
+    "df_matches = df_matches.with_columns(\n",
+    "    rank_similarity=pl.col(\"similarity\").rank(\"dense\", descending=False),\n",
+    "    rank_weekly_downloads=pl.col(\"weekly_downloads\").rank(\"dense\", descending=False)\n",
     ")\n",
     "\n",
-    "df_matches = pl.from_dicts([{'name' : x['id'], 'similarity': x['score']} for x in matches['matches']])\n",
+    "df_matches = df_matches.with_columns(\n",
+    "    normalized_similarity=(pl.col(\"rank_similarity\") - 1) / (df_matches['rank_similarity'].max() - 1),\n",
+    "    normalized_weekly_downloads=(pl.col(\"rank_weekly_downloads\") - 1) / (df_matches['rank_weekly_downloads'].max() - 1)\n",
+    ")\n",
     "\n",
-    "df_matches = df_matches.join(df, how = 'left', on = 'name')\n",
+    "df_matches = df_matches.with_columns(\n",
+    "    score=0.5 * pl.col(\"normalized_similarity\") + 0.5 * pl.col(\"normalized_weekly_downloads\")\n",
+    ")\n",
     "\n",
-    "df_matches.sort('weekly_downloads', descending=True)\n",
-    "\n"
+    "# Sort the DataFrame by the combined score in descending order\n",
+    "df_matches = df_matches.sort(\"score\", descending=True)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5465cec-c717-4fc5-aa55-c4c7dc9e79cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_matches.sort(\"score\", descending=True)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4384f057-8eaf-431d-a31a-f4f7e203ed35",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
diff --git a/pypi_llm/api/main.py b/pypi_llm/api/main.py
index 1002eac..d5abfc5 100644
--- a/pypi_llm/api/main.py
+++ b/pypi_llm/api/main.py
@@ -6,18 +6,16 @@
 from sentence_transformers import SentenceTransformer
 
 from pypi_llm.config import Config
+from pypi_llm.utils.score_calculator import calculate_score
 from pypi_llm.vector_database import VectorDatabaseInterface
 
 app = FastAPI()
 
-# Load environment variables
 load_dotenv()
 config = Config()
 
-# Setup CORS
 origins = [
     "http://localhost:3000",
-    # Add other origins if needed
 ]
 
 app.add_middleware(
@@ -28,11 +26,9 @@
     allow_headers=["*"],
 )
 
-# Load dataset and model
 df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
 model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)
 
-# Initialize vector database interface
 vector_database_interface = VectorDatabaseInterface(
     pinecone_token=config.PINECONE_TOKEN,
     pinecone_index_name=config.PINECONE_INDEX_NAME,
@@ -41,9 +37,9 @@
 )
 
 
-# Define request and response models
 class QueryModel(BaseModel):
     query: str
+    top_k: int = 30
 
 
 class Match(BaseModel):
@@ -57,10 +53,14 @@ class SearchResponse(BaseModel):
     matches: list[Match]
 
-# Define search endpoint
 @app.post("/search/", response_model=SearchResponse)
 async def search(query: QueryModel):
-    df_matches = vector_database_interface.find_similar(query.query, top_k=50)
+    df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
     df_matches = df_matches.join(df, how="left", on="name")
-    df_matches = df_matches.sort("similarity", descending=True)
+
+    df_matches = calculate_score(df_matches)
+    df_matches = df_matches.sort("score", descending=True)
+    df_matches = df_matches.head(query.top_k)
+
+    print("sending")
 
     return SearchResponse(matches=df_matches.to_dicts())
diff --git a/pypi_llm/scripts/upsert_data.py b/pypi_llm/scripts/upsert_data.py
index cb05af7..e7b7e43 100644
--- a/pypi_llm/scripts/upsert_data.py
+++ b/pypi_llm/scripts/upsert_data.py
@@ -27,6 +27,6 @@
 )
 
 df = df.with_columns(
-    summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" "), pl.col("description_cleaned"))
+    summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned"))
 )
 vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
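To make the upsert_data.py change above concrete: the only difference is the separator passed to pl.concat_str, so each package's summary and cleaned description are now joined with " - " rather than a bare space before the text is upserted to the vector index. A small self-contained Polars sketch follows (the package name and text are invented for illustration):

# Illustration only: the effect of the " - " separator used in upsert_data.py.
import polars as pl

df = pl.DataFrame(
    {
        "name": ["examplepkg"],  # made-up package
        "summary": ["Create beautiful plots"],
        "description_cleaned": ["A small plotting library with a simple API."],
    }
)

df = df.with_columns(
    summary_and_description_cleaned=pl.concat_str(
        pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned")
    )
)

print(df["summary_and_description_cleaned"][0])
# Create beautiful plots - A small plotting library with a simple API.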
diff --git a/pypi_llm/utils/score_calculator.py b/pypi_llm/utils/score_calculator.py
new file mode 100644
index 0000000..7dd2232
--- /dev/null
+++ b/pypi_llm/utils/score_calculator.py
@@ -0,0 +1,34 @@
+import polars as pl
+
+
+def calculate_score(df: pl.DataFrame, weight_similarity=0.5, weight_weekly_downloads=0.5) -> pl.DataFrame:
+    """
+    Calculate a combined score based on similarity and weekly downloads.
+
+    The function ranks the similarity and weekly downloads, normalizes these ranks to a [0, 1] scale,
+    and then computes a combined score based on the provided weights for similarity and weekly downloads.
+    The DataFrame is sorted by the combined score in descending order.
+
+    Args:
+        df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
+        weight_similarity (float): Weight for the similarity score in the combined score calculation. Default is 0.5.
+        weight_weekly_downloads (float): Weight for the weekly downloads score in the combined score calculation. Default is 0.5.
+
+    """
+    df = df.with_columns(
+        rank_similarity=pl.col("similarity").rank("dense", descending=False),
+        rank_weekly_downloads=pl.col("weekly_downloads").rank("dense", descending=False),
+    )
+
+    df = df.with_columns(
+        normalized_similarity=(pl.col("rank_similarity") - 1) / (df["rank_similarity"].max() - 1),
+        normalized_weekly_downloads=(pl.col("rank_weekly_downloads") - 1) / (df["rank_weekly_downloads"].max() - 1),
+    )
+
+    df = df.with_columns(
+        score=weight_similarity * pl.col("normalized_similarity") +
+        weight_weekly_downloads * pl.col("normalized_weekly_downloads")
+    )
+
+    df = df.sort("score", descending=True)
+    return df
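For reference, a small self-contained example of the new calculate_score helper (package names and numbers invented): each column is dense-ranked, the ranks are scaled to [0, 1], and the two parts are blended with the given weights, so a package with moderate similarity but many downloads can outrank the single most similar package.

# Invented data, purely to illustrate the rank-based blend in score_calculator.py.
import polars as pl

from pypi_llm.utils.score_calculator import calculate_score

df = pl.DataFrame(
    {
        "name": ["pkg-a", "pkg-b", "pkg-c"],
        "similarity": [0.91, 0.78, 0.65],
        "weekly_downloads": [1_200, 450_000, 30_000],
    }
)

scored = calculate_score(df)  # default weights: 0.5 similarity, 0.5 downloads
print(scored.select("name", "normalized_similarity", "normalized_weekly_downloads", "score"))
# pkg-b comes out first (score 0.75): it ranks second on similarity but first on
# downloads, ahead of pkg-a (0.5), which is most similar but rarely downloaded.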