diff --git a/frontend/app/layout.tsx b/frontend/app/layout.tsx
index 3314e47..718ca7f 100644
--- a/frontend/app/layout.tsx
+++ b/frontend/app/layout.tsx
@@ -5,8 +5,8 @@ import "./globals.css";
const inter = Inter({ subsets: ["latin"] });
export const metadata: Metadata = {
- title: "Create Next App",
- description: "Generated by create next app",
+  title: "PyPI LLM Search",
+  description: "Find PyPI packages with natural language using LLMs",
};
export default function RootLayout({
diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx
index f1a406a..f6cc69c 100644
--- a/frontend/app/page.tsx
+++ b/frontend/app/page.tsx
@@ -12,6 +12,7 @@ export default function Home() {
const [sortDirection, setSortDirection] = useState("desc");
const [loading, setLoading] = useState(false);
const [error, setError] = useState("");
+ const [infoBoxVisible, setInfoBoxVisible] = useState(false);
const handleSearch = async () => {
setLoading(true);
@@ -28,10 +29,8 @@ export default function Home() {
},
},
);
- const sortedResults = response.data.matches.sort(
- (a, b) => b.weekly_downloads - a.weekly_downloads,
- );
- setResults(sortedResults);
+ const fetchedResults = response.data.matches;
+ setResults(sortResults(fetchedResults, sortField, sortDirection));
} catch (error) {
setError("Error fetching search results.");
console.error("Error fetching search results:", error);
@@ -40,17 +39,20 @@ export default function Home() {
}
};
- const sortResults = (field) => {
- const direction =
- sortField === field && sortDirection === "asc" ? "desc" : "asc";
- const sorted = [...results].sort((a, b) => {
+ const sortResults = (data, field, direction) => {
+ return [...data].sort((a, b) => {
if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
return 0;
});
- setResults(sorted);
+ };
+
+ const handleSort = (field) => {
+ const direction =
+ sortField === field && sortDirection === "asc" ? "desc" : "asc";
setSortField(field);
setSortDirection(direction);
+ setResults(sortResults(results, field, direction));
};
return (
@@ -80,17 +82,38 @@ export default function Home() {
{error &&
{error}
}
+
+ setInfoBoxVisible(!infoBoxVisible)}
+ >
+ {infoBoxVisible ? "Hide Info" : "How does this work?"}
+
+
+
+ {infoBoxVisible && (
+
+
How does this work?
+
+ This application allows you to search for Python packages on PyPI
+ using natural language. An example query would be "a package that
+ creates plots and beautiful visualizations". Once you click search,
+ your query is matched against the summary and the first part of
+ the description of all PyPI packages with more than 50 weekly
+ downloads, and the 50 most similar results are displayed in the
+ table below.
+
+
+ )}
+
{results.length > 0 && (
-
- Displaying the {results.length} most similar results:
-
diff --git a/notebooks/main.ipynb b/notebooks/main.ipynb
index 5e870e7..88a98ec 100644
--- a/notebooks/main.ipynb
+++ b/notebooks/main.ipynb
@@ -34,14 +34,23 @@
"from pypi_llm.config import Config\n",
"from pypi_llm.data.description_cleaner import DescriptionCleaner, CLEANING_FAILED\n",
"from pypi_llm.data.reader import DataReader\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "from pypi_llm.vector_database import VectorDatabaseInterface\n",
"\n",
"load_dotenv()\n",
"config = Config()\n",
"\n",
- "df = DataReader(config.DATA_DIR).read()\n",
- "df = DescriptionCleaner().clean(df, \"description\", \"description_cleaned\")\n",
- "df = df.filter(~pl.col(\"description_cleaned\").is_null())\n",
- "df = df.filter(pl.col(\"description_cleaned\")!=CLEANING_FAILED)"
+ "# Load dataset and model\n",
+ "df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)\n",
+ "model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)\n",
+ "\n",
+ "# Initialize vector database interface\n",
+ "vector_database_interface = VectorDatabaseInterface(\n",
+ " pinecone_token=config.PINECONE_TOKEN,\n",
+ " pinecone_index_name=config.PINECONE_INDEX_NAME,\n",
+ " embeddings_model=model,\n",
+ " pinecone_namespace=config.PINECONE_NAMESPACE,\n",
+ ")"
]
},
{
@@ -51,39 +60,85 @@
"metadata": {},
"outputs": [],
"source": [
- "with pl.Config(fmt_str_lengths=1000):\n",
+ "with pl.Config(fmt_str_lengths=100):\n",
" display(df.head(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "053c9cf1-9f79-4b98-bcc9-85b6b676da84",
+ "id": "bf393f0c-92c6-4d4a-bd97-d3ea7ebf2b80",
"metadata": {},
"outputs": [],
"source": [
- "from sentence_transformers import SentenceTransformer\n",
- "model = SentenceTransformer(config.EMBEDDINGS_MODEL)\n",
- "embeddings = model.encode(query)\n",
- "\n",
- "from pinecone import Pinecone, Index\n",
- "pc = Pinecone(api_key=config.PINECONE_TOKEN)\n",
- "index = pc.Index(config.PINECONE_INDEX_NAME)\n",
- "\n",
- "matches = index.query(\n",
- " namespace=\"ns1\",\n",
- " vector=embeddings.tolist(),\n",
- " top_k=50,\n",
- " include_values=False\n",
+ "query = \"find unused packages\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "07ebd4fd-a0b9-4958-8325-bdff4be45a66",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_matches = vector_database_interface.find_similar(query, top_k=100)\n",
+ "df_matches = df_matches.join(df, how=\"left\", on=\"name\")\n",
+ "df_matches = df_matches.sort(\"similarity\", descending=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fa071203-a3cd-4e80-a7b7-0ac7562bef8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_matches"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b7b28e7-495c-44db-a939-dfa3e2c45159",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Dense-rank similarity and weekly downloads so both are on a comparable scale\n",
+ "df_matches = df_matches.with_columns(\n",
+ " rank_similarity=pl.col(\"similarity\").rank(\"dense\", descending=False),\n",
+ " rank_weekly_downloads=pl.col(\"weekly_downloads\").rank(\"dense\", descending=False)\n",
")\n",
"\n",
- "df_matches = pl.from_dicts([{'name' : x['id'], 'similarity': x['score']} for x in matches['matches']])\n",
+ "df_matches = df_matches.with_columns(\n",
+ " normalized_similarity=(pl.col(\"rank_similarity\") - 1) / (df_matches['rank_similarity'].max() - 1),\n",
+ " normalized_weekly_downloads=(pl.col(\"rank_weekly_downloads\") - 1) / (df_matches['rank_weekly_downloads'].max() - 1)\n",
+ ")\n",
"\n",
- "df_matches = df_matches.join(df, how = 'left', on = 'name')\n",
+ "df_matches = df_matches.with_columns(\n",
+ " score=0.5 * pl.col(\"normalized_similarity\") + 0.5 * pl.col(\"normalized_weekly_downloads\")\n",
+ ")\n",
"\n",
- "df_matches.sort('weekly_downloads', descending=True)\n",
- "\n"
+ "# Sort the DataFrame by the combined score in descending order\n",
+ "df_matches = df_matches.sort(\"score\", descending=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5465cec-c717-4fc5-aa55-c4c7dc9e79cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_matches.sort(\"score\", descending=True)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4384f057-8eaf-431d-a31a-f4f7e203ed35",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/pypi_llm/api/main.py b/pypi_llm/api/main.py
index 1002eac..d5abfc5 100644
--- a/pypi_llm/api/main.py
+++ b/pypi_llm/api/main.py
@@ -6,18 +6,16 @@
from sentence_transformers import SentenceTransformer
from pypi_llm.config import Config
+from pypi_llm.utils.score_calculator import calculate_score
from pypi_llm.vector_database import VectorDatabaseInterface
app = FastAPI()
-# Load environment variables
load_dotenv()
config = Config()
-# Setup CORS
origins = [
"http://localhost:3000",
- # Add other origins if needed
]
app.add_middleware(
@@ -28,11 +26,9 @@
allow_headers=["*"],
)
-# Load dataset and model
df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
model = SentenceTransformer(config.EMBEDDINGS_MODEL_NAME)
-# Initialize vector database interface
vector_database_interface = VectorDatabaseInterface(
pinecone_token=config.PINECONE_TOKEN,
pinecone_index_name=config.PINECONE_INDEX_NAME,
@@ -41,9 +37,9 @@
)
-# Define request and response models
class QueryModel(BaseModel):
query: str
+ top_k: int = 30
class Match(BaseModel):
@@ -57,10 +53,14 @@ class SearchResponse(BaseModel):
matches: list[Match]
-# Define search endpoint
@app.post("/search/", response_model=SearchResponse)
async def search(query: QueryModel):
- df_matches = vector_database_interface.find_similar(query.query, top_k=50)
+ df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
df_matches = df_matches.join(df, how="left", on="name")
- df_matches = df_matches.sort("similarity", descending=True)
+
+ df_matches = calculate_score(df_matches)
+ df_matches = df_matches.sort("score", descending=True)
+ df_matches = df_matches.head(query.top_k)
+
+ print("sending")
return SearchResponse(matches=df_matches.to_dicts())
diff --git a/pypi_llm/scripts/upsert_data.py b/pypi_llm/scripts/upsert_data.py
index cb05af7..e7b7e43 100644
--- a/pypi_llm/scripts/upsert_data.py
+++ b/pypi_llm/scripts/upsert_data.py
@@ -27,6 +27,6 @@
)
df = df.with_columns(
- summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" "), pl.col("description_cleaned"))
+ summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned"))
)
vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
diff --git a/pypi_llm/utils/score_calculator.py b/pypi_llm/utils/score_calculator.py
new file mode 100644
index 0000000..7dd2232
--- /dev/null
+++ b/pypi_llm/utils/score_calculator.py
@@ -0,0 +1,34 @@
+import polars as pl
+
+
+def calculate_score(df: pl.DataFrame, weight_similarity=0.5, weight_weekly_downloads=0.5) -> pl.DataFrame:
+ """
+ Calculate a combined score based on similarity and weekly downloads.
+
+    The function ranks similarity and weekly downloads, normalizes these
+    ranks to a [0, 1] scale, and combines them using the given weights.
+
+    Args:
+        df (pl.DataFrame): DataFrame containing 'similarity' and 'weekly_downloads' columns.
+        weight_similarity (float): Weight for the similarity rank. Default is 0.5.
+        weight_weekly_downloads (float): Weight for the weekly downloads rank. Default is 0.5.
+    Returns:
+        pl.DataFrame: The input with a 'score' column added, sorted by score descending.
+    """
+ df = df.with_columns(
+ rank_similarity=pl.col("similarity").rank("dense", descending=False),
+ rank_weekly_downloads=pl.col("weekly_downloads").rank("dense", descending=False),
+ )
+
+ df = df.with_columns(
+ normalized_similarity=(pl.col("rank_similarity") - 1) / (df["rank_similarity"].max() - 1),
+ normalized_weekly_downloads=(pl.col("rank_weekly_downloads") - 1) / (df["rank_weekly_downloads"].max() - 1),
+ )
+
+ df = df.with_columns(
+ score=weight_similarity * pl.col("normalized_similarity")
+ + weight_weekly_downloads * pl.col("normalized_weekly_downloads")
+ )
+
+ df = df.sort("score", descending=True)
+ return df