diff --git a/.gitignore b/.gitignore
index 1a607ed..bf23a33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,5 +166,4 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
-/data
.env
diff --git a/Dockerfile b/Dockerfile
index 1c340da..161ac51 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
# syntax=docker/dockerfile:1
-FROM python:3.9-slim-buster
+FROM python:3.10-slim-buster
-ENV POETRY_VERSION=1.4 \
+ENV POETRY_VERSION=1.6 \
POETRY_VIRTUALENVS_CREATE=false
# Install poetry
@@ -18,4 +18,6 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev
# Copy Python code to the Docker image
COPY pypi_llm /code/pypi_llm/
+ENV PYTHONPATH=/code
+
CMD [ "python", "pypi_llm/foo.py"]
diff --git a/README.md b/README.md
index be632a1..56c95c2 100644
--- a/README.md
+++ b/README.md
@@ -1,55 +1,20 @@
# pypi-llm
-[![Release](https://img.shields.io/github/v/release/fpgmaas/pypi-llm)](https://img.shields.io/github/v/release/fpgmaas/pypi-llm)
-[![Build status](https://img.shields.io/github/actions/workflow/status/fpgmaas/pypi-llm/main.yml?branch=main)](https://github.com/fpgmaas/pypi-llm/actions/workflows/main.yml?query=branch%3Amain)
-[![codecov](https://codecov.io/gh/fpgmaas/pypi-llm/branch/main/graph/badge.svg)](https://codecov.io/gh/fpgmaas/pypi-llm)
-[![Commit activity](https://img.shields.io/github/commit-activity/m/fpgmaas/pypi-llm)](https://img.shields.io/github/commit-activity/m/fpgmaas/pypi-llm)
-[![License](https://img.shields.io/github/license/fpgmaas/pypi-llm)](https://img.shields.io/github/license/fpgmaas/pypi-llm)
+Download the raw dataset from Google Drive: https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing
-This is a template repository for Python projects that use Poetry for their dependency management.
+# setup
-- **Github repository**:
-- **Documentation**
-
-## Getting started with your project
-
-First, create a repository on GitHub with the same name as this project, and then run the following commands:
-
-```bash
-git init -b main
-git add .
-git commit -m "init commit"
-git remote add origin git@github.com:fpgmaas/pypi-llm.git
-git push -u origin main
```
-
-Finally, install the environment and the pre-commit hooks with
-
-```bash
-make install
+docker build -t pypi-llm .
```
-You are now ready to start development on your project!
-The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
-
-To finalize the set-up for publishing to PyPi or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
-For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
-To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
-
-## Releasing a new version
-
-- Create an API Token on [Pypi](https://pypi.org/).
-- Add the API Token to your projects secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/fpgmaas/pypi-llm/settings/secrets/actions/new).
-- Create a [new release](https://github.com/fpgmaas/pypi-llm/releases/new) on Github.
-- Create a new tag in the form `*.*.*`.
-
-For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
-
----
-
-Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
-
----
+```
+docker run --rm \
+ --env-file .env \
+ -v $(pwd)/data:/code/data \
+ pypi-llm \
+ python /code/pypi_llm/scripts/1_download_dataset.py
+```
## total
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/frontend/app/components/InfoBox.tsx b/frontend/app/components/InfoBox.tsx
new file mode 100644
index 0000000..23f757f
--- /dev/null
+++ b/frontend/app/components/InfoBox.tsx
@@ -0,0 +1,30 @@
+import React from "react";
+
+interface InfoBoxProps {
+ infoBoxVisible: boolean;
+}
+
+const InfoBox: React.FC<InfoBoxProps> = ({ infoBoxVisible }) => {
+ if (!infoBoxVisible) return null;
+
+ return (
+
+
How does this work?
+
+ This application allows you to search for Python packages on PyPi using
+ natural language. An example query would be "a package that creates
+ plots and beautiful visualizations".
+
+
+
+ Once you click search, your query will be matched against the summary
+ and the first part of the description of all PyPi packages with more
+ than 50 weekly downloads. The results are then scored based on their
+ similarity and their number of weekly downloads, and the thirty best
+ results are displayed in the table below.
+
- This application allows you to search for Python packages on PyPi
- using natural language. So an example query would be "a package that
- creates plots and beautiful visualizations". Once you click search,
- your query will be matched against the summary and the first part of
- the description of all PyPi packages with more than 50 weekly
- downloads, and the 50 most similar results will be displayed in a
- table below.
-
-
- )}
+
{results.length > 0 && (
diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts
new file mode 100644
index 0000000..b8abf25
--- /dev/null
+++ b/frontend/app/utils/search.ts
@@ -0,0 +1,52 @@
+import axios from "axios";
+
+interface Match {
+ name: string;
+ similarity: number;
+ weekly_downloads: number;
+ summary: string;
+}
+
+export const handleSearch = async (
+ query: string,
+ sortField: string,
+ sortDirection: string,
+  setResults: React.Dispatch<React.SetStateAction<Match[]>>,
+  setLoading: React.Dispatch<React.SetStateAction<boolean>>,
+  setError: React.Dispatch<React.SetStateAction<string>>,
+) => {
+ setLoading(true);
+ setError("");
+ try {
+ const response = await axios.post(
+ "http://localhost:8000/search",
+ {
+ query: query,
+ },
+ {
+ headers: {
+ "Content-Type": "application/json",
+ },
+ },
+ );
+ const fetchedResults: Match[] = response.data.matches;
+ setResults(sortResults(fetchedResults, sortField, sortDirection));
+ } catch (error) {
+ setError("Error fetching search results.");
+ console.error("Error fetching search results:", error);
+ } finally {
+ setLoading(false);
+ }
+};
+
+export const sortResults = (
+ data: Match[],
+ field: string,
+ direction: string,
+): Match[] => {
+ return [...data].sort((a, b) => {
+ if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
+ if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
+ return 0;
+ });
+};
diff --git a/poetry.lock b/poetry.lock
index ca0f5a5..f5845a9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -841,6 +841,26 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
tqdm = ["tqdm"]
+[[package]]
+name = "gdown"
+version = "5.2.0"
+description = "Google Drive Public File/Folder Downloader"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "gdown-5.2.0-py3-none-any.whl", hash = "sha256:33083832d82b1101bdd0e9df3edd0fbc0e1c5f14c9d8c38d2a35bf1683b526d6"},
+ {file = "gdown-5.2.0.tar.gz", hash = "sha256:2145165062d85520a3cd98b356c9ed522c5e7984d408535409fd46f94defc787"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+filelock = "*"
+requests = {version = "*", extras = ["socks"]}
+tqdm = "*"
+
+[package.extras]
+test = ["build", "mypy", "pytest", "pytest-xdist", "ruff", "twine", "types-requests", "types-setuptools"]
+
[[package]]
name = "ghp-import"
version = "2.1.0"
@@ -2992,6 +3012,18 @@ tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
docs = ["furo (>=2023.8.19)", "sphinx (<7.2)", "sphinx-autodoc-typehints (>=1.24)"]
testing = ["covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "setuptools (>=68.1.2)", "wheel (>=0.41.2)"]
+[[package]]
+name = "pysocks"
+version = "1.7.1"
+description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"},
+ {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"},
+ {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
+]
+
[[package]]
name = "pytest"
version = "7.4.4"
@@ -3426,6 +3458,7 @@ files = [
certifi = ">=2017.4.17"
charset-normalizer = ">=2,<4"
idna = ">=2.5,<4"
+PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""}
urllib3 = ">=1.21.1,<3"
[package.extras]
@@ -4902,4 +4935,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata]
lock-version = "2.0"
python-versions = ">=3.8,<4.0"
-content-hash = "05b453b7e4b34fce7a4a0023f82b194a23dc13b35b57e02218e80f0417694913"
+content-hash = "3b9517bb553ec909b8f2d6cb96baca236709a5964b5e5f05208b125591ed9270"
diff --git a/pypi_llm/api/main.py b/pypi_llm/api/main.py
index d5abfc5..1aa49a1 100644
--- a/pypi_llm/api/main.py
+++ b/pypi_llm/api/main.py
@@ -1,3 +1,5 @@
+import logging
+
import polars as pl
from dotenv import load_dotenv
from fastapi import FastAPI
@@ -6,9 +8,12 @@
from sentence_transformers import SentenceTransformer
from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
from pypi_llm.utils.score_calculator import calculate_score
from pypi_llm.vector_database import VectorDatabaseInterface
+setup_logging()
+
app = FastAPI()
load_dotenv()
@@ -55,12 +60,20 @@ class SearchResponse(BaseModel):
@app.post("/search/", response_model=SearchResponse)
async def search(query: QueryModel):
+ """
+ Search for the packages whose summary and description have the highest similarity to the query.
+ We take the top_k * 2 most similar packages, and then calculate weighted score based on the similarity and weekly downloads.
+ The top_k packages with the highest score are returned.
+ """
+
+ logging.info(f"Searching for similar projects. Query: '{query.query}'")
df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
df_matches = df_matches.join(df, how="left", on="name")
+ logging.info("Found similar projects. Calculating the weighted scores and filtering...")
df_matches = calculate_score(df_matches)
df_matches = df_matches.sort("score", descending=True)
df_matches = df_matches.head(query.top_k)
- print("sending")
+ logging.info("Returning the results...")
return SearchResponse(matches=df_matches.to_dicts())
diff --git a/pypi_llm/config.py b/pypi_llm/config.py
index afe4f68..88c69b4 100644
--- a/pypi_llm/config.py
+++ b/pypi_llm/config.py
@@ -5,14 +5,17 @@
@dataclass
class Config:
- DATA_DIR: Path = Path("data")
PINECONE_INDEX_NAME = "pypi"
PINECONE_NAMESPACE = "ns1"
PINECONE_TOKEN: str = field(default_factory=lambda: os.getenv("PINECONE_TOKEN"))
+
EMBEDDINGS_MODEL_NAME = "all-mpnet-base-v2"
EMBEDDINGS_DIMENSION = 768
- PROCESSED_DATASET_CSV_NAME = "dataset.csv"
+ DATA_DIR: Path = Path("data")
+ RAW_DATASET_CSV_NAME = "raw_dataset.csv"
+ PROCESSED_DATASET_CSV_NAME = "processed_dataset.csv"
+ GOOGLE_FILE_ID = "1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq"
def __post_init__(self):
if not self.PINECONE_TOKEN:
diff --git a/pypi_llm/data/description_cleaner.py b/pypi_llm/data/description_cleaner.py
index 3b1b077..3ef2f1b 100644
--- a/pypi_llm/data/description_cleaner.py
+++ b/pypi_llm/data/description_cleaner.py
@@ -9,11 +9,36 @@
@dataclass
class DescriptionCleaner:
+ """
+ A class that provides methods to clean PyPi package descriptions in a DataFrame column.
+ """
+
def clean(self, df: pl.DataFrame, input_col: str, output_col: str) -> pl.DataFrame:
+ """
+ Cleans the text in the specified DataFrame column and returns the modified DataFrame.
+
+ Args:
+ df (pl.DataFrame): The DataFrame containing the text column to be cleaned.
+ input_col (str): The name of the input column containing the text to be cleaned.
+ output_col (str): The name of the output column to store the cleaned text.
+
+ Returns:
+ pl.DataFrame: The modified DataFrame with the cleaned text.
+ """
df = df.with_columns(pl.col(input_col).apply(self._clean_text).alias(output_col))
return df
def _clean_text(self, text: str) -> str:
+ """
+ Cleans the given text by removing HTML tags, markdown image links, markdown badges,
+ markdown links, URLs, special markdown characters, markdown headers, and extra whitespaces.
+
+ Args:
+ text (str): The text to be cleaned.
+
+ Returns:
+ str: The cleaned text.
+ """
try:
text = self._remove_html_tags(text)
text = self._remove_markdown_image_links(text)
diff --git a/pypi_llm/data/reader.py b/pypi_llm/data/reader.py
index 33b94f8..d1b6847 100644
--- a/pypi_llm/data/reader.py
+++ b/pypi_llm/data/reader.py
@@ -6,10 +6,21 @@
@dataclass
class DataReader:
- data_dir: Path
+ """
+ A class for reading and processing data from a raw PyPi dataset.
+ """
+
+ raw_dataset: Path
def read(self):
- df = pl.read_csv(self.data_dir / "pypi_dataset.csv")
+ """
+ Reads the raw dataset, performs data processing operations, and returns the processed dataframe.
+ The dataset should at least have the following columns: name, description, and number_of_downloads.
+
+ Returns:
+ DataFrame: The processed dataframe.
+ """
+ df = pl.read_csv(self.raw_dataset)
df = df.with_columns(weekly_downloads=(pl.col("number_of_downloads") / 4).round().cast(pl.Int32))
df = df.drop("number_of_downloads")
df = df.unique(subset="name")
diff --git a/pypi_llm/scripts/0_setup_pinecone.py b/pypi_llm/scripts/0_setup_pinecone.py
new file mode 100644
index 0000000..3ff1a2c
--- /dev/null
+++ b/pypi_llm/scripts/0_setup_pinecone.py
@@ -0,0 +1,32 @@
+import logging
+
+from dotenv import load_dotenv
+from pinecone import Pinecone, ServerlessSpec
+
+from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+ """
+ This script sets up a Pinecone index for storing embeddings.
+
+ It loads the environment variables from a .env file, creates a Pinecone client,
+ and creates an index with the specified name, dimension, metric, and serverless specification.
+ """
+
+ load_dotenv()
+ config = Config()
+
+ logging.info("Connection to Pinecone..")
+ pc = Pinecone(api_key=config.PINECONE_TOKEN)
+
+ logging.info("Creating Pinecone index..")
+ pc.create_index(
+ name=config.PINECONE_INDEX_NAME,
+ dimension=config.EMBEDDINGS_DIMENSION,
+ metric="dotproduct",
+ spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+ )
+ logging.info("Done!")
diff --git a/pypi_llm/scripts/1_download_dataset.py b/pypi_llm/scripts/1_download_dataset.py
new file mode 100644
index 0000000..3d4250e
--- /dev/null
+++ b/pypi_llm/scripts/1_download_dataset.py
@@ -0,0 +1,22 @@
+import logging
+
+import gdown
+from dotenv import load_dotenv
+
+from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+ """
+ Downloads the dataset from a Google Drive link using the gdown library.
+ """
+ load_dotenv()
+ config = Config()
+
+ logging.info("Downloading raw dataset from Google Drive...")
+ url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
+ output = str(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
+ gdown.download(url, output, quiet=False)
+ logging.info("Done!")
diff --git a/pypi_llm/scripts/2_process_dataset.py b/pypi_llm/scripts/2_process_dataset.py
new file mode 100644
index 0000000..cb35337
--- /dev/null
+++ b/pypi_llm/scripts/2_process_dataset.py
@@ -0,0 +1,31 @@
+import logging
+
+import polars as pl
+from dotenv import load_dotenv
+
+from pypi_llm.config import Config
+from pypi_llm.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
+from pypi_llm.data.reader import DataReader
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+ """
+ This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
+ """
+
+ load_dotenv()
+ config = Config()
+
+ logging.info("Reading the raw dataset...")
+ df = DataReader(config.DATA_DIR / config.RAW_DATASET_CSV_NAME).read()
+
+ logging.info("Cleaning the descriptions...")
+ df = DescriptionCleaner().clean(df, "description", "description_cleaned")
+ df = df.filter(~pl.col("description_cleaned").is_null())
+ df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
+
+ logging.info("Storing the processed dataset...")
+ df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+ logging.info("Done!")
diff --git a/pypi_llm/scripts/upsert_data.py b/pypi_llm/scripts/3_upsert_data.py
similarity index 65%
rename from pypi_llm/scripts/upsert_data.py
rename to pypi_llm/scripts/3_upsert_data.py
index e7b7e43..9206edd 100644
--- a/pypi_llm/scripts/upsert_data.py
+++ b/pypi_llm/scripts/3_upsert_data.py
@@ -1,24 +1,26 @@
+import logging
+
import polars as pl
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pypi_llm.config import Config
-from pypi_llm.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_llm.data.reader import DataReader
+from pypi_llm.utils.logging import setup_logging
from pypi_llm.vector_database import VectorDatabaseInterface
+setup_logging()
+
if __name__ == "__main__":
+ """
+ Upserts data from a processed dataset CSV into a vector database.
+ """
load_dotenv()
config = Config()
- df = DataReader(config.DATA_DIR).read()
-
- df = DescriptionCleaner().clean(df, "description", "description_cleaned")
- df = df.filter(~pl.col("description_cleaned").is_null())
- df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
-
- df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+ logging.info("Reading the processed dataset...")
+ df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+ logging.info("Connecting to the vector database..")
vector_database_interface = VectorDatabaseInterface(
pinecone_token=config.PINECONE_TOKEN,
pinecone_index_name=config.PINECONE_INDEX_NAME,
@@ -26,7 +28,9 @@
pinecone_namespace=config.PINECONE_NAMESPACE,
)
+ logging.info("Upserting data into the vector database..")
df = df.with_columns(
summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned"))
)
vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
+ logging.info("Done!")
diff --git a/pypi_llm/scripts/setup_pinecone.py b/pypi_llm/scripts/setup_pinecone.py
deleted file mode 100644
index 126499a..0000000
--- a/pypi_llm/scripts/setup_pinecone.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from dotenv import load_dotenv
-from pinecone import Pinecone, ServerlessSpec
-
-from pypi_llm.config import Config
-
-if __name__ == "__main__":
- load_dotenv()
- config = Config()
-
- pc = Pinecone(api_key=config.PINECONE_TOKEN)
-
- pc.create_index(
- name=config.PINECONE_INDEX_NAME,
- dimension=config.EMBEDDINGS_DIMENSION,
- metric="dotproduct",
- spec=ServerlessSpec(cloud="aws", region="us-east-1"),
- )
diff --git a/pypi_llm/utils/logging.py b/pypi_llm/utils/logging.py
new file mode 100644
index 0000000..9ddc72c
--- /dev/null
+++ b/pypi_llm/utils/logging.py
@@ -0,0 +1,9 @@
+import logging
+
+
+def setup_logging():
+ logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+ handlers=[logging.StreamHandler()],
+ )
diff --git a/pypi_llm/vector_database/interface.py b/pypi_llm/vector_database/interface.py
index d42496b..ecba020 100644
--- a/pypi_llm/vector_database/interface.py
+++ b/pypi_llm/vector_database/interface.py
@@ -5,6 +5,17 @@
class VectorDatabaseInterface:
+ """
+ A class that provides an interface for interacting with a vector database.
+
+ Args:
+ pinecone_token (str): The Pinecone API token.
+ pinecone_index_name (str): The name of the Pinecone index.
+ pinecone_namespace (str): The namespace for the Pinecone index.
+ embeddings_model (SentenceTransformer): The sentence transformer model for encoding text into embeddings.
+ batch_size (int, optional): The batch size for upserting data. Defaults to 250.
+ """
+
def __init__(
self,
pinecone_token: str,
@@ -20,11 +31,29 @@ def __init__(
self.pinecone_namespace = pinecone_namespace
def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
+ """
+ Upserts the data from a Polars DataFrame into the vector database.
+
+ Args:
+ df (pl.DataFrame): The Polars DataFrame containing the data to be upserted.
+ key_column (str): The name of the column in the DataFrame containing the unique keys.
+ text_column (str): The name of the column in the DataFrame containing the text data.
+ """
df_chunks = self._split_dataframe_in_batches(df)
for chunk in tqdm(df_chunks, desc="Upserting batches", unit="batch"):
self._upsert_chunk(chunk, key_column, text_column)
def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame:
+ """
+ Finds similar vectors in the database for a given query.
+
+ Args:
+ query (str): The query string.
+ top_k (int, optional): The number of similar vectors to retrieve. Defaults to 25.
+
+ Returns:
+ pl.DataFrame: A Polars DataFrame containing the similar vectors and their similarity scores.
+ """
embeddings = self.model.encode(query)
matches = self.index.query(
namespace=self.pinecone_namespace, vector=embeddings.tolist(), top_k=top_k, include_values=False
diff --git a/pyproject.toml b/pyproject.toml
index d967012..8319198 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ tqdm = "^4.66.4"
fastapi = "^0.111.0"
pydantic = "^2.7.4"
uvicorn = "^0.30.1"
+gdown = "^5.2.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.0"
diff --git a/tests/test_foo.py b/tests/test_foo.py
deleted file mode 100644
index 2fef3fe..0000000
--- a/tests/test_foo.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from pypi_llm.foo import foo
-
-
-def test_foo():
- assert foo("foo") == "foo"