diff --git a/.gitignore b/.gitignore
index 1a607ed..bf23a33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,5 +166,4 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-/data
 .env
diff --git a/Dockerfile b/Dockerfile
index 1c340da..161ac51 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
 # syntax=docker/dockerfile:1
 
-FROM python:3.9-slim-buster
+FROM python:3.10-slim-buster
 
-ENV POETRY_VERSION=1.4 \
+ENV POETRY_VERSION=1.6 \
     POETRY_VIRTUALENVS_CREATE=false
 
 # Install poetry
@@ -18,4 +18,6 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev
 # Copy Python code to the Docker image
 COPY pypi_llm /code/pypi_llm/
 
+ENV PYTHONPATH=/code
+
 CMD [ "python", "pypi_llm/foo.py"]
diff --git a/README.md b/README.md
index be632a1..56c95c2 100644
--- a/README.md
+++ b/README.md
@@ -1,55 +1,20 @@
 # pypi-llm
 
-[![Release](https://img.shields.io/github/v/release/fpgmaas/pypi-llm)](https://img.shields.io/github/v/release/fpgmaas/pypi-llm)
-[![Build status](https://img.shields.io/github/actions/workflow/status/fpgmaas/pypi-llm/main.yml?branch=main)](https://github.com/fpgmaas/pypi-llm/actions/workflows/main.yml?query=branch%3Amain)
-[![codecov](https://codecov.io/gh/fpgmaas/pypi-llm/branch/main/graph/badge.svg)](https://codecov.io/gh/fpgmaas/pypi-llm)
-[![Commit activity](https://img.shields.io/github/commit-activity/m/fpgmaas/pypi-llm)](https://img.shields.io/github/commit-activity/m/fpgmaas/pypi-llm)
-[![License](https://img.shields.io/github/license/fpgmaas/pypi-llm)](https://img.shields.io/github/license/fpgmaas/pypi-llm)
+[Raw dataset (Google Drive)](https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing)
 
-This is a template repository for Python projects that use Poetry for their dependency management.
+## Setup
 
-- **Github repository**: <https://github.com/fpgmaas/pypi-llm/>
-- **Documentation** <https://fpgmaas.github.io/pypi-llm/>
-
-## Getting started with your project
-
-First, create a repository on GitHub with the same name as this project, and then run the following commands:
-
-```bash
-git init -b main
-git add .
-git commit -m "init commit"
-git remote add origin git@github.com:fpgmaas/pypi-llm.git
-git push -u origin main
 ```
-
-Finally, install the environment and the pre-commit hooks with
-
-```bash
-make install
+docker build -t pypi-llm .
 ```
 
-You are now ready to start development on your project!
-The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
-
-To finalize the set-up for publishing to PyPi or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
-For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
-To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
-
-## Releasing a new version
-
-- Create an API Token on [Pypi](https://pypi.org/).
-- Add the API Token to your projects secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/fpgmaas/pypi-llm/settings/secrets/actions/new).
-- Create a [new release](https://github.com/fpgmaas/pypi-llm/releases/new) on Github.
-- Create a new tag in the form `*.*.*`.
-
-For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
-
----
-
-Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
-
+
+```
+docker run --rm \
+  --env-file .env \
+  -v $(pwd)/data:/code/data \
+  pypi-llm \
+  python /code/pypi_llm/scripts/1_download_dataset.py
+```
 
 ## total
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/frontend/app/components/InfoBox.tsx b/frontend/app/components/InfoBox.tsx
new file mode 100644
index 0000000..23f757f
--- /dev/null
+++ b/frontend/app/components/InfoBox.tsx
@@ -0,0 +1,30 @@
+import React from "react";
+
+interface InfoBoxProps {
+  infoBoxVisible: boolean;
+}
+
+const InfoBox: React.FC<InfoBoxProps> = ({ infoBoxVisible }) => {
+  if (!infoBoxVisible) return null;
+
+  return (
+    <div>
+      {/* Markup simplified: the original wrapper tags and class names are not preserved here. */}
+      <h2>How does this work?</h2>
+      <p>
+        This application allows you to search for Python packages on PyPi using
+        natural language. An example query would be "a package that creates
+        plots and beautiful visualizations".
+      </p>
+      <p>
+        Once you click search, your query will be matched against the summary
+        and the first part of the description of all PyPi packages with more
+        than 50 weekly downloads. The results are then scored based on their
+        similarity and their number of weekly downloads, and the thirty best
+        results are displayed in the table below.
+      </p>
+    </div>
+  );
+};
+
+export default InfoBox;
diff --git a/frontend/components/SearchResultsTable.tsx b/frontend/app/components/SearchResultsTable.tsx
similarity index 100%
rename from frontend/components/SearchResultsTable.tsx
rename to frontend/app/components/SearchResultsTable.tsx
diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx
index f6cc69c..48b7210 100644
--- a/frontend/app/page.tsx
+++ b/frontend/app/page.tsx
@@ -1,53 +1,28 @@
 "use client";
 
 import { useState } from "react";
-import axios from "axios";
-import SearchResultsTable from "../components/SearchResultsTable";
+import { handleSearch, sortResults } from "./utils/search";
+import SearchResultsTable from "./components/SearchResultsTable";
+import InfoBox from "./components/InfoBox";
 import { ClipLoader } from "react-spinners";
 
-export default function Home() {
-  const [text, setText] = useState("");
-  const [results, setResults] = useState([]);
-  const [sortField, setSortField] = useState("weekly_downloads");
-  const [sortDirection, setSortDirection] = useState("desc");
-  const [loading, setLoading] = useState(false);
-  const [error, setError] = useState("");
-  const [infoBoxVisible, setInfoBoxVisible] = useState(false);
-
-  const handleSearch = async () => {
-    setLoading(true);
-    setError("");
-    try {
-      const response = await axios.post(
-        "http://localhost:8000/search",
-        {
-          query: text,
-        },
-        {
-          headers: {
-            "Content-Type": "application/json",
-          },
-        },
-      );
-      const fetchedResults = response.data.matches;
-      setResults(sortResults(fetchedResults, sortField, sortDirection));
-    } catch (error) {
-      setError("Error fetching search results.");
-      console.error("Error fetching search results:", error);
-    } finally {
-      setLoading(false);
-    }
-  };
+interface Match {
+  name: string;
+  similarity: number;
+  weekly_downloads: number;
+  summary: string;
+}
 
-  const sortResults = (data, field, direction) => {
-    return [...data].sort((a, b) => {
-      if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
-      if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
-      return 0;
-    });
-  };
+export default function Home() {
+  const [text, setText] = useState("");
+  const [results, setResults] = useState<Match[]>([]);
+  const [sortField, setSortField] = useState("similarity");
+  const [sortDirection, setSortDirection] = useState("desc");
+  const [loading, setLoading] = useState(false);
+  const [error, setError] = useState("");
+  const [infoBoxVisible, setInfoBoxVisible] = useState(false);
 
-  const handleSort = (field) => {
+  const handleSort = (field: string) => {
     const direction =
       sortField === field && sortDirection === "asc" ? "desc" : "asc";
     setSortField(field);
@@ -72,7 +47,16 @@ export default function Home() {
         >
@@ -91,20 +75,7 @@ export default function Home() {
-        {infoBoxVisible && (
-          <div>
-            <h2>How does this work?</h2>
-            <p>
-              This application allows you to search for Python packages on PyPi
-              using natural language. So an example query would be "a package that
-              creates plots and beautiful visualizations". Once you click search,
-              your query will be matched against the summary and the first part of
-              the description of all PyPi packages with more than 50 weekly
-              downloads, and the 50 most similar results will be displayed in a
-              table below.
-            </p>
-          </div>
-        )}
+      {results.length > 0 && (
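The hunk that actually re-wires `page.tsx` to these pieces (`@@ -72,7 +47,16 @@`) did not survive intact above, so the call site is not visible. What follows is a minimal sketch, not the project's real markup, of how the extracted `handleSearch` helper and the new `InfoBox` component are presumably hooked up: the helper's parameter list and the `infoBoxVisible` prop come from the diff, while the component name, element structure, and the hard-coded sort settings are assumptions.

```tsx
"use client";

import { useState } from "react";
import { handleSearch } from "./utils/search";
import InfoBox from "./components/InfoBox";

// Hypothetical example component, not part of the diff.
export default function SearchExample() {
  const [text, setText] = useState("");
  const [results, setResults] = useState<any[]>([]);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState("");
  const [infoBoxVisible, setInfoBoxVisible] = useState(false);

  // handleSearch takes the query plus the current sort settings and the three
  // state setters, exactly as declared in frontend/app/utils/search.ts.
  const onSearch = () =>
    handleSearch(text, "similarity", "desc", setResults, setLoading, setError);

  return (
    <div>
      <input value={text} onChange={(e) => setText(e.target.value)} />
      <button onClick={onSearch} disabled={loading}>
        Search
      </button>
      <button onClick={() => setInfoBoxVisible(!infoBoxVisible)}>
        How does this work?
      </button>
      <InfoBox infoBoxVisible={infoBoxVisible} />
      {error && <p>{error}</p>}
      <pre>{JSON.stringify(results, null, 2)}</pre>
    </div>
  );
}
```

Passing the state setters into `handleSearch` keeps the helper itself free of component state, which is what makes the extraction into `app/utils/search.ts` possible.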
diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts
new file mode 100644
index 0000000..b8abf25
--- /dev/null
+++ b/frontend/app/utils/search.ts
@@ -0,0 +1,52 @@
+import axios from "axios";
+
+interface Match {
+  name: string;
+  similarity: number;
+  weekly_downloads: number;
+  summary: string;
+}
+
+export const handleSearch = async (
+  query: string,
+  sortField: string,
+  sortDirection: string,
+  setResults: React.Dispatch<React.SetStateAction<Match[]>>,
+  setLoading: React.Dispatch<React.SetStateAction<boolean>>,
+  setError: React.Dispatch<React.SetStateAction<string>>,
+) => {
+  setLoading(true);
+  setError("");
+  try {
+    const response = await axios.post(
+      "http://localhost:8000/search",
+      {
+        query: query,
+      },
+      {
+        headers: {
+          "Content-Type": "application/json",
+        },
+      },
+    );
+    const fetchedResults: Match[] = response.data.matches;
+    setResults(sortResults(fetchedResults, sortField, sortDirection));
+  } catch (error) {
+    setError("Error fetching search results.");
+    console.error("Error fetching search results:", error);
+  } finally {
+    setLoading(false);
+  }
+};
+
+export const sortResults = (
+  data: Match[],
+  field: string,
+  direction: string,
+): Match[] => {
+  return [...data].sort((a, b) => {
+    if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
+    if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
+    return 0;
+  });
+};
diff --git a/poetry.lock b/poetry.lock
index ca0f5a5..f5845a9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -841,6 +841,26 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
 test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
 tqdm = ["tqdm"]
 
+[[package]]
+name = "gdown"
+version = "5.2.0"
+description = "Google Drive Public File/Folder Downloader"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "gdown-5.2.0-py3-none-any.whl", hash = "sha256:33083832d82b1101bdd0e9df3edd0fbc0e1c5f14c9d8c38d2a35bf1683b526d6"},
+    {file = "gdown-5.2.0.tar.gz", hash = "sha256:2145165062d85520a3cd98b356c9ed522c5e7984d408535409fd46f94defc787"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+filelock = "*"
+requests = {version = "*", extras = ["socks"]}
+tqdm = "*"
+
+[package.extras]
+test = ["build", "mypy", "pytest", "pytest-xdist", "ruff", "twine", "types-requests", "types-setuptools"]
+
 [[package]]
 name = "ghp-import"
 version = "2.1.0"
@@ -2992,6 +3012,18 @@ tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
 docs = ["furo (>=2023.8.19)", "sphinx (<7.2)", "sphinx-autodoc-typehints (>=1.24)"]
 testing = ["covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "setuptools (>=68.1.2)", "wheel (>=0.41.2)"]
 
+[[package]]
+name = "pysocks"
+version = "1.7.1"
+description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"},
+    {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"},
+    {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
+]
+
 [[package]]
 name = "pytest"
 version = "7.4.4"
@@ -3426,6 +3458,7 @@ files = [
 certifi = ">=2017.4.17"
 charset-normalizer = ">=2,<4"
 idna = ">=2.5,<4"
+PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""}
 urllib3 = ">=1.21.1,<3"
 
 [package.extras]
@@ -4902,4 +4935,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<4.0"
-content-hash = "05b453b7e4b34fce7a4a0023f82b194a23dc13b35b57e02218e80f0417694913"
+content-hash = "3b9517bb553ec909b8f2d6cb96baca236709a5964b5e5f05208b125591ed9270"
diff --git a/pypi_llm/api/main.py b/pypi_llm/api/main.py
index d5abfc5..1aa49a1 100644
--- a/pypi_llm/api/main.py
+++ b/pypi_llm/api/main.py
@@ -1,3 +1,5 @@
+import logging
+
 import polars as pl
 from dotenv import load_dotenv
 from fastapi import FastAPI
@@ -6,9 +8,12 @@ from sentence_transformers import SentenceTransformer
 
 from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
 from pypi_llm.utils.score_calculator import calculate_score
 from pypi_llm.vector_database import VectorDatabaseInterface
 
+setup_logging()
+
 app = FastAPI()
 
 load_dotenv()
@@ -55,12 +60,20 @@ class SearchResponse(BaseModel):
 
 @app.post("/search/", response_model=SearchResponse)
 async def search(query: QueryModel):
+    """
+    Search for the packages whose summary and description have the highest similarity to the query.
+    We take the top_k * 2 most similar packages, and then calculate a weighted score based on the similarity and weekly downloads.
+    The top_k packages with the highest score are returned.
+    """
+
+    logging.info(f"Searching for similar projects. Query: '{query.query}'")
     df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
     df_matches = df_matches.join(df, how="left", on="name")
+    logging.info("Found similar projects. Calculating the weighted scores and filtering...")
     df_matches = calculate_score(df_matches)
     df_matches = df_matches.sort("score", descending=True)
     df_matches = df_matches.head(query.top_k)
 
-    print("sending")
+    logging.info("Returning the results...")
 
     return SearchResponse(matches=df_matches.to_dicts())
diff --git a/pypi_llm/config.py b/pypi_llm/config.py
index afe4f68..88c69b4 100644
--- a/pypi_llm/config.py
+++ b/pypi_llm/config.py
@@ -5,14 +5,17 @@
 
 @dataclass
 class Config:
-    DATA_DIR: Path = Path("data")
     PINECONE_INDEX_NAME = "pypi"
     PINECONE_NAMESPACE = "ns1"
     PINECONE_TOKEN: str = field(default_factory=lambda: os.getenv("PINECONE_TOKEN"))
+
     EMBEDDINGS_MODEL_NAME = "all-mpnet-base-v2"
     EMBEDDINGS_DIMENSION = 768
-    PROCESSED_DATASET_CSV_NAME = "dataset.csv"
+
+    DATA_DIR: Path = Path("data")
+    RAW_DATASET_CSV_NAME = "raw_dataset.csv"
+    PROCESSED_DATASET_CSV_NAME = "processed_dataset.csv"
+    GOOGLE_FILE_ID = "1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq"
 
     def __post_init__(self):
         if not self.PINECONE_TOKEN:
diff --git a/pypi_llm/data/description_cleaner.py b/pypi_llm/data/description_cleaner.py
index 3b1b077..3ef2f1b 100644
--- a/pypi_llm/data/description_cleaner.py
+++ b/pypi_llm/data/description_cleaner.py
@@ -9,11 +9,36 @@
 
 @dataclass
 class DescriptionCleaner:
+    """
+    A class that provides methods to clean PyPi package descriptions in a DataFrame column.
+    """
+
     def clean(self, df: pl.DataFrame, input_col: str, output_col: str) -> pl.DataFrame:
+        """
+        Cleans the text in the specified DataFrame column and returns the modified DataFrame.
+
+        Args:
+            df (pl.DataFrame): The DataFrame containing the text column to be cleaned.
+            input_col (str): The name of the input column containing the text to be cleaned.
+            output_col (str): The name of the output column to store the cleaned text.
+
+        Returns:
+            pl.DataFrame: The modified DataFrame with the cleaned text.
+        """
         df = df.with_columns(pl.col(input_col).apply(self._clean_text).alias(output_col))
         return df
 
     def _clean_text(self, text: str) -> str:
+        """
+        Cleans the given text by removing HTML tags, markdown image links, markdown badges,
+        markdown links, URLs, special markdown characters, markdown headers, and extra whitespaces.
+
+        Args:
+            text (str): The text to be cleaned.
+
+        Returns:
+            str: The cleaned text.
+        """
         try:
             text = self._remove_html_tags(text)
             text = self._remove_markdown_image_links(text)
diff --git a/pypi_llm/data/reader.py b/pypi_llm/data/reader.py
index 33b94f8..d1b6847 100644
--- a/pypi_llm/data/reader.py
+++ b/pypi_llm/data/reader.py
@@ -6,10 +6,21 @@
 
 @dataclass
 class DataReader:
-    data_dir: Path
+    """
+    A class for reading and processing data from a raw PyPi dataset.
+    """
+
+    raw_dataset: Path
 
     def read(self):
-        df = pl.read_csv(self.data_dir / "pypi_dataset.csv")
+        """
+        Reads the raw dataset, performs data processing operations, and returns the processed dataframe.
+        The dataset should at least have the following columns: name, description, and number_of_downloads.
+
+        Returns:
+            DataFrame: The processed dataframe.
+        """
+        df = pl.read_csv(self.raw_dataset)
         df = df.with_columns(weekly_downloads=(pl.col("number_of_downloads") / 4).round().cast(pl.Int32))
         df = df.drop("number_of_downloads")
         df = df.unique(subset="name")
diff --git a/pypi_llm/scripts/0_setup_pinecone.py b/pypi_llm/scripts/0_setup_pinecone.py
new file mode 100644
index 0000000..3ff1a2c
--- /dev/null
+++ b/pypi_llm/scripts/0_setup_pinecone.py
@@ -0,0 +1,32 @@
+import logging
+
+from dotenv import load_dotenv
+from pinecone import Pinecone, ServerlessSpec
+
+from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+    """
+    This script sets up a Pinecone index for storing embeddings.
+
+    It loads the environment variables from a .env file, creates a Pinecone client,
+    and creates an index with the specified name, dimension, metric, and serverless specification.
+    """
+
+    load_dotenv()
+    config = Config()
+
+    logging.info("Connecting to Pinecone..")
+    pc = Pinecone(api_key=config.PINECONE_TOKEN)
+
+    logging.info("Creating Pinecone index..")
+    pc.create_index(
+        name=config.PINECONE_INDEX_NAME,
+        dimension=config.EMBEDDINGS_DIMENSION,
+        metric="dotproduct",
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+    )
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/1_download_dataset.py b/pypi_llm/scripts/1_download_dataset.py
new file mode 100644
index 0000000..3d4250e
--- /dev/null
+++ b/pypi_llm/scripts/1_download_dataset.py
@@ -0,0 +1,22 @@
+import logging
+
+import gdown
+from dotenv import load_dotenv
+
+from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+    """
+    Downloads the dataset from a Google Drive link using the gdown library.
+    """
+    load_dotenv()
+    config = Config()
+
+    logging.info("Downloading raw dataset from Google Drive...")
+    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
+    output = str(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
+    gdown.download(url, output, quiet=False)
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/2_process_dataset.py b/pypi_llm/scripts/2_process_dataset.py
new file mode 100644
index 0000000..cb35337
--- /dev/null
+++ b/pypi_llm/scripts/2_process_dataset.py
@@ -0,0 +1,31 @@
+import logging
+
+import polars as pl
+from dotenv import load_dotenv
+
+from pypi_llm.config import Config
+from pypi_llm.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
+from pypi_llm.data.reader import DataReader
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+    """
+    This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
+    """
+
+    load_dotenv()
+    config = Config()
+
+    logging.info("Reading the raw dataset...")
+    df = DataReader(config.DATA_DIR / config.RAW_DATASET_CSV_NAME).read()
+
+    logging.info("Cleaning the descriptions...")
+    df = DescriptionCleaner().clean(df, "description", "description_cleaned")
+    df = df.filter(~pl.col("description_cleaned").is_null())
+    df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
+
+    logging.info("Storing the processed dataset...")
+    df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/upsert_data.py b/pypi_llm/scripts/3_upsert_data.py
similarity index 65%
rename from pypi_llm/scripts/upsert_data.py
rename to pypi_llm/scripts/3_upsert_data.py
index e7b7e43..9206edd 100644
--- a/pypi_llm/scripts/upsert_data.py
+++ b/pypi_llm/scripts/3_upsert_data.py
@@ -1,24 +1,26 @@
+import logging
+
 import polars as pl
 from dotenv import load_dotenv
 from sentence_transformers import SentenceTransformer
 
 from pypi_llm.config import Config
-from pypi_llm.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_llm.data.reader import DataReader
+from pypi_llm.utils.logging import setup_logging
 from pypi_llm.vector_database import VectorDatabaseInterface
 
+setup_logging()
+
 if __name__ == "__main__":
+    """
+    Upserts data from a processed dataset CSV into a vector database.
+    """
     load_dotenv()
     config = Config()
 
-    df = DataReader(config.DATA_DIR).read()
-
-    df = DescriptionCleaner().clean(df, "description", "description_cleaned")
-    df = df.filter(~pl.col("description_cleaned").is_null())
-    df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
-
-    df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+    logging.info("Reading the processed dataset...")
+    df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
 
+    logging.info("Connecting to the vector database..")
     vector_database_interface = VectorDatabaseInterface(
         pinecone_token=config.PINECONE_TOKEN,
         pinecone_index_name=config.PINECONE_INDEX_NAME,
@@ -26,7 +28,9 @@
         pinecone_namespace=config.PINECONE_NAMESPACE,
     )
 
+    logging.info("Upserting data into the vector database..")
     df = df.with_columns(
         summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned"))
     )
     vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/setup_pinecone.py b/pypi_llm/scripts/setup_pinecone.py
deleted file mode 100644
index 126499a..0000000
--- a/pypi_llm/scripts/setup_pinecone.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from dotenv import load_dotenv
-from pinecone import Pinecone, ServerlessSpec
-
-from pypi_llm.config import Config
-
-if __name__ == "__main__":
-    load_dotenv()
-    config = Config()
-
-    pc = Pinecone(api_key=config.PINECONE_TOKEN)
-
-    pc.create_index(
-        name=config.PINECONE_INDEX_NAME,
-        dimension=config.EMBEDDINGS_DIMENSION,
-        metric="dotproduct",
-        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
-    )
diff --git a/pypi_llm/utils/logging.py b/pypi_llm/utils/logging.py
new file mode 100644
index 0000000..9ddc72c
--- /dev/null
+++ b/pypi_llm/utils/logging.py
@@ -0,0 +1,9 @@
+import logging
+
+
+def setup_logging():
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler()],
+    )
diff --git a/pypi_llm/vector_database/interface.py b/pypi_llm/vector_database/interface.py
index d42496b..ecba020 100644
--- a/pypi_llm/vector_database/interface.py
+++ b/pypi_llm/vector_database/interface.py
@@ -5,6 +5,17 @@
 
 
 class VectorDatabaseInterface:
+    """
+    A class that provides an interface for interacting with a vector database.
+
+    Args:
+        pinecone_token (str): The Pinecone API token.
+        pinecone_index_name (str): The name of the Pinecone index.
+        pinecone_namespace (str): The namespace for the Pinecone index.
+        embeddings_model (SentenceTransformer): The sentence transformer model for encoding text into embeddings.
+        batch_size (int, optional): The batch size for upserting data. Defaults to 250.
+    """
+
     def __init__(
         self,
         pinecone_token: str,
@@ -20,11 +31,29 @@ def __init__(
         self.pinecone_namespace = pinecone_namespace
 
     def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
+        """
+        Upserts the data from a Polars DataFrame into the vector database.
+
+        Args:
+            df (pl.DataFrame): The Polars DataFrame containing the data to be upserted.
+            key_column (str): The name of the column in the DataFrame containing the unique keys.
+            text_column (str): The name of the column in the DataFrame containing the text data.
+        """
         df_chunks = self._split_dataframe_in_batches(df)
         for chunk in tqdm(df_chunks, desc="Upserting batches", unit="batch"):
             self._upsert_chunk(chunk, key_column, text_column)
 
     def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame:
+        """
+        Finds similar vectors in the database for a given query.
+
+        Args:
+            query (str): The query string.
+            top_k (int, optional): The number of similar vectors to retrieve. Defaults to 25.
+
+        Returns:
+            pl.DataFrame: A Polars DataFrame containing the similar vectors and their similarity scores.
+        """
         embeddings = self.model.encode(query)
         matches = self.index.query(
             namespace=self.pinecone_namespace, vector=embeddings.tolist(), top_k=top_k, include_values=False
diff --git a/pyproject.toml b/pyproject.toml
index d967012..8319198 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ tqdm = "^4.66.4"
 fastapi = "^0.111.0"
 pydantic = "^2.7.4"
 uvicorn = "^0.30.1"
+gdown = "^5.2.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.0"
diff --git a/tests/test_foo.py b/tests/test_foo.py
deleted file mode 100644
index 2fef3fe..0000000
--- a/tests/test_foo.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from pypi_llm.foo import foo
-
-
-def test_foo():
-    assert foo("foo") == "foo"
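Taken together, the backend changes expose a `/search` endpoint that embeds the query, retrieves `top_k * 2` candidates from Pinecone, re-ranks them by a weighted combination of similarity and weekly downloads, and returns the best `top_k`. Below is a hedged smoke-test sketch of that endpoint, independent of the React UI: the field names come from `QueryModel`/`SearchResponse` in `pypi_llm/api/main.py` and the `Match` interface above, the base URL mirrors the hard-coded `http://localhost:8000` in `search.ts`, and the explicit `top_k` value is an assumption since the server-side default is not shown in the diff.

```ts
// Hypothetical smoke test for the /search endpoint; not part of the diff.
interface Match {
  name: string;
  similarity: number;
  weekly_downloads: number;
  summary: string;
}

async function searchPypi(query: string, topK = 30): Promise<Match[]> {
  const response = await fetch("http://localhost:8000/search", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    // QueryModel expects a query string; top_k is sent explicitly here.
    body: JSON.stringify({ query, top_k: topK }),
  });
  if (!response.ok) {
    throw new Error(`Search request failed with status ${response.status}`);
  }
  // SearchResponse wraps the results in a "matches" list.
  const data: { matches: Match[] } = await response.json();
  return data.matches;
}

// Example usage:
searchPypi("a package that creates plots and beautiful visualizations").then((matches) => {
  for (const m of matches) {
    console.log(`${m.name}  similarity=${m.similarity.toFixed(3)}  downloads=${m.weekly_downloads}`);
  }
});
```

Against an index provisioned by running scripts 0 through 3 in order, this should print the thirty highest-scoring packages for the example query.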