diff --git a/Dockerfile b/Dockerfile
index 335907a..28af359 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,4 +20,4 @@ COPY pypi_scout /code/pypi_scout/
 
 ENV PYTHONPATH=/code
 
-CMD [ "python", "pypi_scout/foo.py"]
+CMD [ "/bin/bash" ]
diff --git a/README.md b/README.md
index 1c41984..489c1b4 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,15 @@
-# pypi-scout
+# ✨ PyPI Scout
 
 https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing
 
 # setup
 
+```sh
+cp .env.template .env
+```
+
+Then add your Pinecone API token to the `.env` file.
+
 ```
 docker build -t pypi-scout .
 ```
@@ -13,46 +19,5 @@ docker run --rm \
   --env-file .env \
   -v $(pwd)/data:/code/data \
   pypi-scout \
-  python /code/pypi_scout/scripts/1_download_dataset.py
-```
-
-## total
-
-```sql
-WITH recent_downloads AS (
-  SELECT
-    project,
-    COUNT(*) AS download_count
-  FROM
-    `bigquery-public-data.pypi.file_downloads`
-  WHERE
-    DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY) AND CURRENT_DATE()
-  GROUP BY
-    project
-  HAVING
-    download_count >= 250
-)
-SELECT
-  rd.project AS name,
-  dm.description AS description,
-  dm.summary AS summary,
-  dm.version AS latest_version,
-  rd.download_count AS number_of_downloads
-FROM
-  recent_downloads rd
-JOIN
-  `bigquery-public-data.pypi.distribution_metadata` dm
-ON
-  rd.project = dm.name
-WHERE
-  dm.upload_time = (
-    SELECT
-      MAX(upload_time)
-    FROM
-      `bigquery-public-data.pypi.distribution_metadata` sub_dm
-    WHERE
-      sub_dm.name = dm.name
-  )
-ORDER BY
-  rd.download_count DESC;
+  python /code/pypi_scout/scripts/setup.py
 ```
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..9e8c28d
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,27 @@
+version: "3.8"
+
+services:
+  frontend:
+    build:
+      context: ./frontend
+      dockerfile: Dockerfile
+    ports:
+      - "3000:3000"
+    volumes:
+      - ./frontend:/app
+    environment:
+      - NODE_ENV=production
+
+  backend:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    command: poetry run uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 --reload
+    ports:
+      - "8000:8000"
+    volumes:
+      - .:/code
+    environment:
+      - PYTHONPATH=/code
+    depends_on:
+      - frontend
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
new file mode 100644
index 0000000..290190a
--- /dev/null
+++ b/frontend/Dockerfile
@@ -0,0 +1,23 @@
+# Use the official Node.js image as the base image
+FROM node:18-alpine
+
+# Set the working directory inside the container
+WORKDIR /app
+
+# Copy package.json and package-lock.json files to the container
+COPY package.json package-lock.json ./
+
+# Install dependencies
+RUN npm install
+
+# Copy the rest of the application code to the container
+COPY . .
+
+# Build the Next.js application
+RUN npm run build
+
+# Expose the port on which the application will run
+EXPOSE 3000
+
+# Start the Next.js application
+CMD ["npm", "run", "start"]
diff --git a/frontend/app/components/InfoBox.tsx b/frontend/app/components/InfoBox.tsx
index 23f757f..9765e92 100644
--- a/frontend/app/components/InfoBox.tsx
+++ b/frontend/app/components/InfoBox.tsx
@@ -12,8 +12,8 @@ const InfoBox: React.FC<InfoBoxProps> = ({ infoBoxVisible }) => {
       <div>
         <h2>How does this work?</h2>
         <p>
           This application allows you to search for Python packages on PyPi using
-          natural language. An example query would be "a package that creates
-          plots and beautiful visualizations".
+          natural language. An example query would be &quot;a package that creates
+          plots and beautiful visualizations&quot;.
         </p>
       </div>
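Taken together, the new `docker-compose.yml`, the frontend `Dockerfile`, and the revised README imply a simple local workflow. The sketch below is illustrative rather than part of the change itself: it assumes `.env.template` exists in the repository root with a valid Pinecone API token filled in afterwards, and it uses the standard `docker compose` CLI against the two services defined above.

```sh
# Create the env file, then add the Pinecone API token to it
# (assumes the template ships with the repo, as the README's cp implies).
cp .env.template .env

# One-shot data pipeline from the README: build the backend image, then
# create the Pinecone index, download, process, and upsert the dataset.
docker build -t pypi-scout .
docker run --rm \
  --env-file .env \
  -v $(pwd)/data:/code/data \
  pypi-scout \
  python /code/pypi_scout/scripts/setup.py

# Bring up the Next.js frontend (port 3000) and the uvicorn backend (port 8000).
docker compose up --build
```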
diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts
index b8abf25..56c652a 100644
--- a/frontend/app/utils/search.ts
+++ b/frontend/app/utils/search.ts
@@ -45,8 +45,9 @@ export const sortResults = (
   direction: string,
 ): Match[] => {
   return [...data].sort((a, b) => {
-    if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
-    if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
+    // @ts-ignore
+    if (a[field] < b[field]) return direction === "asc" ? -1 : 1; // @ts-ignore
+    if (a[field] > b[field]) return direction === "asc" ? 1 : -1; // @ts-ignore
     return 0;
   });
 };
diff --git a/pypi_bigquery.sql b/pypi_bigquery.sql
new file mode 100644
index 0000000..0dec4b7
--- /dev/null
+++ b/pypi_bigquery.sql
@@ -0,0 +1,36 @@
+WITH recent_downloads AS (
+  SELECT
+    project,
+    COUNT(*) AS download_count
+  FROM
+    `bigquery-public-data.pypi.file_downloads`
+  WHERE
+    DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY) AND CURRENT_DATE()
+  GROUP BY
+    project
+  HAVING
+    download_count >= 250
+)
+SELECT
+  rd.project AS name,
+  dm.description AS description,
+  dm.summary AS summary,
+  dm.version AS latest_version,
+  rd.download_count AS number_of_downloads
+FROM
+  recent_downloads rd
+JOIN
+  `bigquery-public-data.pypi.distribution_metadata` dm
+ON
+  rd.project = dm.name
+WHERE
+  dm.upload_time = (
+    SELECT
+      MAX(upload_time)
+    FROM
+      `bigquery-public-data.pypi.distribution_metadata` sub_dm
+    WHERE
+      sub_dm.name = dm.name
+  )
+ORDER BY
+  rd.download_count DESC;
diff --git a/pypi_scout/scripts/0_setup_pinecone.py b/pypi_scout/scripts/0_setup_pinecone.py
deleted file mode 100644
index 409fcf4..0000000
--- a/pypi_scout/scripts/0_setup_pinecone.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import logging
-
-from dotenv import load_dotenv
-from pinecone import Pinecone, ServerlessSpec
-
-from pypi_scout.config import Config
-from pypi_scout.utils.logging import setup_logging
-
-setup_logging()
-
-if __name__ == "__main__":
-    """
-    This script sets up a Pinecone index for storing embeddings.
-
-    It loads the environment variables from a .env file, creates a Pinecone client,
-    and creates an index with the specified name, dimension, metric, and serverless specification.
-    """
-
-    load_dotenv()
-    config = Config()
-
-    logging.info("Connection to Pinecone..")
-    pc = Pinecone(api_key=config.PINECONE_TOKEN)
-
-    logging.info("Creating Pinecone index..")
-    pc.create_index(
-        name=config.PINECONE_INDEX_NAME,
-        dimension=config.EMBEDDINGS_DIMENSION,
-        metric="dotproduct",
-        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
-    )
-    logging.info("Done!")
diff --git a/pypi_scout/scripts/1_download_dataset.py b/pypi_scout/scripts/download_dataset.py
similarity index 89%
rename from pypi_scout/scripts/1_download_dataset.py
rename to pypi_scout/scripts/download_dataset.py
index 1a461b8..8351780 100644
--- a/pypi_scout/scripts/1_download_dataset.py
+++ b/pypi_scout/scripts/download_dataset.py
@@ -6,9 +6,8 @@
 from pypi_scout.config import Config
 from pypi_scout.utils.logging import setup_logging
 
-setup_logging()
 
-if __name__ == "__main__":
+def download_dataset():
     """
     Downloads the dataset from a Google Drive link using the gdown library.
""" @@ -20,3 +19,8 @@ output = str(config.DATA_DIR / config.RAW_DATASET_CSV_NAME) gdown.download(url, output, quiet=False) logging.info("Done!") + + +if __name__ == "__main__": + setup_logging() + download_dataset() diff --git a/pypi_scout/scripts/2_process_dataset.py b/pypi_scout/scripts/process_dataset.py similarity index 93% rename from pypi_scout/scripts/2_process_dataset.py rename to pypi_scout/scripts/process_dataset.py index dd604f9..8edfb66 100644 --- a/pypi_scout/scripts/2_process_dataset.py +++ b/pypi_scout/scripts/process_dataset.py @@ -8,9 +8,8 @@ from pypi_scout.data.reader import DataReader from pypi_scout.utils.logging import setup_logging -setup_logging() -if __name__ == "__main__": +def process_dataset(): """ This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file. """ @@ -29,3 +28,8 @@ logging.info("Storing the processed dataset...") df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME) logging.info("Done!") + + +if __name__ == "__main__": + setup_logging() + process_dataset() diff --git a/pypi_scout/scripts/setup.py b/pypi_scout/scripts/setup.py new file mode 100644 index 0000000..3f952ed --- /dev/null +++ b/pypi_scout/scripts/setup.py @@ -0,0 +1,13 @@ +from pypi_scout.scripts.download_dataset import download_dataset +from pypi_scout.scripts.process_dataset import process_dataset +from pypi_scout.scripts.setup_pinecone import setup_pinecone +from pypi_scout.scripts.upsert_data import upsert_data +from pypi_scout.utils.logging import setup_logging + +setup_logging() + +if __name__ == "__main__": + setup_pinecone() + download_dataset() + process_dataset() + upsert_data() diff --git a/pypi_scout/scripts/setup_pinecone.py b/pypi_scout/scripts/setup_pinecone.py new file mode 100644 index 0000000..4ef883a --- /dev/null +++ b/pypi_scout/scripts/setup_pinecone.py @@ -0,0 +1,43 @@ +import logging + +from dotenv import load_dotenv +from pinecone import Pinecone, ServerlessSpec +from pinecone.core.client.exceptions import PineconeApiException + +from pypi_scout.config import Config +from pypi_scout.utils.logging import setup_logging + + +def setup_pinecone(): + """ + This script sets up a Pinecone index for storing embeddings. + + It loads the environment variables from a .env file, creates a Pinecone client, + and creates an index with the specified name, dimension, metric, and serverless specification. 
+ """ + + load_dotenv() + config = Config() + + logging.info("Connecting to Pinecone..") + pc = Pinecone(api_key=config.PINECONE_TOKEN) + + try: + logging.info("Creating Pinecone index..") + pc.create_index( + name=config.PINECONE_INDEX_NAME, + dimension=config.EMBEDDINGS_DIMENSION, + metric="dotproduct", + spec=ServerlessSpec(cloud="aws", region="us-east-1"), + ) + logging.info("Pinecone index created successfully.") + except PineconeApiException as e: + if e.status == 409: + logging.warning(f"Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.") + else: + logging.exception("An error occurred while creating the Pinecone index.") + + +if __name__ == "__main__": + setup_logging() + setup_pinecone() diff --git a/pypi_scout/scripts/3_upsert_data.py b/pypi_scout/scripts/upsert_data.py similarity index 95% rename from pypi_scout/scripts/3_upsert_data.py rename to pypi_scout/scripts/upsert_data.py index 613727b..796bb83 100644 --- a/pypi_scout/scripts/3_upsert_data.py +++ b/pypi_scout/scripts/upsert_data.py @@ -8,9 +8,8 @@ from pypi_scout.utils.logging import setup_logging from pypi_scout.vector_database import VectorDatabaseInterface -setup_logging() -if __name__ == "__main__": +def upsert_data(): """ Upserts data from a processed dataset CSV into a vector database. """ @@ -34,3 +33,8 @@ ) vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned") logging.info("Done!") + + +if __name__ == "__main__": + setup_logging() + upsert_data()