Skip to content

Commit

Permalink
add docker compose
Browse files Browse the repository at this point in the history
  • Loading branch information
florian committed Jun 15, 2024
1 parent e2cdf6e commit c226b28
Show file tree
Hide file tree
Showing 13 changed files with 174 additions and 86 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ COPY pypi_scout /code/pypi_scout/

ENV PYTHONPATH=/code

CMD [ "python", "pypi_scout/foo.py"]
CMD [ "/bin/bash" ]
51 changes: 8 additions & 43 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# pypi-scout
# ✨PyPi Scout

https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing

# setup

```sh
cp .env.template .env
```

Add your Pinecone API token to the newly created `.env` file.

```
docker build -t pypi-scout .
```
Expand All @@ -13,46 +19,5 @@ docker run --rm \
--env-file .env \
-v $(pwd)/data:/code/data \
pypi-scout \
python /code/pypi_scout/scripts/1_download_dataset.py
```

## total

```sql
WITH recent_downloads AS (
SELECT
project,
COUNT(*) AS download_count
FROM
`bigquery-public-data.pypi.file_downloads`
WHERE
DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY) AND CURRENT_DATE()
GROUP BY
project
HAVING
download_count >= 250
)
SELECT
rd.project AS name,
dm.description AS description,
dm.summary AS summary,
dm.version AS latest_version,
rd.download_count AS number_of_downloads
FROM
recent_downloads rd
JOIN
`bigquery-public-data.pypi.distribution_metadata` dm
ON
rd.project = dm.name
WHERE
dm.upload_time = (
SELECT
MAX(upload_time)
FROM
`bigquery-public-data.pypi.distribution_metadata` sub_dm
WHERE
sub_dm.name = dm.name
)
ORDER BY
rd.download_count DESC;
python /code/pypi_scout/scripts/setup.py
```
27 changes: 27 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Docker Compose stack for local development: a Next.js frontend on :3000
# and the FastAPI backend on :8000, each built from its own Dockerfile.

# NOTE(review): the top-level `version` key is deprecated and ignored by
# Compose v2 — safe to drop once legacy docker-compose v1 is no longer used.
version: "3.8"

services:
  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
    ports:
      - "3000:3000" # host:container — Next.js default port
    volumes:
      # Bind-mount the frontend source for live editing during development.
      - ./frontend:/app
    environment:
      - NODE_ENV=production

  backend:
    build:
      context: .
      dockerfile: Dockerfile
    # --reload restarts uvicorn when source files change (works together with
    # the bind mount of the repository root below).
    command: poetry run uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 --reload
    ports:
      - "8000:8000"
    volumes:
      - .:/code
    environment:
      - PYTHONPATH=/code
    depends_on:
      # NOTE(review): nothing in the visible backend code calls the frontend;
      # this only orders container start-up — confirm it is intentional.
      - frontend
23 changes: 23 additions & 0 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Build and run the Next.js frontend.
# Use the official Node.js image as the base image
FROM node:18-alpine

# Set the working directory inside the container
WORKDIR /app

# Copy package.json and package-lock.json files to the container first so
# the dependency layer is cached independently of source changes.
COPY package.json package-lock.json ./

# Install exact dependency versions from the lockfile.
# `npm ci` is preferred over `npm install` in container builds: it is
# reproducible (never mutates package-lock.json) and fails fast when the
# lockfile is out of sync with package.json.
RUN npm ci

# Copy the rest of the application code to the container
COPY . .

# Build the Next.js application
RUN npm run build

# Expose the port on which the application will run
EXPOSE 3000

# Start the Next.js application
CMD ["npm", "run", "start"]
4 changes: 2 additions & 2 deletions frontend/app/components/InfoBox.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ const InfoBox: React.FC<InfoBoxProps> = ({ infoBoxVisible }) => {
<h2 className="text-2xl font-bold mb-2">How does this work?</h2>
<p className="text-gray-700">
This application allows you to search for Python packages on PyPi using
natural language. An example query would be "a package that creates
plots and beautiful visualizations".
natural language. An example query would be &quot;a package that creates
plots and beautiful visualizations&quot;.
</p>
<br />
<p className="text-gray-700">
Expand Down
5 changes: 3 additions & 2 deletions frontend/app/utils/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ export const sortResults = (
direction: string,
): Match[] => {
return [...data].sort((a, b) => {
if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
// @ts-ignore
if (a[field] < b[field]) return direction === "asc" ? -1 : 1; // @ts-ignore
if (a[field] > b[field]) return direction === "asc" ? 1 : -1; // @ts-ignore
return 0;
});
};
36 changes: 36 additions & 0 deletions pypi_bigquery.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
-- Export one row per PyPI package with at least 250 downloads in the
-- trailing 28 days, joined with the metadata of its most recently
-- uploaded distribution (BigQuery public PyPI dataset).
WITH recent_downloads AS (
  -- Aggregate download counts per project over the last 28 days.
  SELECT
    project,
    COUNT(*) AS download_count
  FROM
    `bigquery-public-data.pypi.file_downloads`
  WHERE
    DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY) AND CURRENT_DATE()
  GROUP BY
    project
  HAVING
    -- Drop the long tail of rarely-downloaded packages to keep the export small.
    download_count >= 250
)
SELECT
  rd.project AS name,
  dm.description AS description,
  dm.summary AS summary,
  dm.version AS latest_version,
  rd.download_count AS number_of_downloads
FROM
  recent_downloads rd
JOIN
  `bigquery-public-data.pypi.distribution_metadata` dm
ON
  rd.project = dm.name
WHERE
  -- Correlated subquery: keep only the metadata row with the latest
  -- upload_time for each package.
  -- NOTE(review): if several uploads share the same max upload_time
  -- (e.g. wheel + sdist of one release), each tied row is returned —
  -- confirm downstream code tolerates duplicate package names.
  dm.upload_time = (
    SELECT
      MAX(upload_time)
    FROM
      `bigquery-public-data.pypi.distribution_metadata` sub_dm
    WHERE
      sub_dm.name = dm.name
  )
ORDER BY
  rd.download_count DESC;
32 changes: 0 additions & 32 deletions pypi_scout/scripts/0_setup_pinecone.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from pypi_scout.config import Config
from pypi_scout.utils.logging import setup_logging

setup_logging()

if __name__ == "__main__":
def download_dataset():
"""
Downloads the dataset from a Google Drive link using the gdown library.
"""
Expand All @@ -20,3 +19,8 @@
output = str(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
gdown.download(url, output, quiet=False)
logging.info("Done!")


if __name__ == "__main__":
setup_logging()
download_dataset()
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
from pypi_scout.data.reader import DataReader
from pypi_scout.utils.logging import setup_logging

setup_logging()

if __name__ == "__main__":
def process_dataset():
"""
This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
"""
Expand All @@ -29,3 +28,8 @@
logging.info("Storing the processed dataset...")
df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
logging.info("Done!")


if __name__ == "__main__":
setup_logging()
process_dataset()
13 changes: 13 additions & 0 deletions pypi_scout/scripts/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pypi_scout.scripts.download_dataset import download_dataset
from pypi_scout.scripts.process_dataset import process_dataset
from pypi_scout.scripts.setup_pinecone import setup_pinecone
from pypi_scout.scripts.upsert_data import upsert_data
from pypi_scout.utils.logging import setup_logging


def main():
    """
    Run the full setup pipeline: create the Pinecone index, then download,
    process, and upsert the dataset into the vector database.
    """
    setup_pinecone()
    download_dataset()
    process_dataset()
    upsert_data()


if __name__ == "__main__":
    # Configure logging only when executed as a script, not on import —
    # consistent with the other scripts in pypi_scout/scripts/.
    setup_logging()
    main()
43 changes: 43 additions & 0 deletions pypi_scout/scripts/setup_pinecone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import logging

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from pinecone.core.client.exceptions import PineconeApiException

from pypi_scout.config import Config
from pypi_scout.utils.logging import setup_logging


def setup_pinecone():
    """
    Create the Pinecone index used to store package embeddings.

    Loads environment variables from a `.env` file, connects to Pinecone
    with the token from `Config`, and creates a serverless index with the
    configured name and embedding dimension. If the index already exists
    (HTTP 409) a warning is logged and the function returns normally.
    """

    load_dotenv()  # make PINECONE_TOKEN etc. available to Config via the environment
    config = Config()

    logging.info("Connecting to Pinecone..")
    pc = Pinecone(api_key=config.PINECONE_TOKEN)

    try:
        logging.info("Creating Pinecone index..")
        pc.create_index(
            name=config.PINECONE_INDEX_NAME,
            # Must match the output size of the embedding model used elsewhere.
            dimension=config.EMBEDDINGS_DIMENSION,
            metric="dotproduct",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        logging.info("Pinecone index created successfully.")
    except PineconeApiException as e:
        if e.status == 409:
            # 409 Conflict: the index already exists — treated as a no-op.
            logging.warning(f"Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
        else:
            # NOTE(review): the error is logged but NOT re-raised, so callers
            # (e.g. scripts/setup.py) continue even when index creation failed —
            # confirm this best-effort behavior is intended.
            logging.exception("An error occurred while creating the Pinecone index.")


if __name__ == "__main__":
    setup_logging()
    setup_pinecone()
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
from pypi_scout.utils.logging import setup_logging
from pypi_scout.vector_database import VectorDatabaseInterface

setup_logging()

if __name__ == "__main__":
def upsert_data():
"""
Upserts data from a processed dataset CSV into a vector database.
"""
Expand All @@ -34,3 +33,8 @@
)
vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
logging.info("Done!")


if __name__ == "__main__":
setup_logging()
upsert_data()

0 comments on commit c226b28

Please sign in to comment.