Skip to content

Commit

Permalink
add docker compose
Browse files Browse the repository at this point in the history
  • Loading branch information
florian committed Jun 15, 2024
1 parent e2cdf6e commit c226b28
Show file tree
Hide file tree
Showing 13 changed files with 174 additions and 86 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ COPY pypi_scout /code/pypi_scout/

ENV PYTHONPATH=/code

CMD [ "python", "pypi_scout/foo.py"]
CMD [ "/bin/bash" ]
51 changes: 8 additions & 43 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# pypi-scout
# ✨PyPi Scout

https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing

# setup

```sh
cp .env.template .env
```

Add your Pinecone API token to the newly created `.env` file.

```
docker build -t pypi-scout .
```
Expand All @@ -13,46 +19,5 @@ docker run --rm \
--env-file .env \
-v $(pwd)/data:/code/data \
pypi-scout \
python /code/pypi_scout/scripts/1_download_dataset.py
```

## total

```sql
WITH recent_downloads AS (
SELECT
project,
COUNT(*) AS download_count
FROM
`bigquery-public-data.pypi.file_downloads`
WHERE
DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY) AND CURRENT_DATE()
GROUP BY
project
HAVING
download_count >= 250
)
SELECT
rd.project AS name,
dm.description AS description,
dm.summary AS summary,
dm.version AS latest_version,
rd.download_count AS number_of_downloads
FROM
recent_downloads rd
JOIN
`bigquery-public-data.pypi.distribution_metadata` dm
ON
rd.project = dm.name
WHERE
dm.upload_time = (
SELECT
MAX(upload_time)
FROM
`bigquery-public-data.pypi.distribution_metadata` sub_dm
WHERE
sub_dm.name = dm.name
)
ORDER BY
rd.download_count DESC;
python /code/pypi_scout/scripts/setup.py
```
27 changes: 27 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Docker Compose stack for local development: a Next.js frontend on :3000
# and the FastAPI backend on :8000, each built from its own Dockerfile.

# NOTE(review): the top-level `version` key is deprecated and ignored by
# Compose v2 — safe to drop once legacy docker-compose v1 is no longer used.
version: "3.8"

services:
  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
    ports:
      - "3000:3000" # host:container — Next.js default port
    volumes:
      # Bind-mount the frontend source for live editing during development.
      - ./frontend:/app
    environment:
      - NODE_ENV=production

  backend:
    build:
      context: .
      dockerfile: Dockerfile
    # --reload restarts uvicorn when source files change (works together with
    # the bind mount of the repository root below).
    command: poetry run uvicorn pypi_scout.api.main:app --host 0.0.0.0 --port 8000 --reload
    ports:
      - "8000:8000"
    volumes:
      - .:/code
    environment:
      - PYTHONPATH=/code
    depends_on:
      # NOTE(review): nothing in the visible backend code calls the frontend;
      # this only orders container start-up — confirm it is intentional.
      - frontend
23 changes: 23 additions & 0 deletions frontend/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Build and run the Next.js frontend.
# Use the official Node.js image as the base image
FROM node:18-alpine

# Set the working directory inside the container
WORKDIR /app

# Copy package.json and package-lock.json files to the container first so
# the dependency layer is cached independently of source changes.
COPY package.json package-lock.json ./

# Install exact dependency versions from the lockfile.
# `npm ci` is preferred over `npm install` in container builds: it is
# reproducible (never mutates package-lock.json) and fails fast when the
# lockfile is out of sync with package.json.
RUN npm ci

# Copy the rest of the application code to the container
COPY . .

# Build the Next.js application
RUN npm run build

# Expose the port on which the application will run
EXPOSE 3000

# Start the Next.js application
CMD ["npm", "run", "start"]
4 changes: 2 additions & 2 deletions frontend/app/components/InfoBox.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ const InfoBox: React.FC<InfoBoxProps> = ({ infoBoxVisible }) => {
<h2 className="text-2xl font-bold mb-2">How does this work?</h2>
<p className="text-gray-700">
This application allows you to search for Python packages on PyPi using
natural language. An example query would be "a package that creates
plots and beautiful visualizations".
natural language. An example query would be &quot;a package that creates
plots and beautiful visualizations&quot;.
</p>
<br />
<p className="text-gray-700">
Expand Down
5 changes: 3 additions & 2 deletions frontend/app/utils/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ export const sortResults = (
direction: string,
): Match[] => {
return [...data].sort((a, b) => {
if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
// @ts-ignore
if (a[field] < b[field]) return direction === "asc" ? -1 : 1; // @ts-ignore
if (a[field] > b[field]) return direction === "asc" ? 1 : -1; // @ts-ignore
return 0;
});
};
36 changes: 36 additions & 0 deletions pypi_bigquery.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
-- Export one row per PyPI package with at least 250 downloads in the
-- trailing 28 days, joined with the metadata of its most recently
-- uploaded distribution (BigQuery public PyPI dataset).
WITH recent_downloads AS (
  -- Aggregate download counts per project over the last 28 days.
  SELECT
    project,
    COUNT(*) AS download_count
  FROM
    `bigquery-public-data.pypi.file_downloads`
  WHERE
    DATE(timestamp) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 28 DAY) AND CURRENT_DATE()
  GROUP BY
    project
  HAVING
    -- Drop the long tail of rarely-downloaded packages to keep the export small.
    download_count >= 250
)
SELECT
  rd.project AS name,
  dm.description AS description,
  dm.summary AS summary,
  dm.version AS latest_version,
  rd.download_count AS number_of_downloads
FROM
  recent_downloads rd
JOIN
  `bigquery-public-data.pypi.distribution_metadata` dm
ON
  rd.project = dm.name
WHERE
  -- Correlated subquery: keep only the metadata row with the latest
  -- upload_time for each package.
  -- NOTE(review): if several uploads share the same max upload_time
  -- (e.g. wheel + sdist of one release), each tied row is returned —
  -- confirm downstream code tolerates duplicate package names.
  dm.upload_time = (
    SELECT
      MAX(upload_time)
    FROM
      `bigquery-public-data.pypi.distribution_metadata` sub_dm
    WHERE
      sub_dm.name = dm.name
  )
ORDER BY
  rd.download_count DESC;
32 changes: 0 additions & 32 deletions pypi_scout/scripts/0_setup_pinecone.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from pypi_scout.config import Config
from pypi_scout.utils.logging import setup_logging

setup_logging()

if __name__ == "__main__":
def download_dataset():
"""
Downloads the dataset from a Google Drive link using the gdown library.
"""
Expand All @@ -20,3 +19,8 @@
output = str(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
gdown.download(url, output, quiet=False)
logging.info("Done!")


if __name__ == "__main__":
setup_logging()
download_dataset()
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
from pypi_scout.data.reader import DataReader
from pypi_scout.utils.logging import setup_logging

setup_logging()

if __name__ == "__main__":
def process_dataset():
"""
This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
"""
Expand All @@ -29,3 +28,8 @@
logging.info("Storing the processed dataset...")
df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
logging.info("Done!")


if __name__ == "__main__":
setup_logging()
process_dataset()
13 changes: 13 additions & 0 deletions pypi_scout/scripts/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pypi_scout.scripts.download_dataset import download_dataset
from pypi_scout.scripts.process_dataset import process_dataset
from pypi_scout.scripts.setup_pinecone import setup_pinecone
from pypi_scout.scripts.upsert_data import upsert_data
from pypi_scout.utils.logging import setup_logging


def main():
    """
    Run the full setup pipeline: create the Pinecone index, then download,
    process, and upsert the dataset into the vector database.
    """
    setup_pinecone()
    download_dataset()
    process_dataset()
    upsert_data()


if __name__ == "__main__":
    # Configure logging only when executed as a script, not on import —
    # consistent with the other scripts in pypi_scout/scripts/.
    setup_logging()
    main()
43 changes: 43 additions & 0 deletions pypi_scout/scripts/setup_pinecone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import logging

from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from pinecone.core.client.exceptions import PineconeApiException

from pypi_scout.config import Config
from pypi_scout.utils.logging import setup_logging


def setup_pinecone():
    """
    Create the Pinecone index used to store package embeddings.

    Loads environment variables from a `.env` file, connects to Pinecone
    with the token from `Config`, and creates a serverless index with the
    configured name and embedding dimension. If the index already exists
    (HTTP 409) a warning is logged and the function returns normally.
    """

    load_dotenv()  # make PINECONE_TOKEN etc. available to Config via the environment
    config = Config()

    logging.info("Connecting to Pinecone..")
    pc = Pinecone(api_key=config.PINECONE_TOKEN)

    try:
        logging.info("Creating Pinecone index..")
        pc.create_index(
            name=config.PINECONE_INDEX_NAME,
            # Must match the output size of the embedding model used elsewhere.
            dimension=config.EMBEDDINGS_DIMENSION,
            metric="dotproduct",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        logging.info("Pinecone index created successfully.")
    except PineconeApiException as e:
        if e.status == 409:
            # 409 Conflict: the index already exists — treated as a no-op.
            logging.warning(f"Pinecone index '{config.PINECONE_INDEX_NAME}' already exists.")
        else:
            # NOTE(review): the error is logged but NOT re-raised, so callers
            # (e.g. scripts/setup.py) continue even when index creation failed —
            # confirm this best-effort behavior is intended.
            logging.exception("An error occurred while creating the Pinecone index.")


if __name__ == "__main__":
    setup_logging()
    setup_pinecone()
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
from pypi_scout.utils.logging import setup_logging
from pypi_scout.vector_database import VectorDatabaseInterface

setup_logging()

if __name__ == "__main__":
def upsert_data():
"""
Upserts data from a processed dataset CSV into a vector database.
"""
Expand All @@ -34,3 +33,8 @@
)
vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
logging.info("Done!")


if __name__ == "__main__":
setup_logging()
upsert_data()

0 comments on commit c226b28

Please sign in to comment.