diff --git a/.gitignore b/.gitignore
index 1a607ed..bf23a33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,5 +166,4 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-/data
 .env
diff --git a/Dockerfile b/Dockerfile
index 1c340da..161ac51 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
 # syntax=docker/dockerfile:1
 
-FROM python:3.9-slim-buster
+FROM python:3.10-slim-buster
 
-ENV POETRY_VERSION=1.4 \
+ENV POETRY_VERSION=1.6 \
     POETRY_VIRTUALENVS_CREATE=false
 
 # Install poetry
@@ -18,4 +18,6 @@ RUN poetry install --no-interaction --no-ansi --no-root --no-dev
 # Copy Python code to the Docker image
 COPY pypi_llm /code/pypi_llm/
 
+ENV PYTHONPATH=/code
+
 CMD [ "python", "pypi_llm/foo.py"]
diff --git a/README.md b/README.md
index be632a1..56c95c2 100644
--- a/README.md
+++ b/README.md
@@ -1,55 +1,20 @@
 # pypi-llm
 
-[![Release](https://img.shields.io/github/v/release/fpgmaas/pypi-llm)](https://img.shields.io/github/v/release/fpgmaas/pypi-llm)
-[![Build status](https://img.shields.io/github/actions/workflow/status/fpgmaas/pypi-llm/main.yml?branch=main)](https://github.com/fpgmaas/pypi-llm/actions/workflows/main.yml?query=branch%3Amain)
-[![codecov](https://codecov.io/gh/fpgmaas/pypi-llm/branch/main/graph/badge.svg)](https://codecov.io/gh/fpgmaas/pypi-llm)
-[![Commit activity](https://img.shields.io/github/commit-activity/m/fpgmaas/pypi-llm)](https://img.shields.io/github/commit-activity/m/fpgmaas/pypi-llm)
-[![License](https://img.shields.io/github/license/fpgmaas/pypi-llm)](https://img.shields.io/github/license/fpgmaas/pypi-llm)
+[Raw dataset (Google Drive)](https://drive.google.com/file/d/1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq/view?usp=sharing)
 
-This is a template repository for Python projects that use Poetry for their dependency management.
+## Setup
 
-- **Github repository**: <https://github.com/fpgmaas/pypi-llm/>
-- **Documentation** <https://fpgmaas.github.io/pypi-llm/>
-
-## Getting started with your project
-
-First, create a repository on GitHub with the same name as this project, and then run the following commands:
-
-```bash
-git init -b main
-git add .
-git commit -m "init commit"
-git remote add origin git@github.com:fpgmaas/pypi-llm.git
-git push -u origin main
 ```
-
-Finally, install the environment and the pre-commit hooks with
-
-```bash
-make install
+docker build -t pypi-llm .
 ```
 
-You are now ready to start development on your project!
-The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
-
-To finalize the set-up for publishing to PyPi or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
-For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
-To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
-
-## Releasing a new version
-
-- Create an API Token on [Pypi](https://pypi.org/).
-- Add the API Token to your projects secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/fpgmaas/pypi-llm/settings/secrets/actions/new).
-- Create a [new release](https://github.com/fpgmaas/pypi-llm/releases/new) on Github.
-- Create a new tag in the form `*.*.*`.
-
-For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
-
----
-
-Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
-
+
+```
+docker run --rm \
+  --env-file .env \
+  -v $(pwd)/data:/code/data \
+  pypi-llm \
+  python /code/pypi_llm/scripts/1_download_dataset.py
+```
 
 ## total
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..5e7d273
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/frontend/app/components/InfoBox.tsx b/frontend/app/components/InfoBox.tsx
new file mode 100644
index 0000000..23f757f
--- /dev/null
+++ b/frontend/app/components/InfoBox.tsx
@@ -0,0 +1,30 @@
+import React from "react";
+
+interface InfoBoxProps {
+  infoBoxVisible: boolean;
+}
+
+const InfoBox: React.FC<InfoBoxProps> = ({ infoBoxVisible }) => {
+  if (!infoBoxVisible) return null;
+
+  return (
+    <div>
+      {/* Markup simplified: the original wrapper tags and class names are not preserved here. */}
+      <h2>How does this work?</h2>
+      <p>
+        This application allows you to search for Python packages on PyPi using
+        natural language. An example query would be "a package that creates
+        plots and beautiful visualizations".
+      </p>
+      <p>
+        Once you click search, your query will be matched against the summary
+        and the first part of the description of all PyPi packages with more
+        than 50 weekly downloads. The results are then scored based on their
+        similarity and their number of weekly downloads, and the thirty best
+        results are displayed in the table below.
+      </p>
+    </div>
+  );
+};
+
+export default InfoBox;
diff --git a/frontend/components/SearchResultsTable.tsx b/frontend/app/components/SearchResultsTable.tsx
similarity index 100%
rename from frontend/components/SearchResultsTable.tsx
rename to frontend/app/components/SearchResultsTable.tsx
diff --git a/frontend/app/page.tsx b/frontend/app/page.tsx
index f6cc69c..48b7210 100644
--- a/frontend/app/page.tsx
+++ b/frontend/app/page.tsx
@@ -1,53 +1,28 @@
 "use client";
 
 import { useState } from "react";
-import axios from "axios";
-import SearchResultsTable from "../components/SearchResultsTable";
+import { handleSearch, sortResults } from "./utils/search";
+import SearchResultsTable from "./components/SearchResultsTable";
+import InfoBox from "./components/InfoBox";
 import { ClipLoader } from "react-spinners";
 
-export default function Home() {
-  const [text, setText] = useState("");
-  const [results, setResults] = useState([]);
-  const [sortField, setSortField] = useState("weekly_downloads");
-  const [sortDirection, setSortDirection] = useState("desc");
-  const [loading, setLoading] = useState(false);
-  const [error, setError] = useState("");
-  const [infoBoxVisible, setInfoBoxVisible] = useState(false);
-
-  const handleSearch = async () => {
-    setLoading(true);
-    setError("");
-    try {
-      const response = await axios.post(
-        "http://localhost:8000/search",
-        {
-          query: text,
-        },
-        {
-          headers: {
-            "Content-Type": "application/json",
-          },
-        },
-      );
-      const fetchedResults = response.data.matches;
-      setResults(sortResults(fetchedResults, sortField, sortDirection));
-    } catch (error) {
-      setError("Error fetching search results.");
-      console.error("Error fetching search results:", error);
-    } finally {
-      setLoading(false);
-    }
-  };
+interface Match {
+  name: string;
+  similarity: number;
+  weekly_downloads: number;
+  summary: string;
+}
 
-  const sortResults = (data, field, direction) => {
-    return [...data].sort((a, b) => {
-      if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
-      if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
-      return 0;
-    });
-  };
+export default function Home() {
+  const [text, setText] = useState("");
+  const [results, setResults] = useState<Match[]>([]);
+  const [sortField, setSortField] = useState("similarity");
+  const [sortDirection, setSortDirection] = useState("desc");
+  const [loading, setLoading] = useState(false);
+  const [error, setError] = useState("");
+  const [infoBoxVisible, setInfoBoxVisible] = useState(false);
 
-  const handleSort = (field) => {
+  const handleSort = (field: string) => {
     const direction =
       sortField === field && sortDirection === "asc" ? "desc" : "asc";
     setSortField(field);
@@ -72,7 +47,16 @@ export default function Home() {
         >
@@ -91,20 +75,7 @@ export default function Home() {
-        {infoBoxVisible && (
-          <div>
-            <h2>How does this work?</h2>
-            <p>
-              This application allows you to search for Python packages on PyPi
-              using natural language. So an example query would be "a package that
-              creates plots and beautiful visualizations". Once you click search,
-              your query will be matched against the summary and the first part of
-              the description of all PyPi packages with more than 50 weekly
-              downloads, and the 50 most similar results will be displayed in a
-              table below.
-            </p>
-          </div>
-        )}
+      {results.length > 0 && (
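The hunk that actually re-wires `page.tsx` to these pieces (`@@ -72,7 +47,16 @@`) did not survive intact above, so the call site is not visible. What follows is a minimal sketch, not the project's real markup, of how the extracted `handleSearch` helper and the new `InfoBox` component are presumably hooked up: the helper's parameter list and the `infoBoxVisible` prop come from the diff, while the component name, element structure, and the hard-coded sort settings are assumptions.

```tsx
"use client";

import { useState } from "react";
import { handleSearch } from "./utils/search";
import InfoBox from "./components/InfoBox";

// Hypothetical example component, not part of the diff.
export default function SearchExample() {
  const [text, setText] = useState("");
  const [results, setResults] = useState<any[]>([]);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState("");
  const [infoBoxVisible, setInfoBoxVisible] = useState(false);

  // handleSearch takes the query plus the current sort settings and the three
  // state setters, exactly as declared in frontend/app/utils/search.ts.
  const onSearch = () =>
    handleSearch(text, "similarity", "desc", setResults, setLoading, setError);

  return (
    <div>
      <input value={text} onChange={(e) => setText(e.target.value)} />
      <button onClick={onSearch} disabled={loading}>
        Search
      </button>
      <button onClick={() => setInfoBoxVisible(!infoBoxVisible)}>
        How does this work?
      </button>
      <InfoBox infoBoxVisible={infoBoxVisible} />
      {error && <p>{error}</p>}
      <pre>{JSON.stringify(results, null, 2)}</pre>
    </div>
  );
}
```

Passing the state setters into `handleSearch` keeps the helper itself free of component state, which is what makes the extraction into `app/utils/search.ts` possible.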
diff --git a/frontend/app/utils/search.ts b/frontend/app/utils/search.ts
new file mode 100644
index 0000000..b8abf25
--- /dev/null
+++ b/frontend/app/utils/search.ts
@@ -0,0 +1,52 @@
+import axios from "axios";
+
+interface Match {
+  name: string;
+  similarity: number;
+  weekly_downloads: number;
+  summary: string;
+}
+
+export const handleSearch = async (
+  query: string,
+  sortField: string,
+  sortDirection: string,
+  setResults: React.Dispatch<React.SetStateAction<Match[]>>,
+  setLoading: React.Dispatch<React.SetStateAction<boolean>>,
+  setError: React.Dispatch<React.SetStateAction<string>>,
+) => {
+  setLoading(true);
+  setError("");
+  try {
+    const response = await axios.post(
+      "http://localhost:8000/search",
+      {
+        query: query,
+      },
+      {
+        headers: {
+          "Content-Type": "application/json",
+        },
+      },
+    );
+    const fetchedResults: Match[] = response.data.matches;
+    setResults(sortResults(fetchedResults, sortField, sortDirection));
+  } catch (error) {
+    setError("Error fetching search results.");
+    console.error("Error fetching search results:", error);
+  } finally {
+    setLoading(false);
+  }
+};
+
+export const sortResults = (
+  data: Match[],
+  field: string,
+  direction: string,
+): Match[] => {
+  return [...data].sort((a, b) => {
+    if (a[field] < b[field]) return direction === "asc" ? -1 : 1;
+    if (a[field] > b[field]) return direction === "asc" ? 1 : -1;
+    return 0;
+  });
+};
diff --git a/poetry.lock b/poetry.lock
index ca0f5a5..f5845a9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -841,6 +841,26 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe,
 test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"]
 tqdm = ["tqdm"]
 
+[[package]]
+name = "gdown"
+version = "5.2.0"
+description = "Google Drive Public File/Folder Downloader"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "gdown-5.2.0-py3-none-any.whl", hash = "sha256:33083832d82b1101bdd0e9df3edd0fbc0e1c5f14c9d8c38d2a35bf1683b526d6"},
+    {file = "gdown-5.2.0.tar.gz", hash = "sha256:2145165062d85520a3cd98b356c9ed522c5e7984d408535409fd46f94defc787"},
+]
+
+[package.dependencies]
+beautifulsoup4 = "*"
+filelock = "*"
+requests = {version = "*", extras = ["socks"]}
+tqdm = "*"
+
+[package.extras]
+test = ["build", "mypy", "pytest", "pytest-xdist", "ruff", "twine", "types-requests", "types-setuptools"]
+
 [[package]]
 name = "ghp-import"
 version = "2.1.0"
@@ -2992,6 +3012,18 @@ tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
 docs = ["furo (>=2023.8.19)", "sphinx (<7.2)", "sphinx-autodoc-typehints (>=1.24)"]
 testing = ["covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "setuptools (>=68.1.2)", "wheel (>=0.41.2)"]
 
+[[package]]
+name = "pysocks"
+version = "1.7.1"
+description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information."
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+    {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"},
+    {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"},
+    {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
+]
+
 [[package]]
 name = "pytest"
 version = "7.4.4"
@@ -3426,6 +3458,7 @@ files = [
 certifi = ">=2017.4.17"
 charset-normalizer = ">=2,<4"
 idna = ">=2.5,<4"
+PySocks = {version = ">=1.5.6,<1.5.7 || >1.5.7", optional = true, markers = "extra == \"socks\""}
 urllib3 = ">=1.21.1,<3"
 
 [package.extras]
@@ -4902,4 +4935,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8,<4.0"
-content-hash = "05b453b7e4b34fce7a4a0023f82b194a23dc13b35b57e02218e80f0417694913"
+content-hash = "3b9517bb553ec909b8f2d6cb96baca236709a5964b5e5f05208b125591ed9270"
diff --git a/pypi_llm/api/main.py b/pypi_llm/api/main.py
index d5abfc5..1aa49a1 100644
--- a/pypi_llm/api/main.py
+++ b/pypi_llm/api/main.py
@@ -1,3 +1,5 @@
+import logging
+
 import polars as pl
 from dotenv import load_dotenv
 from fastapi import FastAPI
@@ -6,9 +8,12 @@ from sentence_transformers import SentenceTransformer
 
 from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
 from pypi_llm.utils.score_calculator import calculate_score
 from pypi_llm.vector_database import VectorDatabaseInterface
 
+setup_logging()
+
 app = FastAPI()
 
 load_dotenv()
@@ -55,12 +60,20 @@ class SearchResponse(BaseModel):
 
 @app.post("/search/", response_model=SearchResponse)
 async def search(query: QueryModel):
+    """
+    Search for the packages whose summary and description have the highest similarity to the query.
+    We take the top_k * 2 most similar packages, and then calculate a weighted score based on the similarity and weekly downloads.
+    The top_k packages with the highest score are returned.
+    """
+
+    logging.info(f"Searching for similar projects. Query: '{query.query}'")
     df_matches = vector_database_interface.find_similar(query.query, top_k=query.top_k * 2)
     df_matches = df_matches.join(df, how="left", on="name")
+    logging.info("Found similar projects. Calculating the weighted scores and filtering...")
     df_matches = calculate_score(df_matches)
     df_matches = df_matches.sort("score", descending=True)
     df_matches = df_matches.head(query.top_k)
 
-    print("sending")
+    logging.info("Returning the results...")
 
     return SearchResponse(matches=df_matches.to_dicts())
diff --git a/pypi_llm/config.py b/pypi_llm/config.py
index afe4f68..88c69b4 100644
--- a/pypi_llm/config.py
+++ b/pypi_llm/config.py
@@ -5,14 +5,17 @@
 
 @dataclass
 class Config:
-    DATA_DIR: Path = Path("data")
     PINECONE_INDEX_NAME = "pypi"
     PINECONE_NAMESPACE = "ns1"
     PINECONE_TOKEN: str = field(default_factory=lambda: os.getenv("PINECONE_TOKEN"))
+
     EMBEDDINGS_MODEL_NAME = "all-mpnet-base-v2"
     EMBEDDINGS_DIMENSION = 768
-    PROCESSED_DATASET_CSV_NAME = "dataset.csv"
+
+    DATA_DIR: Path = Path("data")
+    RAW_DATASET_CSV_NAME = "raw_dataset.csv"
+    PROCESSED_DATASET_CSV_NAME = "processed_dataset.csv"
+    GOOGLE_FILE_ID = "1huR7-VD3AieBRCcQyRX9MWbPLMb_czjq"
 
     def __post_init__(self):
         if not self.PINECONE_TOKEN:
diff --git a/pypi_llm/data/description_cleaner.py b/pypi_llm/data/description_cleaner.py
index 3b1b077..3ef2f1b 100644
--- a/pypi_llm/data/description_cleaner.py
+++ b/pypi_llm/data/description_cleaner.py
@@ -9,11 +9,36 @@
 
 @dataclass
 class DescriptionCleaner:
+    """
+    A class that provides methods to clean PyPi package descriptions in a DataFrame column.
+    """
+
     def clean(self, df: pl.DataFrame, input_col: str, output_col: str) -> pl.DataFrame:
+        """
+        Cleans the text in the specified DataFrame column and returns the modified DataFrame.
+
+        Args:
+            df (pl.DataFrame): The DataFrame containing the text column to be cleaned.
+            input_col (str): The name of the input column containing the text to be cleaned.
+            output_col (str): The name of the output column to store the cleaned text.
+
+        Returns:
+            pl.DataFrame: The modified DataFrame with the cleaned text.
+        """
         df = df.with_columns(pl.col(input_col).apply(self._clean_text).alias(output_col))
         return df
 
     def _clean_text(self, text: str) -> str:
+        """
+        Cleans the given text by removing HTML tags, markdown image links, markdown badges,
+        markdown links, URLs, special markdown characters, markdown headers, and extra whitespaces.
+
+        Args:
+            text (str): The text to be cleaned.
+
+        Returns:
+            str: The cleaned text.
+        """
         try:
             text = self._remove_html_tags(text)
             text = self._remove_markdown_image_links(text)
diff --git a/pypi_llm/data/reader.py b/pypi_llm/data/reader.py
index 33b94f8..d1b6847 100644
--- a/pypi_llm/data/reader.py
+++ b/pypi_llm/data/reader.py
@@ -6,10 +6,21 @@
 
 @dataclass
 class DataReader:
-    data_dir: Path
+    """
+    A class for reading and processing data from a raw PyPi dataset.
+    """
+
+    raw_dataset: Path
 
     def read(self):
-        df = pl.read_csv(self.data_dir / "pypi_dataset.csv")
+        """
+        Reads the raw dataset, performs data processing operations, and returns the processed dataframe.
+        The dataset should at least have the following columns: name, description, and number_of_downloads.
+
+        Returns:
+            DataFrame: The processed dataframe.
+        """
+        df = pl.read_csv(self.raw_dataset)
         df = df.with_columns(weekly_downloads=(pl.col("number_of_downloads") / 4).round().cast(pl.Int32))
         df = df.drop("number_of_downloads")
         df = df.unique(subset="name")
diff --git a/pypi_llm/scripts/0_setup_pinecone.py b/pypi_llm/scripts/0_setup_pinecone.py
new file mode 100644
index 0000000..3ff1a2c
--- /dev/null
+++ b/pypi_llm/scripts/0_setup_pinecone.py
@@ -0,0 +1,32 @@
+import logging
+
+from dotenv import load_dotenv
+from pinecone import Pinecone, ServerlessSpec
+
+from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+    """
+    This script sets up a Pinecone index for storing embeddings.
+
+    It loads the environment variables from a .env file, creates a Pinecone client,
+    and creates an index with the specified name, dimension, metric, and serverless specification.
+    """
+
+    load_dotenv()
+    config = Config()
+
+    logging.info("Connecting to Pinecone..")
+    pc = Pinecone(api_key=config.PINECONE_TOKEN)
+
+    logging.info("Creating Pinecone index..")
+    pc.create_index(
+        name=config.PINECONE_INDEX_NAME,
+        dimension=config.EMBEDDINGS_DIMENSION,
+        metric="dotproduct",
+        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
+    )
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/1_download_dataset.py b/pypi_llm/scripts/1_download_dataset.py
new file mode 100644
index 0000000..3d4250e
--- /dev/null
+++ b/pypi_llm/scripts/1_download_dataset.py
@@ -0,0 +1,22 @@
+import logging
+
+import gdown
+from dotenv import load_dotenv
+
+from pypi_llm.config import Config
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+    """
+    Downloads the dataset from a Google Drive link using the gdown library.
+    """
+    load_dotenv()
+    config = Config()
+
+    logging.info("Downloading raw dataset from Google Drive...")
+    url = f"https://drive.google.com/uc?id={config.GOOGLE_FILE_ID}"
+    output = str(config.DATA_DIR / config.RAW_DATASET_CSV_NAME)
+    gdown.download(url, output, quiet=False)
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/2_process_dataset.py b/pypi_llm/scripts/2_process_dataset.py
new file mode 100644
index 0000000..cb35337
--- /dev/null
+++ b/pypi_llm/scripts/2_process_dataset.py
@@ -0,0 +1,31 @@
+import logging
+
+import polars as pl
+from dotenv import load_dotenv
+
+from pypi_llm.config import Config
+from pypi_llm.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
+from pypi_llm.data.reader import DataReader
+from pypi_llm.utils.logging import setup_logging
+
+setup_logging()
+
+if __name__ == "__main__":
+    """
+    This script processes a dataset by cleaning the description column and saving the processed dataset as a CSV file.
+    """
+
+    load_dotenv()
+    config = Config()
+
+    logging.info("Reading the raw dataset...")
+    df = DataReader(config.DATA_DIR / config.RAW_DATASET_CSV_NAME).read()
+
+    logging.info("Cleaning the descriptions...")
+    df = DescriptionCleaner().clean(df, "description", "description_cleaned")
+    df = df.filter(~pl.col("description_cleaned").is_null())
+    df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
+
+    logging.info("Storing the processed dataset...")
+    df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/upsert_data.py b/pypi_llm/scripts/3_upsert_data.py
similarity index 65%
rename from pypi_llm/scripts/upsert_data.py
rename to pypi_llm/scripts/3_upsert_data.py
index e7b7e43..9206edd 100644
--- a/pypi_llm/scripts/upsert_data.py
+++ b/pypi_llm/scripts/3_upsert_data.py
@@ -1,24 +1,26 @@
+import logging
+
 import polars as pl
 from dotenv import load_dotenv
 from sentence_transformers import SentenceTransformer
 
 from pypi_llm.config import Config
-from pypi_llm.data.description_cleaner import CLEANING_FAILED, DescriptionCleaner
-from pypi_llm.data.reader import DataReader
+from pypi_llm.utils.logging import setup_logging
 from pypi_llm.vector_database import VectorDatabaseInterface
 
+setup_logging()
+
 if __name__ == "__main__":
+    """
+    Upserts data from a processed dataset CSV into a vector database.
+    """
     load_dotenv()
     config = Config()
 
-    df = DataReader(config.DATA_DIR).read()
-
-    df = DescriptionCleaner().clean(df, "description", "description_cleaned")
-    df = df.filter(~pl.col("description_cleaned").is_null())
-    df = df.filter(pl.col("description_cleaned") != CLEANING_FAILED)
-
-    df.write_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
+    logging.info("Reading the processed dataset...")
+    df = pl.read_csv(config.DATA_DIR / config.PROCESSED_DATASET_CSV_NAME)
 
+    logging.info("Connecting to the vector database..")
     vector_database_interface = VectorDatabaseInterface(
         pinecone_token=config.PINECONE_TOKEN,
         pinecone_index_name=config.PINECONE_INDEX_NAME,
@@ -26,7 +28,9 @@
         pinecone_namespace=config.PINECONE_NAMESPACE,
     )
 
+    logging.info("Upserting data into the vector database..")
     df = df.with_columns(
         summary_and_description_cleaned=pl.concat_str(pl.col("summary"), pl.lit(" - "), pl.col("description_cleaned"))
     )
     vector_database_interface.upsert_polars(df, key_column="name", text_column="summary_and_description_cleaned")
+    logging.info("Done!")
diff --git a/pypi_llm/scripts/setup_pinecone.py b/pypi_llm/scripts/setup_pinecone.py
deleted file mode 100644
index 126499a..0000000
--- a/pypi_llm/scripts/setup_pinecone.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from dotenv import load_dotenv
-from pinecone import Pinecone, ServerlessSpec
-
-from pypi_llm.config import Config
-
-if __name__ == "__main__":
-    load_dotenv()
-    config = Config()
-
-    pc = Pinecone(api_key=config.PINECONE_TOKEN)
-
-    pc.create_index(
-        name=config.PINECONE_INDEX_NAME,
-        dimension=config.EMBEDDINGS_DIMENSION,
-        metric="dotproduct",
-        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
-    )
diff --git a/pypi_llm/utils/logging.py b/pypi_llm/utils/logging.py
new file mode 100644
index 0000000..9ddc72c
--- /dev/null
+++ b/pypi_llm/utils/logging.py
@@ -0,0 +1,9 @@
+import logging
+
+
+def setup_logging():
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        handlers=[logging.StreamHandler()],
+    )
diff --git a/pypi_llm/vector_database/interface.py b/pypi_llm/vector_database/interface.py
index d42496b..ecba020 100644
--- a/pypi_llm/vector_database/interface.py
+++ b/pypi_llm/vector_database/interface.py
@@ -5,6 +5,17 @@
 
 
 class VectorDatabaseInterface:
+    """
+    A class that provides an interface for interacting with a vector database.
+
+    Args:
+        pinecone_token (str): The Pinecone API token.
+        pinecone_index_name (str): The name of the Pinecone index.
+        pinecone_namespace (str): The namespace for the Pinecone index.
+        embeddings_model (SentenceTransformer): The sentence transformer model for encoding text into embeddings.
+        batch_size (int, optional): The batch size for upserting data. Defaults to 250.
+    """
+
     def __init__(
         self,
         pinecone_token: str,
@@ -20,11 +31,29 @@ def __init__(
         self.pinecone_namespace = pinecone_namespace
 
     def upsert_polars(self, df: pl.DataFrame, key_column: str, text_column: str):
+        """
+        Upserts the data from a Polars DataFrame into the vector database.
+
+        Args:
+            df (pl.DataFrame): The Polars DataFrame containing the data to be upserted.
+            key_column (str): The name of the column in the DataFrame containing the unique keys.
+            text_column (str): The name of the column in the DataFrame containing the text data.
+        """
         df_chunks = self._split_dataframe_in_batches(df)
         for chunk in tqdm(df_chunks, desc="Upserting batches", unit="batch"):
             self._upsert_chunk(chunk, key_column, text_column)
 
     def find_similar(self, query: str, top_k: int = 25) -> pl.DataFrame:
+        """
+        Finds similar vectors in the database for a given query.
+
+        Args:
+            query (str): The query string.
+            top_k (int, optional): The number of similar vectors to retrieve. Defaults to 25.
+
+        Returns:
+            pl.DataFrame: A Polars DataFrame containing the similar vectors and their similarity scores.
+        """
         embeddings = self.model.encode(query)
         matches = self.index.query(
             namespace=self.pinecone_namespace, vector=embeddings.tolist(), top_k=top_k, include_values=False
diff --git a/pyproject.toml b/pyproject.toml
index d967012..8319198 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ tqdm = "^4.66.4"
 fastapi = "^0.111.0"
 pydantic = "^2.7.4"
 uvicorn = "^0.30.1"
+gdown = "^5.2.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.0"
diff --git a/tests/test_foo.py b/tests/test_foo.py
deleted file mode 100644
index 2fef3fe..0000000
--- a/tests/test_foo.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from pypi_llm.foo import foo
-
-
-def test_foo():
-    assert foo("foo") == "foo"
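Taken together, the backend changes expose a `/search` endpoint that embeds the query, retrieves `top_k * 2` candidates from Pinecone, re-ranks them by a weighted combination of similarity and weekly downloads, and returns the best `top_k`. Below is a hedged smoke-test sketch of that endpoint, independent of the React UI: the field names come from `QueryModel`/`SearchResponse` in `pypi_llm/api/main.py` and the `Match` interface above, the base URL mirrors the hard-coded `http://localhost:8000` in `search.ts`, and the explicit `top_k` value is an assumption since the server-side default is not shown in the diff.

```ts
// Hypothetical smoke test for the /search endpoint; not part of the diff.
interface Match {
  name: string;
  similarity: number;
  weekly_downloads: number;
  summary: string;
}

async function searchPypi(query: string, topK = 30): Promise<Match[]> {
  const response = await fetch("http://localhost:8000/search", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    // QueryModel expects a query string; top_k is sent explicitly here.
    body: JSON.stringify({ query, top_k: topK }),
  });
  if (!response.ok) {
    throw new Error(`Search request failed with status ${response.status}`);
  }
  // SearchResponse wraps the results in a "matches" list.
  const data: { matches: Match[] } = await response.json();
  return data.matches;
}

// Example usage:
searchPypi("a package that creates plots and beautiful visualizations").then((matches) => {
  for (const m of matches) {
    console.log(`${m.name}  similarity=${m.similarity.toFixed(3)}  downloads=${m.weekly_downloads}`);
  }
});
```

Against an index provisioned by running scripts 0 through 3 in order, this should print the thirty highest-scoring packages for the example query.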