[PR] Preparing 2.0 Release #39

Open

wants to merge 103 commits into base: main

Changes from all commits: 103 commits
b95aba4
docs: drafting doc changes, docker as main distribution channel
Kaszanas Nov 11, 2024
c6a62c1
docs: adjusted readibility
Kaszanas Nov 11, 2024
776de34
docs: adjusted the description in processed mapping copier
Kaszanas Nov 11, 2024
e7486f4
docs: added full package names in README
Kaszanas Nov 11, 2024
ef5a36d
docs: simplified docs, sc2egset using docker
Kaszanas Nov 11, 2024
d5e28cb
refactor: no random uuid, using file hash in flattener
Kaszanas Nov 11, 2024
3d03363
docs: fixing typo in PR template
Kaszanas Nov 11, 2024
b5b9c9b
refactor: multiprocessing off in sc2_replaypack_processor
Kaszanas Nov 12, 2024
b7c31d2
refactor: renamed sc2_replaypack_processor -> sc2egset_replaypack
Kaszanas Nov 12, 2024
a7c84c2
docs: added link to citation at the top
Kaszanas Nov 12, 2024
3fd124e
perf: downloading maps as a pre-process step
Kaszanas Nov 13, 2024
b172e40
docs: added more README documentation, added TOC
Kaszanas Nov 17, 2024
82282ca
docs: formatting CONTRIBUTING
Kaszanas Nov 17, 2024
4aa2092
refactor: capitalized "AS" in docker
Kaszanas Nov 17, 2024
ec5cbe8
docs: drafted script README files with Docker
Kaszanas Nov 17, 2024
cd01672
Merge pull request #41 from Kaszanas/40-script-docker-docs
Kaszanas Nov 17, 2024
7a968ce
docs: updated all CLI Usage for scripts
Kaszanas Nov 17, 2024
fd8ad2a
fix: fixed log level, fixing path initialization
Kaszanas Nov 17, 2024
8b10ac8
fix: fixing glob issues, testing directory flattener
Kaszanas Nov 18, 2024
be6d51e
docs: solving #42 and #43, refined documentation
Kaszanas Nov 18, 2024
7088375
docs: removed redundant information from README
Kaszanas Nov 18, 2024
02870fc
docs: added generic information in README, editing
Kaszanas Nov 18, 2024
2b633c3
perf: directory_flattener, hash from filepath, added tqdm
Kaszanas Nov 18, 2024
a9e2bd4
fix: converting paths with click, changed target name
Kaszanas Nov 18, 2024
f601edf
docs: fixed READMEs after review
Kaszanas Nov 18, 2024
302a8ef
build: bumped dependency versions
Kaszanas Nov 20, 2024
6e76dfe
Merge pull request #45 from Kaszanas/44-bump-dependency-versions
Kaszanas Nov 20, 2024
67bbba0
refactor: renamed dir_packager to directory_packager
Kaszanas Nov 20, 2024
02a669e
Merge pull request #47 from Kaszanas/46-dir-packager-full-name
Kaszanas Nov 20, 2024
fe4a914
fix: fixing paths in Dockerfile
Kaszanas Nov 20, 2024
f222ec1
Merge branch 'dev' of https://github.com/Kaszanas/SC2DatasetPreparato…
Kaszanas Nov 20, 2024
a93e82d
fix: mounting curdir as a dot
Kaszanas Nov 20, 2024
227f080
Merge pull request #49 from Kaszanas/48-current-directory-docker
Kaszanas Nov 20, 2024
c00b38e
test: added dotenv to set TEST_WORKSPACE
Kaszanas Nov 20, 2024
046ff31
refactor: refreshed ci installing poetry
Kaszanas Nov 20, 2024
af14698
build: bumped poetry version in Dockerfile
Kaszanas Nov 20, 2024
e1b1349
test: commented out test, file_renamer_test not ready
Kaszanas Nov 20, 2024
b500fd6
feat: added default flag values for golang
Kaszanas Nov 20, 2024
7e8b72c
Merge pull request #52 from Kaszanas/51-set-default-flags-go
Kaszanas Nov 20, 2024
28ee746
fix: fixing imports in sc2reset
Kaszanas Nov 20, 2024
c6e5c49
test: added extractor arguments in test
Kaszanas Nov 20, 2024
a3a31c7
fix: fixing opening and writing to file
Kaszanas Nov 20, 2024
9203288
feat: sc2infoextractorgo executable path in settings
Kaszanas Nov 20, 2024
6d7447d
fix: fixing return value, removed range loop
Kaszanas Nov 20, 2024
06d5513
build: adjusted dockerfiles, copying files separately
Kaszanas Nov 20, 2024
748f840
feat: test workspace in .env
Kaszanas Nov 20, 2024
74236e0
test: adjusted test target in make
Kaszanas Nov 20, 2024
fba1bf2
Merge pull request #54 from Kaszanas/53-run-tests-fix-commands
Kaszanas Nov 20, 2024
9ce6193
fix: fixing pre-commit in dev docker
Kaszanas Nov 20, 2024
e3ba2b2
ci: removing volume from docker-test-compose
Kaszanas Nov 20, 2024
e39f5c1
build: copying CONTRIBUTING to dev docker image
Kaszanas Nov 20, 2024
7f67860
ci: adjusted TEST_COMMAND, not writing logs
Kaszanas Nov 20, 2024
391fe50
build: copying scripts to top in docker images
Kaszanas Nov 20, 2024
1ae85c5
Merge pull request #56 from Kaszanas/55-docker-copy-scripts-top-dir
Kaszanas Nov 20, 2024
41caf7a
docs: added info on pre-commit and commitizen, #34
Kaszanas Nov 20, 2024
985c6c2
docs: added information on code standards, #34
Kaszanas Nov 20, 2024
e0d82da
docs: updated all README files for scripts
Kaszanas Nov 20, 2024
cff767f
refactor: changed the processing dir structure
Kaszanas Nov 20, 2024
76a5416
refactor: adjusted make targets for sc2egset, removed unused param
Kaszanas Nov 20, 2024
6bce3cb
ci: added docker releases
Kaszanas Nov 20, 2024
e8faa7d
Merge pull request #58 from Kaszanas/57-docker-release-on-branch-pushes
Kaszanas Nov 20, 2024
f6eb987
build: added maps needed for SC2InfoExtractorGo
Kaszanas Nov 20, 2024
79a29cd
Merge pull request #60 from Kaszanas/59-copy-maps-sc2infoextractorgo
Kaszanas Nov 20, 2024
7ed1f1a
refactor: using dev dockerfile in sc2reset_sc2egset process
Kaszanas Nov 20, 2024
4f117c6
docs: changed docs for a more concise read
Kaszanas Jan 3, 2025
623fb61
build: bumped ruff and commitizen versions
Kaszanas Jan 3, 2025
ce58589
build: ran poetry lock
Kaszanas Jan 3, 2025
e82d355
docs: refined documentation, added TODO
Kaszanas Jan 5, 2025
d257a0d
build: added variables in makefile, adjusted targets, added echo
Kaszanas Jan 5, 2025
d5a5393
docs: changed docs, new CLI text, renamed container
Kaszanas Jan 5, 2025
18a6abc
build: removed dockerfiles per script, using main dockerfile
Kaszanas Jan 5, 2025
4d7b41f
refactor: drafting refactor of sc2egset_replaypack_processor
Kaszanas Jan 5, 2025
828a356
feat: added processed_mapping_copier target to makefile
Kaszanas Jan 5, 2025
d4e3cb7
feat: draft functionality of sc2egset_replaypack... full pipeline
Kaszanas Jan 5, 2025
67ec3e0
feat: drafted utils/user_prompt
Kaszanas Jan 6, 2025
76be1bc
refactor: renamed user prompting function
Kaszanas Jan 6, 2025
78a8d00
refactor: applied user prompting in sc2egset_replaypack_processor
Kaszanas Jan 6, 2025
e6118de
feat(directory_flattener.py): added user_prompt feature
Kaszanas Jan 6, 2025
2969e07
refactor(user_prompt.py): added logging
Kaszanas Jan 6, 2025
308b772
feat(directory_packager.py): added user prompting
Kaszanas Jan 6, 2025
e3bf197
refactor: using glob instead of os.walk
Kaszanas Jan 6, 2025
cc6a65a
docs: changed CLI description
Kaszanas Jan 6, 2025
75c744e
refactor: renamed force to force_overwrite
Kaszanas Jan 6, 2025
8a40d05
feat: added force_overwrite flag to CLI
Kaszanas Jan 6, 2025
aa10695
feat(json_merger.py): added user prompting, and CLI flag
Kaszanas Jan 6, 2025
cf28564
refactor(processed_mapping_copier.py): using pathlib, refactored func…
Kaszanas Jan 6, 2025
2efea73
refactor: applied user prompting for every script
Kaszanas Jan 6, 2025
e99c242
Merge pull request #64 from Kaszanas/63-prompt-user-possible-overwrite
Kaszanas Jan 6, 2025
7b0ae21
ci: attempt at fixing GH Actions, new make target name
Kaszanas Jan 6, 2025
7b31022
ci: fixing next step in CI pipeline, new target name
Kaszanas Jan 6, 2025
bcb41db
test: fixing tests with new features, fixing assertions
Kaszanas Jan 7, 2025
e1a1a00
feat: drafted full SC2ReSet/SC2EGSet pipeline
Kaszanas Jan 8, 2025
bc9f7ca
refactor: added logging statements
Kaszanas Jan 8, 2025
0c9288f
refactor: removed old directory structure from processing
Kaszanas Jan 8, 2025
64458f7
fix: manually tested directory_packager, working version
Kaszanas Jan 8, 2025
157bd50
feat: (directory_packager.py) added tqdm progres bar
Kaszanas Jan 8, 2025
8719e88
refactor: command saved to a variable
Kaszanas Jan 8, 2025
579f345
build(makefile): added targets for seeding maps locally
Kaszanas Jan 9, 2025
14c8cf9
build(docker): changed location of the maps directory in docker
Kaszanas Jan 9, 2025
7356c1d
feat: ignoring maps directory
Kaszanas Jan 9, 2025
9569bc8
fix(directory_flattener.py): manually tested flattening directories
Kaszanas Jan 9, 2025
377d838
feat: separate sc2egset_pipeline and replaypack_processor
Kaszanas Jan 9, 2025
af66764
test: fixing tests after func args change
Kaszanas Jan 9, 2025
2 changes: 2 additions & 0 deletions .env.template
@@ -0,0 +1,2 @@
# To have imports resolve correctly this should be the path to the root of the project:
TEST_WORKSPACE=
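For illustration only, one way to populate this file locally might be the following; the `~/datasetpreparator` clone location is an assumed placeholder, not something specified in this PR:

```bash
# Sketch: create a local .env and point TEST_WORKSPACE at the repository root
# (the clone path below is an assumption).
cd ~/datasetpreparator
echo "TEST_WORKSPACE=$(pwd)" > .env
```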
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE.MD
@@ -2,7 +2,7 @@
## Description
<!--- Describe your changes in detail -->

## Related IssueS
## Related Issues
<!--- This project only accepts pull requests related to open issues -->
<!--- If suggesting a new feature or change, please discuss it in an issue first -->
<!--- If fixing a bug, there should be an issue describing it with steps to reproduce -->
15 changes: 9 additions & 6 deletions .github/workflows/ci.yml
@@ -1,26 +1,29 @@
name: continuous integration (ci)

on: [pull_request, workflow_dispatch]
on:
pull_request:
push:
branches:
- main
- dev
workflow_dispatch:

# To successfully find the files that are required for testing:
env:
TEST_WORKSPACE: ${{ github.workspace }}

jobs:

pre_commit:
# Set up operating system
runs-on: ubuntu-latest

# Define job steps
steps:

- name: Check-out repository
uses: actions/checkout@v4

- name: Build Dev Docker Image
run: |
make docker_build_dev
make docker_build_devcontainer

- name: Docker Run pre-commit on all files.
run: |
@@ -41,7 +44,7 @@ jobs:

- name: Build Dev Docker Image
run: |
make docker_build_dev PYTHON_VERSION=${{ matrix.python-version }}
make docker_build_devcontainer PYTHON_VERSION=${{ matrix.python-version }}

- name: Build Docker Image With Python ${{ matrix.python-version }}
run: |
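For reference, the build step in this workflow can presumably be reproduced locally with the make targets it calls; the `PYTHON_VERSION=3.11` value below is an illustrative choice, not one pinned by this PR:

```bash
# Sketch: run the same dev-image build that CI performs.
make docker_build_devcontainer                      # default Python version
make docker_build_devcontainer PYTHON_VERSION=3.11  # matrix-style pinned version
```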
47 changes: 47 additions & 0 deletions .github/workflows/docker_images.yml
@@ -0,0 +1,47 @@
name: Publish Docker Images

# This should run only after the tests from the CI pipeline have passed.
# On a rare occasion contributors can trigger this manually, and it should also
# run after a release has been published.
on:
workflow_run:
workflows: ["continuous integration (ci)"]
types:
- completed
push:
branches:
- main
- dev
workflow_dispatch:
release:
types: [published]

jobs:
push_to_registries:
name: Push Docker Image to Docker Hub
runs-on: ubuntu-latest
permissions:
packages: write
contents: read
steps:
- name: Check out Code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Log in to Docker Hub
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
- name: Extract Metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@8e5442c4ef9f78752691e2d8f8d19755c6f78e81
with:
images: |
kaszanas/datasetpreparator
- name: Build and Push Docker images
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0
with:
context: .
file: ./docker/Dockerfile
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
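Once this workflow has pushed an image, a quick sanity check of the published tags might look like the sketch below; the `latest` tag is an assumption based on the default behaviour of `docker/metadata-action`, not something configured explicitly here:

```bash
# Sketch: pull a published image and confirm when it was built.
docker pull kaszanas/datasetpreparator:latest
docker image inspect kaszanas/datasetpreparator:latest --format '{{ .Created }}'
```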
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,7 +1,8 @@
/.vscode
/venv*

/processing
processing/
maps/

*.SC2Replay
*.SC2Map
@@ -34,3 +35,5 @@ ruff_cache/

# PyCharm
/.idea

.env
32 changes: 22 additions & 10 deletions CONTRIBUTING.md
@@ -56,24 +56,36 @@ docker run -it -v .:/app datasetpreparator:devcontainer

### Local Development

Ready to contribute? Here's how to set up `datasetpreparator` for local development.
Ready to contribute? Here's how to set up `datasetpreparator` for local development. The code style standards that we use are defined in the `.pre-commit-config.yaml` file.

1. Download a copy of `datasetpreparator` locally.
2. Install `datasetpreparator` using `poetry`:

```console
poetry install
```
```console
poetry install
```

3. Install the pre-commit hooks:

```console
poetry run pre-commit install
```

3. Use `git` (or similar) to create a branch for local development and make your changes:
4. Use `git` (or similar) to create a branch for local development and make your changes:

```console
git checkout -b name-of-your-bugfix-or-feature
```
```console
git checkout -b name-of-your-bugfix-or-feature
```

5. When you're done making changes, check that your changes conform to any code formatting requirements and pass any tests.

4. When you're done making changes, check that your changes conform to any code formatting requirements and pass any tests.
6. Format your commit with `commitizen`:

```console
poetry run cz commit
```

5. Commit your changes and open a pull request.
7. Commit your changes (we are using commitizen to check commit messages) and open a pull request.
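Taken together, a typical local session could look like the following sketch; the branch name is a placeholder and the commands are the ones referenced in the steps above:

```bash
# Condensed sketch of the contributor workflow described above.
git clone https://github.com/Kaszanas/SC2DatasetPreparator.git
cd SC2DatasetPreparator
poetry install                         # install the project and its dependencies
poetry run pre-commit install          # install the git hooks
git checkout -b name-of-your-bugfix-or-feature
# ... make your changes ...
poetry run pre-commit run --all-files  # check formatting before committing
poetry run cz commit                   # commitizen-formatted commit message
```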

## Pull Request Guidelines

89 changes: 67 additions & 22 deletions README.md
@@ -2,53 +2,98 @@

# DatasetPreparator

Tools in this repository were used to create the **[SC2ReSet: StarCraft II Esport Replaypack Set](https://doi.org/10.5281/zenodo.5575796)**, and finally **[SC2EGSet: StarCraft II Esport Game State Dataset](https://doi.org/10.5281/zenodo.5503997)**.
This project contains various scripts that assist in the process of preparing datasets. For a broad overview of the tools, please refer to the **[Detailed Tools Description](#detailed-tools-description)**.

Tools in this repository were used to create the **[SC2ReSet: StarCraft II Esport Replaypack Set](https://doi.org/10.5281/zenodo.5575796)**, and finally **[SC2EGSet: StarCraft II Esport Game State Dataset](https://doi.org/10.5281/zenodo.5503997)**; for citation information, see **[Cite Us!](#cite-us)**.

## Installation

To install the current version of the toolset as separate CLI tools, run the following command:
> [!NOTE]
> To run this project, you need the following prerequisites installed on your system:
> - Docker
> - make

Our preferred way of distributing the toolset is through DockerHub. We use the Docker image to provide a fully reproducible environment for our scripts.

To pull the image from DockerHub, run the following command:

```bash
docker pull kaszanas/datasetpreparator:latest
```
pip install datasetpreparator[all]

If you wish to clone the repository and build the Docker image yourself, run the following command:

```bash
make docker_build
```

After that each of the scripts should be available to call from the command line directly.
After building the image, please refer to the **[Command Line Arguments Usage](#command-line-arguments-usage)** section for the usage of the scripts, and see the **[Detailed Tools Description](#detailed-tools-description)** for a full description of each script.


## Command Line Arguments Usage

When using Docker, you will have to pass the arguments through the `docker run` command and mount the input/output directory. Below is an example of how to run the `directory_flattener` script using Docker. For ease of use, we have prepared an example directory structure in the `processing` directory; the command below uses it to flatten the directory structure:

```bash
docker run \
-v "./processing:/app/processing" \
datasetpreparator:latest \
python3 directory_flattener.py \
--input_path /app/processing/directory_flattener/input \
--output_path /app/processing/directory_flattener/output
```
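To run the same script against your own replays, the host directory in the volume mount can be swapped out; the `./my_replays` directory below is a hypothetical example, not part of the repository layout:

```bash
# Sketch: flattening a user-supplied directory instead of the bundled example.
docker run \
  -v "./my_replays:/app/my_replays" \
  datasetpreparator:latest \
  python3 directory_flattener.py \
  --input_path /app/my_replays/input \
  --output_path /app/my_replays/output
```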

## Dataset Preparation Steps
## SC2EGSet Dataset Reproduction Steps

To reproduce our process of defining the dataset and to be able to compare your results with our work, we describe how to perform the processing below.
> [!NOTE]
> The instructions below are for reproducing the SC2EGSet dataset. If you wish to use the tools in this repository separately for your own dataset, please refer to the **[Detailed Tools Description](#detailed-tools-description)**.

### Using Docker

1. Build the docker image from: https://github.com/Kaszanas/SC2InfoExtractorGo
2. Run the commands as described in the ```makefile```, but first make sure that all of the script parameters are set according to your needs.
We provide a release image containing all of the scripts. To see the usage of these scripts, please refer to their respective ``README.md`` files, as described in [Detailed Tools Description](#detailed-tools-description).

### Using Python
The following steps were used to prepare the SC2EGSet dataset:
1. Build the docker image for the DatasetPreparator using the provided ```makefile``` command: ```make docker_build```. This will bundle all of the dependencies, such as the [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo).
2. Place the input replaypacks into the `./processing/directory_flattener/input` directory.
3. Run the command ```make sc2reset_sc2egset``` to process the replaypacks and create the dataset. The output will be placed in the `./processing/sc2egset_replaypack_processor/output` directory, as shown in the sketch below.
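The sketch below consolidates these three steps; the replaypack source path is a placeholder, and the `make` targets and `processing` paths are the ones referenced above:

```bash
# Sketch of the SC2EGSet reproduction steps described above.
make docker_build                               # build the DatasetPreparator image
cp -r /path/to/your/replaypacks/. \
      ./processing/directory_flattener/input/   # place the input replaypacks
make sc2reset_sc2egset                          # run the full processing pipeline
ls ./processing/sc2egset_replaypack_processor/output   # inspect the results
```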

0. Obtain replays to process. This can be a replaypack or your own replay folder.
1. Download latest version of [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo), or build it from source.
2. **Optional** If the replays that you have are held in nested directories it is best to use ```src/directory_flattener.py```. This will copy the directory and place all of the files in the top-level directory, where they can be further processed. In order to preserve the old directory structure, a .json file is created which maps each file's unique hash to its location in the old structure: ```{"replayUniqueHash": "whereItWasInOldStructure"}```. This step is required in order to properly use [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo), as it only lists the files immediately available at the top level of the input directory.
3. **Optional** Use the map downloader ```src/sc2_map_downloader.py``` to download maps that were used in the replays that you obtained. This is required for the next step.
4. **Optional** Use the [SC2MapLocaleExtractor](https://github.com/Kaszanas/SC2MapLocaleExtractor) to obtain the mapping of ```{"foreign_map_name": "english_map_name"}``` which is required for the [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo) to translate the map names in the output .json files.
5. Perform replaypack processing using ```src/sc2_replaypack_processor.py``` with the [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo) placed in PATH, or next to the script.
6. **Optional** Using the ```src/file_renamer.py```, rename the files that were generated in the previous step. This is not required and is done to increase the readability of the output directory structure.
7. Using the ```src/file_packager.py```, create .zip archives containing the datasets and the supplementary files. By finishing this stage, your dataset should be ready to upload.

#### Customization
### Detailed Tools Description

In order to specify different processing flags for https://github.com/Kaszanas/SC2InfoExtractorGo, please modify the ```sc2_replaypack_processor.py``` file directly.
Each of the scripts has its usage described in its respective `README.md` file; you can find the detailed description of the available tools below.

## Command Line Arguments Usage
#### CLI Usage; Generic scripts
1. [Directory Packager (dir_packager): README](src/datasetpreparator/dir_packager/README.md)
2. [Directory Flattener (directory_flattener): README](src/datasetpreparator/directory_flattener/README.md)
3. [File Renamer (file_renamer): README](src/datasetpreparator/file_renamer/README.md)
4. [JSON Merger (json_merger): README](src/datasetpreparator/json_merger/README.md)
5. [Processed Mapping Copier (processed_mapping_copier): README](src/datasetpreparator/processed_mapping_copier/README.md)

#### CLI Usage; StarCraft 2 Specific Scripts
1. [SC2 Map Downloader (sc2_map_downloader): README](src/datasetpreparator/sc2/sc2_map_downloader/README.md)
2. [SC2EGSet Replaypack Processor (sc2egset_replaypack_processor): README](src/datasetpreparator/sc2/sc2egset_replaypack_processor/README.md)
3. [SC2ReSet Replaypack Downloader (sc2reset_replaypack_downloader): README](src/datasetpreparator/sc2/sc2reset_replaypack_downloader/README.md)


<!-- ### Using Python

1. Obtain replays to process. This can be a replaypack or your own replay folder.
2. Download latest version of [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo), or build it from source.
3. **Optional** If the replays that you have are held in nested directories it is best to use ```src/directory_flattener.py```. This will copy the directory and place all of the files in the top-level directory, where they can be further processed. In order to preserve the old directory structure, a .json file is created which maps each file's unique hash to its location in the old structure: ```{"replayUniqueHash": "whereItWasInOldStructure"}```. This step is required in order to properly use [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo), as it only lists the files immediately available at the top level of the input directory.
4. **Optional** Use the map downloader ```src/sc2_map_downloader.py``` to download maps that were used in the replays that you obtained. This is required for the next step.
5. **Optional** Use the [SC2MapLocaleExtractor](https://github.com/Kaszanas/SC2MapLocaleExtractor) to obtain the mapping of ```{"foreign_map_name": "english_map_name"}``` which is required for the [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo) to translate the map names in the output .json files.
6. Perform replaypack processing using ```src/sc2_replaypack_processor.py``` with the [SC2InfoExtractorGo](https://github.com/Kaszanas/SC2InfoExtractorGo) placed in PATH, or next to the script.
7. **Optional** Using the ```src/file_renamer.py```, rename the files that were generated in the previous step. This is not required and is done to increase the readability of the output directory structure.
8. Using the ```src/file_packager.py```, create .zip archives containing the datasets and the supplementary files. By finishing this stage, your dataset should be ready to upload. -->

Each of the scripts has its usage described in its respective `README.md` file.

## Contributing and Reporting Issues

If you want to report a bug, request a feature, or open any other issue, please do so in the **[issue tracker](https://github.com/Kaszanas/SC2DatasetPreparator/issues/new/choose)**.

Please see **[CONTRIBUTING.md](https://github.com/Kaszanas/SC2DatasetPreparator/blob/main/CONTRIBUTING.md)** for detailed development instructions and contribution guidelines.

## Citing
## Cite Us!

### This Repository

1 change: 1 addition & 0 deletions ci/install_poetry.py
@@ -23,6 +23,7 @@

For full documentation, visit https://python-poetry.org/docs/#installation.
""" # noqa: E501

import sys


51 changes: 44 additions & 7 deletions docker/Dockerfile
@@ -1,31 +1,68 @@
# Built .exe replay parsing tool is required to run sc2_replaypack_processor
# https://github.com/Kaszanas/SC2InfoExtractorGo

ARG PYTHON_VERSION=3.11

FROM kaszanas/sc2infoextractorgo:latest as extractor
# Built .exe replay parsing tool is required to run sc2_replaypack_processor
# https://github.com/Kaszanas/SC2InfoExtractorGo
FROM kaszanas/sc2infoextractorgo:latest AS extractor

FROM python:${PYTHON_VERSION}-alpine
FROM python:${PYTHON_VERSION}-alpine AS build

WORKDIR /app

# Copying the replay parsing tool:
COPY --from=extractor /SC2InfoExtractorGo /SC2InfoExtractorGo
# sc2egset_replaypack_processor requires the .exe file to be in the same directory as the script:
COPY --from=extractor /app/SC2InfoExtractorGo /app/SC2InfoExtractorGo
COPY --from=extractor /app/maps/ /app/processing/maps/

# Ensure the executable has the right permissions
RUN chmod +x /app/SC2InfoExtractorGo

# Copy only what is required to install the project:
COPY pyproject.toml poetry.lock ci/install_poetry.py /app/

# Install poetry
# TODO: this is rather ugly, we are installing poetry into the release Docker build. Use multi-stage builds instead.
ENV POETRY_HOME=/opt/poetry
RUN python3 install_poetry.py --version 1.8.2 && \
RUN python3 install_poetry.py --version 1.8.4 && \
$POETRY_HOME/bin/poetry --version

# Install only dependencies without installing current project:
RUN $POETRY_HOME/bin/poetry config virtualenvs.create false && $POETRY_HOME/bin/poetry install --no-root
RUN $POETRY_HOME/bin/poetry \
config virtualenvs.create false \
&& $POETRY_HOME/bin/poetry install --no-root

# Copy entire repository contents
COPY . .

# Copy test files:
COPY /src/ /app/src/
COPY /tests/__init__.py /app/tests/__init__.py
COPY /tests/conftest.py /app/tests/conftest.py
COPY /tests/test_utils.py /app/tests/test_utils.py
COPY /tests/test_settings.py /app/tests/test_settings.py
COPY /tests/test_main.py /app/tests/test_main.py
COPY /tests/test_cases/ /app/tests/test_cases/

# Copy docs files:
COPY /docs/ /app/docs/
COPY mkdocs.yml /app/mkdocs.yml
COPY README.md /app/README.md
COPY CONTRIBUTING.md /app/CONTRIBUTING.md

# Bring the scripts to the top level.
# They import parts of the project, but as long as the project is installed
# in the same environment, they can run from anywhere once the environment
# is activated.
COPY /src/datasetpreparator/directory_flattener/directory_flattener.py \
/src/datasetpreparator/directory_packager/directory_packager.py \
/src/datasetpreparator/file_renamer/file_renamer.py \
/src/datasetpreparator/json_merger/json_merger.py \
/src/datasetpreparator/processed_mapping_copier/processed_mapping_copier.py \
/src/datasetpreparator/sc2/sc2_map_downloader/sc2_map_downloader.py \
/src/datasetpreparator/sc2/sc2egset_replaypack_processor/sc2egset_replaypack_processor.py \
/src/datasetpreparator/sc2/sc2reset_replaypack_downloader/sc2reset_replaypack_downloader.py \
/app/


# Install current project:
RUN $POETRY_HOME/bin/poetry install --all-extras
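For orientation, building and exercising the resulting image might look like the sketch below; the image tag and the `--help` invocation are illustrative assumptions, based on the scripts copied to `/app` above:

```bash
# Sketch: build the release image from this Dockerfile and call one of the
# top-level scripts (tag name is an assumed example).
docker build -f docker/Dockerfile -t datasetpreparator:latest .
docker run --rm datasetpreparator:latest python3 directory_flattener.py --help
```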