Merge pull request #2145 from input-output-hk/jpraynaud/2123-retries-e2e-test

Feat: support retries in e2e tests in CI
input-output-hk · Nov 29, 2024 · 60db5ce · 60db5ce
2 parents 0ebf6bb + fcb728e
commit 60db5ce
Show file tree

Hide file tree

Showing 7 changed files with 202 additions and 60 deletions.
diff --git a/.github/workflows/backward-compatibility.yml b/.github/workflows/backward-compatibility.yml
@@ -119,18 +119,29 @@ jobs:
           mkdir artifacts
 
       - name: Run E2E tests
-        shell: bash
-        run: |
-          ./mithril-binaries/e2e/mithril-end-to-end -vvv \
-            --bin-directory ./mithril-binaries/e2e \
-            --work-directory=./artifacts \
-            --devnet-scripts-directory=./mithril-test-lab/mithril-devnet \
-            --cardano-node-version ${{ matrix.cardano_node_version }} \
-            --cardano-slot-length 0.25 \
-            --cardano-epoch-length 45.0 \
-            --signed-entity-types ${{ needs.prepare-env-variables.outputs.signed-entity-types }} \
-          && echo "SUCCESS=true" >> $GITHUB_ENV \
-          || (echo "SUCCESS=false" >> $GITHUB_ENV && exit 1)
+        uses: nick-fields/retry@v3 # Runs the command through a retry wrapper instead of a plain `run` step.
+        with:
+          shell: bash
+          max_attempts: 3 # Upper bound on the number of attempts.
+          retry_on_exit_code: 2 # 2 is the exit code mithril-end-to-end returns for a retryable error.
+          timeout_minutes: 10 # Timeout in minutes (presumably per attempt; confirm in the nick-fields/retry docs).
+          warning_on_retry: true
+          command: |
+            ./mithril-binaries/e2e/mithril-end-to-end -vvv \
+              --bin-directory ./mithril-binaries/e2e \
+              --work-directory=./artifacts \
+              --devnet-scripts-directory=./mithril-test-lab/mithril-devnet \
+              --cardano-node-version ${{ matrix.cardano_node_version }} \
+              --cardano-slot-length 0.25 \
+              --cardano-epoch-length 45.0 \
+              --signed-entity-types ${{ needs.prepare-env-variables.outputs.signed-entity-types }}
+            EXIT_CODE=$?
+            if [ $EXIT_CODE -eq 0 ]; then
+              echo "SUCCESS=true" >> $GITHUB_ENV
+            else
+              echo "SUCCESS=false" >> $GITHUB_ENV
+            fi
+            exit $EXIT_CODE
 
       - name: Define the JSON file name for the test result
         shell: bash

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -347,25 +347,34 @@ jobs:
           mkdir artifacts
 
       - name: Test
-        run: |
-          cat > ./mithril-end-to-end.sh << EOF
-          #!/bin/bash
-          set -x
-          ./mithril-end-to-end -vvv \\
-                  --bin-directory ./bin \\
-                  --work-directory=./artifacts \\
-                  --devnet-scripts-directory=./mithril-test-lab/mithril-devnet \\
-                  --mithril-era=${{ matrix.era }} \\
-                  --cardano-node-version ${{ matrix.cardano_node_version }} \\
-                  --cardano-hard-fork-latest-era-at-epoch ${{ matrix.hard_fork_latest_era_at_epoch }} ${{ matrix.extra_args }} \\
-          EOF
-          # If there is a next era, we need to specify it with '--mithril-next-era'
-          if [[ "${{ matrix.next_era }}" != "" ]]; then
-            echo "  --mithril-next-era=${{ matrix.next_era }}" >> ./mithril-end-to-end.sh
-          fi
-          chmod u+x ./mithril-end-to-end.sh
-          ./mithril-end-to-end.sh
-          rm ./mithril-end-to-end.sh
+        uses: nick-fields/retry@v3 # Runs the command through a retry wrapper instead of a plain `run` step.
+        with:
+          shell: bash
+          max_attempts: 3 # Upper bound on the number of attempts.
+          retry_on_exit_code: 2 # 2 is the exit code mithril-end-to-end returns for a retryable error.
+          timeout_minutes: 10 # Timeout in minutes (presumably per attempt; confirm in the nick-fields/retry docs).
+          warning_on_retry: true
+          command: |
+            cat > ./mithril-end-to-end.sh << EOF
+            #!/bin/bash
+            set -x
+            ./mithril-end-to-end -vvv \\
+                    --bin-directory ./bin \\
+                    --work-directory=./artifacts \\
+                    --devnet-scripts-directory=./mithril-test-lab/mithril-devnet \\
+                    --mithril-era=${{ matrix.era }} \\
+                    --cardano-node-version ${{ matrix.cardano_node_version }} \\
+                    --cardano-hard-fork-latest-era-at-epoch ${{ matrix.hard_fork_latest_era_at_epoch }} ${{ matrix.extra_args }} \\
+            EOF
+            # If there is a next era, we need to specify it with '--mithril-next-era'
+            if [[ "${{ matrix.next_era }}" != "" ]]; then
+              echo "  --mithril-next-era=${{ matrix.next_era }}" >> ./mithril-end-to-end.sh
+            fi
+            chmod u+x ./mithril-end-to-end.sh
+            ./mithril-end-to-end.sh
+            EXIT_CODE=$?
+            rm ./mithril-end-to-end.sh
+            exit $EXIT_CODE
 
       - name: Upload E2E Tests Artifacts
         if: ${{ failure() }}

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/mithril-test-lab/mithril-end-to-end/Cargo.toml b/mithril-test-lab/mithril-end-to-end/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "mithril-end-to-end"
-version = "0.4.49"
+version = "0.4.50"
 authors = { workspace = true }
 edition = { workspace = true }
 documentation = { workspace = true }

diff --git a/mithril-test-lab/mithril-end-to-end/src/devnet/mod.rs b/mithril-test-lab/mithril-end-to-end/src/devnet/mod.rs
@@ -1,3 +1,3 @@
 mod runner;
 
-pub use runner::{Devnet, DevnetBootstrapArgs, DevnetTopology, PoolNode};
+pub use runner::{Devnet, DevnetBootstrapArgs, DevnetTopology, PoolNode, RetryableDevnetError};
diff --git a/mithril-test-lab/mithril-end-to-end/src/devnet/runner.rs b/mithril-test-lab/mithril-end-to-end/src/devnet/runner.rs
@@ -6,8 +6,13 @@ use std::fs::{self, read_to_string, File};
 use std::io::Read;
 use std::path::{Path, PathBuf};
 use std::process::Stdio;
+use thiserror::Error;
 use tokio::process::Command;
 
+#[derive(Error, Debug, PartialEq, Eq)]
+#[error("Retryable devnet error: `{0}`")]
+pub struct RetryableDevnetError(pub String); // Marks a devnet script failure as retryable; the String is the failure message.
+
 #[derive(Debug, Clone, Default)]
 pub struct Devnet {
     artifacts_dir: PathBuf,
@@ -211,7 +216,9 @@ impl Devnet {
             .with_context(|| "Error while starting the devnet")?;
         match status.code() {
             Some(0) => Ok(()),
-            Some(code) => Err(anyhow!("Run devnet exited with status code: {code}")),
+            Some(code) => Err(anyhow!(RetryableDevnetError(format!(
+                "Run devnet exited with status code: {code}"
+            )))),
             None => Err(anyhow!("Run devnet terminated by signal")),
         }
     }
@@ -258,7 +265,9 @@ impl Devnet {
             .with_context(|| "Error while delegating stakes to the pools")?;
         match status.code() {
             Some(0) => Ok(()),
-            Some(code) => Err(anyhow!("Delegating stakes exited with status code: {code}")),
+            Some(code) => Err(anyhow!(RetryableDevnetError(format!(
+                "Delegating stakes exited with status code: {code}"
+            )))),
             None => Err(anyhow!("Delegating stakes terminated by signal")),
         }
     }
@@ -282,9 +291,9 @@ impl Devnet {
             .with_context(|| "Error while writing era marker on chain")?;
         match status.code() {
             Some(0) => Ok(()),
-            Some(code) => Err(anyhow!(
+            Some(code) => Err(anyhow!(RetryableDevnetError(format!(
                 "Write era marker on chain exited with status code: {code}"
-            )),
+            )))),
             None => Err(anyhow!("Write era marker on chain terminated by signal")),
         }
     }
@@ -308,9 +317,9 @@ impl Devnet {
             .with_context(|| "Error while to transferring funds on chain")?;
         match status.code() {
             Some(0) => Ok(()),
-            Some(code) => Err(anyhow!(
+            Some(code) => Err(anyhow!(RetryableDevnetError(format!(
                 "Transfer funds on chain exited with status code: {code}"
-            )),
+            )))),
             None => Err(anyhow!("Transfer funds on chain terminated by signal")),
         }
     }

diff --git a/mithril-test-lab/mithril-end-to-end/src/main.rs b/mithril-test-lab/mithril-end-to-end/src/main.rs
@@ -1,13 +1,15 @@
 use anyhow::{anyhow, Context};
 use clap::{CommandFactory, Parser, Subcommand};
 use slog::{Drain, Level, Logger};
-use slog_scope::{error, info, warn};
+use slog_scope::{error, info};
 use std::{
-    fs,
+    fmt, fs,
     path::{Path, PathBuf},
+    process::{ExitCode, Termination},
     sync::Arc,
     time::Duration,
 };
+use thiserror::Error;
 use tokio::{
     signal::unix::{signal, SignalKind},
     sync::Mutex,
@@ -17,7 +19,8 @@ use tokio::{
 use mithril_common::StdResult;
 use mithril_doc::GenerateDocCommands;
 use mithril_end_to_end::{
-    Devnet, DevnetBootstrapArgs, MithrilInfrastructure, MithrilInfrastructureConfig, RunOnly, Spec,
+    Devnet, DevnetBootstrapArgs, MithrilInfrastructure, MithrilInfrastructureConfig,
+    RetryableDevnetError, RunOnly, Spec,
 };
 
 /// Tests args
@@ -152,8 +155,16 @@ enum EndToEndCommands {
     GenerateDoc(GenerateDocCommands),
 }
 
-#[tokio::main]
-async fn main() -> StdResult<()> {
+fn main() -> AppResult { // Returns AppResult so that its Termination impl chooses the process exit code.
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap() // Panics if the Tokio runtime cannot be built.
+        .block_on(async { main_exec().await })
+        .into() // Converts the StdResult<()> from main_exec into an AppResult.
+}
+
+async fn main_exec() -> StdResult<()> {
     let args = Args::parse();
     let _guard = slog_scope::set_global_logger(build_logger(&args));
 
@@ -198,9 +209,69 @@ async fn main() -> StdResult<()> {
 
     app_stopper.stop().await;
     join_set.shutdown().await;
+
     res
 }
 
+#[derive(Debug)]
+enum AppResult { // Final outcome of an end-to-end run, as returned by `main`.
+    Success(), // The run returned Ok(()).
+    UnretryableError(anyhow::Error), // Any error that is neither a RetryableDevnetError nor a SignalError.
+    RetryableError(anyhow::Error), // The error is a RetryableDevnetError.
+    Cancelled(anyhow::Error), // The error is a SignalError (a signal was received).
+}
+
+impl AppResult {
+    fn exit_code(&self) -> ExitCode { // Maps the outcome to the exit code reported by the process.
+        match self {
+            AppResult::Success() => ExitCode::SUCCESS,
+            AppResult::UnretryableError(_) | AppResult::Cancelled(_) => ExitCode::FAILURE,
+            AppResult::RetryableError(_) => ExitCode::from(2), // Dedicated code, distinct from SUCCESS and FAILURE, for retryable errors.
+        }
+    }
+}
+
+impl fmt::Display for AppResult {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Writes the outcome label; wrapped errors are rendered with `{:?}`.
+        match self {
+            AppResult::Success() => write!(f, "Success"),
+            AppResult::UnretryableError(error) => write!(f, "Error(Unretryable): {error:?}"),
+            AppResult::RetryableError(error) => write!(f, "Error(Retryable): {error:?}"),
+            AppResult::Cancelled(error) => write!(f, "Cancelled: {error:?}"),
+        }
+    }
+}
+
+impl Termination for AppResult {
+    fn report(self) -> ExitCode { // Prints an outcome banner to stdout and returns the matching exit code.
+        let exit_code = self.exit_code();
+        println!(" ");
+        println!("{:-^100}", ""); // Separator: a line of 100 dashes.
+        println!("Mithril End to End test outcome:");
+        println!("{:-^100}", "");
+        println!("{self}"); // Uses the Display impl of AppResult.
+
+        exit_code
+    }
+}
+
+impl From<StdResult<()>> for AppResult {
+    fn from(result: StdResult<()>) -> Self { // Classifies the run result by the concrete type of its error.
+        match result {
+            Ok(()) => AppResult::Success(),
+            Err(error) => {
+                if error.is::<RetryableDevnetError>() {
+                    AppResult::RetryableError(error)
+                } else if error.is::<SignalError>() {
+                    AppResult::Cancelled(error)
+                } else { // Every other error type is treated as not retryable.
+                    AppResult::UnretryableError(error)
+                }
+            }
+        }
+    }
+}
+
 struct App {
     devnet: Arc<Mutex<Option<Devnet>>>,
     infrastructure: Arc<Mutex<Option<MithrilInfrastructure>>>,
@@ -338,31 +409,73 @@ fn create_workdir_if_not_exist_clean_otherwise(work_dir: &Path) {
     fs::create_dir(work_dir).expect("Work dir creation failure");
 }
 
+#[derive(Error, Debug, PartialEq, Eq)]
+#[error("Signal received: `{0}`")]
+pub struct SignalError(pub String); // Error produced by the signal-listening tasks; the String is the signal name.
+
 fn with_gracefull_shutdown(join_set: &mut JoinSet<StdResult<()>>) {
     join_set.spawn(async move {
         let mut sigterm = signal(SignalKind::terminate()).expect("Failed to create SIGTERM signal");
-        sigterm
-            .recv()
-            .await
-            .ok_or(anyhow!("Failed to receive SIGTERM"))
-            .inspect(|()| warn!("Received SIGTERM"))
+        sigterm.recv().await; // Waits on the SIGTERM listener; the returned Option is discarded.
+
+        Err(anyhow!(SignalError("SIGTERM".to_string()))) // The task always resolves to a SignalError.
     });
 
     join_set.spawn(async move {
         let mut sigterm = signal(SignalKind::interrupt()).expect("Failed to create SIGINT signal");
-        sigterm
-            .recv()
-            .await
-            .ok_or(anyhow!("Failed to receive SIGINT"))
-            .inspect(|()| warn!("Received SIGINT"))
+        sigterm.recv().await; // Despite its name, this variable is the SIGINT listener.
+
+        Err(anyhow!(SignalError("SIGINT".to_string()))) // The task always resolves to a SignalError.
     });
 
     join_set.spawn(async move {
         let mut sigterm = signal(SignalKind::quit()).expect("Failed to create SIGQUIT signal");
-        sigterm
-            .recv()
-            .await
-            .ok_or(anyhow!("Failed to receive SIGQUIT"))
-            .inspect(|()| warn!("Received SIGQUIT"))
+        sigterm.recv().await; // Despite its name, this variable is the SIGQUIT listener.
+
+        Err(anyhow!(SignalError("SIGQUIT".to_string()))) // The task always resolves to a SignalError.
     });
 }
+
+#[cfg(test)]
+mod tests { // Unit tests for AppResult: exit-code mapping and conversion from StdResult<()>.
+    use super::*;
+
+    #[test]
+    fn app_result_exit_code() { // Checks the exit code returned for each AppResult variant.
+        let expected_exit_code = ExitCode::SUCCESS;
+        let exit_code = AppResult::Success().exit_code();
+        assert_eq!(expected_exit_code, exit_code);
+
+        let expected_exit_code = ExitCode::FAILURE;
+        let exit_code = AppResult::UnretryableError(anyhow::anyhow!("an error")).exit_code();
+        assert_eq!(expected_exit_code, exit_code);
+
+        let expected_exit_code = ExitCode::from(2);
+        let exit_code = AppResult::RetryableError(anyhow::anyhow!("an error")).exit_code();
+        assert_eq!(expected_exit_code, exit_code);
+
+        let expected_exit_code = ExitCode::FAILURE;
+        let exit_code = AppResult::Cancelled(anyhow::anyhow!("an error")).exit_code();
+        assert_eq!(expected_exit_code, exit_code);
+    }
+
+    #[test]
+    fn app_result_conversion() { // Checks which variant each kind of result converts into.
+        assert!(matches!(AppResult::from(Ok(())), AppResult::Success()));
+
+        assert!(matches!(
+            AppResult::from(Err(anyhow!(RetryableDevnetError("an error".to_string())))),
+            AppResult::RetryableError(_)
+        ));
+
+        assert!(matches!(
+            AppResult::from(Err(anyhow!("an error"))), // A plain message error has no special type.
+            AppResult::UnretryableError(_)
+        ));
+
+        assert!(matches!(
+            AppResult::from(Err(anyhow!(SignalError("an error".to_string())))),
+            AppResult::Cancelled(_)
+        ));
+    }
+}