From e7d67885da6c6bfb6003f571e4803570a3545fff Mon Sep 17 00:00:00 2001 From: Alexandre Gattiker Date: Thu, 3 Oct 2019 20:18:52 +0200 Subject: [PATCH] Added integration tests for eventhubs-streamanalytics-eventhubs (#54) * Create test_spec.json * Report throughput for multiple event hubs * Changed ASA EH output to array of events * Update README.md --- .../azure-event-hubs/report-throughput.sh | 28 ++++++++++++----- eventhubs-streamanalytics-eventhubs/README.md | 31 +++++++++++++++++++ ...eam-analytics-job-simple-arm-template.json | 3 +- .../test_spec.json | 13 ++++++++ integration-tests/azure-pipelines.yml | 12 ++++++- .../databricks/job/run-databricks-job.sh | 1 + .../notebooks/verify-eventhubs.scala | 4 ++- 7 files changed, 82 insertions(+), 10 deletions(-) create mode 100644 eventhubs-streamanalytics-eventhubs/test_spec.json diff --git a/components/azure-event-hubs/report-throughput.sh b/components/azure-event-hubs/report-throughput.sh index a92a7c94..2c90c46d 100755 --- a/components/azure-event-hubs/report-throughput.sh +++ b/components/azure-event-hubs/report-throughput.sh @@ -6,20 +6,34 @@ set -euo pipefail REPORT_THROUGHPUT_MINUTES=${REPORT_THROUGHPUT_MINUTES:-30} ofs=2 -eh_resource=$(az eventhubs namespace show -g $RESOURCE_GROUP -n "$EVENTHUB_NAMESPACE" --query id -o tsv) +eh_resources=$(az eventhubs namespace show -g $RESOURCE_GROUP -n "$EVENTHUB_NAMESPACE" --query id -o tsv) +if [ -n "${EVENTHUB_NAMESPACE_OUT:-}" ]; then + eh_resources="$eh_resources $(az eventhubs namespace show -g $RESOURCE_GROUP -n "$EVENTHUB_NAMESPACE_OUT" --query id -o tsv)" +fi eh_capacity=$(az eventhubs namespace show -g $RESOURCE_GROUP -n "$EVENTHUB_NAMESPACE" --query sku.capacity -o tsv) metric_names="IncomingMessages IncomingBytes OutgoingMessages OutgoingBytes ThrottledRequests" -fmt="%28s%20s%20s%20s%20s%20s\n" +fmt="%28s%12s%20s%20s%20s%20s%20s\n" echo "Event Hub capacity: $eh_capacity throughput units (this determines MAX VALUE below)." echo "Reporting aggregate metrics per minute, offset by $ofs minutes, for $REPORT_THROUGHPUT_MINUTES minutes." -printf "$fmt" "" $metric_names +printf "$fmt" "" "Event Hub #" $metric_names PER_MIN=60 MB=1000000 -printf "$fmt" "" $(tr -C " " "-" <<<$metric_names) -printf "$fmt" "MAX VALUE" "$((eh_capacity*1000*PER_MIN))" "$((eh_capacity*1*MB*PER_MIN))" "$((eh_capacity*4096*PER_MIN))" "$((eh_capacity*2*MB*PER_MIN))" "-" -printf "$fmt" "" $(tr -C " " "-" <<<$metric_names) +printf "$fmt" "" "-----------" $(tr -C " " "-" <<<$metric_names) +printf "$fmt" "MAX VALUE" "" "$((eh_capacity*1000*PER_MIN))" "$((eh_capacity*1*MB*PER_MIN))" "$((eh_capacity*4096*PER_MIN))" "$((eh_capacity*2*MB*PER_MIN))" "-" +printf "$fmt" "" "-----------" $(tr -C " " "-" <<<$metric_names) + for i in $(seq 1 $REPORT_THROUGHPUT_MINUTES) ; do - printf "$fmt" "$(date +%Y-%m-%dT%H:%M:%S%z)" $(az monitor metrics list --resource "$eh_resource" --interval PT1M --metrics $(tr " " "," <<< $metric_names) --offset ${ofs}M --query 'value[].timeseries[0].data[0].floor(total)' -o tsv) + eh_number=0 + date=$(date +%Y-%m-%dT%H:%M:%S%z) + for eh in $eh_resources; do + eh_number=$((eh_number+1)) + metrics=$( + az monitor metrics list --resource "$eh" --interval PT1M \ + --metrics $(tr " " "," <<< $metric_names) --offset ${ofs}M \ + --query 'value[].timeseries[0].data[0].floor(total)' -o tsv + ) + printf "$fmt" "$date" "$eh_number" $metrics + done # sleep until next full minute. "10#" is to force base 10 if string is e.g. "09" sleep "$((60 - 10#$(date +%S) ))" diff --git a/eventhubs-streamanalytics-eventhubs/README.md b/eventhubs-streamanalytics-eventhubs/README.md index 6020d072..18599f0a 100644 --- a/eventhubs-streamanalytics-eventhubs/README.md +++ b/eventhubs-streamanalytics-eventhubs/README.md @@ -130,6 +130,37 @@ The above settings has been chosen to sustain a 1000 msg/sec stream. Please use Metrics pane in Stream Analytics, see "Input/Output Events" for throughput and "Watermark Delay" metric to see if the job is keeping up with the input rate. You can also use Event Hub "Metrics" pane to see if there are any "Throttled Requests" and adjust the Threshold Units accordingly. "Watermark Delay" is one of the key metric that will help you to understand if Stream Analytics is keeping up with the incoming data. If delay is constantly increasing, you need to take a look at the destination to see if it can keep up with the speed or check if you need to increase SU: https://azure.microsoft.com/en-us/blog/new-metric-in-azure-stream-analytics-tracks-latency-of-your-streaming-pipeline/. + +The deployment script will also report performance, by default every minute for 30 minutes: + +``` +***** [M] Starting METRICS reporting +Event Hub capacity: 2 throughput units (this determines MAX VALUE below). +Reporting aggregate metrics per minute, offset by 2 minutes, for 30 minutes. + Event Hub # IncomingMessages IncomingBytes OutgoingMessages OutgoingBytes ThrottledRequests + ----------- ---------------- ------------- ---------------- ------------- ----------------- + MAX VALUE 120000 120000000 491520 240000000 - + ----------- ---------------- ------------- ---------------- ------------- ----------------- + 2019-10-03T07:57:00 1 0 0 0 0 0 + 2019-10-03T07:57:00 2 0 0 0 0 0 + 2019-10-03T07:58:00 1 24050 22809797 24050 22809797 0 + 2019-10-03T07:58:00 2 0 0 0 0 0 + 2019-10-03T07:59:01 1 60037 56940526 60037 56940526 0 + 2019-10-03T07:59:01 2 341 62393762 0 0 0 + 2019-10-03T08:00:00 1 60090 56989878 60090 56989878 0 + 2019-10-03T08:00:00 2 375 65683281 0 0 0 + 2019-10-03T08:01:00 1 60036 56940643 60036 56940643 0 + 2019-10-03T08:01:00 2 376 65708824 0 0 0 +``` + +In column "Event Hub #", 1 refers to the Event Hub used as input to Stream +Analytics, and 2 to the Event Hub used as output. After a few minutes of +ramp-up, the metrics for Event Hub 1 will show around 60k events/min +(depending on selected event rate, here 1k events/s). As Stream Analytics +batches up messages when outputting to Event Hubs, the rate in events/minute +on Event Hub 2 will be much lower, but you can see from the Incoming Bytes +metric that the data rate on both event hubs is similar. + ## Stream Analytics Note that the solution configurations have been verified with compatibility level 1.2. The deployed Stream Analytics solution doesn't do any analytics or projection, but it just inject an additional field using a simple Javascript UDF: diff --git a/eventhubs-streamanalytics-eventhubs/stream-analytics-job-simple-arm-template.json b/eventhubs-streamanalytics-eventhubs/stream-analytics-job-simple-arm-template.json index c14684e8..c7f69025 100644 --- a/eventhubs-streamanalytics-eventhubs/stream-analytics-job-simple-arm-template.json +++ b/eventhubs-streamanalytics-eventhubs/stream-analytics-job-simple-arm-template.json @@ -114,7 +114,8 @@ "serialization": { "type": "JSON", "properties": { - "encoding": "UTF8" + "encoding": "UTF8", + "format": "Array" } }, "datasource": { diff --git a/eventhubs-streamanalytics-eventhubs/test_spec.json b/eventhubs-streamanalytics-eventhubs/test_spec.json new file mode 100644 index 00000000..ea61d557 --- /dev/null +++ b/eventhubs-streamanalytics-eventhubs/test_spec.json @@ -0,0 +1,13 @@ +[ + { + "stage": "2", + "short": "ese1", + "steps": "CIPTMV", + "minutes": "10", + "throughput": "1", + "extra_args": [ + "a", + "simple" + ] + } +] diff --git a/integration-tests/azure-pipelines.yml b/integration-tests/azure-pipelines.yml index 08f70fba..61b7c5a4 100644 --- a/integration-tests/azure-pipelines.yml +++ b/integration-tests/azure-pipelines.yml @@ -10,7 +10,7 @@ jobs: azureSubscription: ARMConnection scriptLocation: 'inlineScript' inlineScript: az vm start -g "$AGENT_VM_RESOURCE_GROUP" -n "$AGENT_VM_NAME" - displayName: 'start agent' + displayName: 'start agent VM' - job: run_tests dependsOn: start_agent @@ -57,3 +57,13 @@ jobs: # Provide service principal (for Azure Data Explorer RBAC setup) addSpnToEnvironment: true displayName: 'pytest stage 3' + +- job: stop_agent + dependsOn: run_tests + steps: + - task: AzureCLI@1 + inputs: + azureSubscription: ARMConnection + scriptLocation: 'inlineScript' + inlineScript: az vm deallocate -g "$AGENT_VM_RESOURCE_GROUP" -n "$AGENT_VM_NAME" --no-wait + displayName: 'stop agent VM' diff --git a/streaming/databricks/job/run-databricks-job.sh b/streaming/databricks/job/run-databricks-job.sh index 36ae6962..cffc77e6 100755 --- a/streaming/databricks/job/run-databricks-job.sh +++ b/streaming/databricks/job/run-databricks-job.sh @@ -22,6 +22,7 @@ wait_for_run () { sleep 10 fi done + echo result_state=$(jq -r ".state.result_state" <<< "$run_info") state_message=$(jq -r ".state.state_message" <<< "$run_info") diff --git a/streaming/databricks/notebooks/verify-eventhubs.scala b/streaming/databricks/notebooks/verify-eventhubs.scala index 144a4c97..68410af9 100644 --- a/streaming/databricks/notebooks/verify-eventhubs.scala +++ b/streaming/databricks/notebooks/verify-eventhubs.scala @@ -37,10 +37,12 @@ val schema = StructType( StructField("processedAt", TimestampType) :: Nil) +val arrayOfEventsSchema = ArrayType(schema) + val stagingTable = "tempresult_" + randomUUID().toString.replace("-","_") var query = streamingData - .select(from_json(decode($"body", "UTF-8"), schema).as("eventData"), $"*") + .select(explode(from_json(decode($"body", "UTF-8"), arrayOfEventsSchema)).as("eventData"), $"*") // When consuming from the output of eventhubs-streamanalytics-eventhubs pipeline, 'enqueuedAt' will haven been // set when reading from the first eventhub, and the enqueued timestamp of the second eventhub is then the 'storedAt' time .select($"eventData.*", $"offset", $"sequenceNumber", $"publisher", $"partitionKey", $"enqueuedTime".as("storedAt"))