diff --git a/examples/minimal/minimal_ride_hailing.ipynb b/examples/minimal/minimal_ride_hailing.ipynb index 0d4d9c54b0..bc170fa1f8 100644 --- a/examples/minimal/minimal_ride_hailing.ipynb +++ b/examples/minimal/minimal_ride_hailing.ipynb @@ -571,6 +571,15 @@ " files = [\"s3://\" + path for path in fs.glob(uri + '/part-*')]\n", " ds = ParquetDataset(files, filesystem=fs)\n", " return ds.read().to_pandas()\n", + " elif parsed_uri.scheme == 'wasbs':\n", + " import adlfs\n", + " fs = adlfs.AzureBlobFileSystem(\n", + " account_name=os.getenv('FEAST_AZURE_BLOB_ACCOUNT_NAME'), account_key=os.getenv('FEAST_AZURE_BLOB_ACCOUNT_ACCESS_KEY')\n", + " )\n", + " uripath = parsed_uri.username + parsed_uri.path\n", + " files = fs.glob(uripath + '/part-*')\n", + " ds = ParquetDataset(files, filesystem=fs)\n", + " return ds.read().to_pandas()\n", " else:\n", " raise ValueError(f\"Unsupported URL scheme {uri}\")" ] @@ -1275,6 +1284,12 @@ "metadata": {}, "outputs": [], "source": [ + "# Note: depending on the Kafka configuration you may need to create the Kafka topic first, like below:\n", + "#from confluent_kafka.admin import AdminClient, NewTopic\n", + "#admin = AdminClient({'bootstrap.servers': KAFKA_BROKER})\n", + "#new_topic = NewTopic('driver_trips', num_partitions=1, replication_factor=3)\n", + "#admin.create_topics([new_topic])\n", + "\n", "for record in trips_df.drop(columns=['created']).to_dict('record'):\n", " record[\"datetime\"] = (\n", " record[\"datetime\"].to_pydatetime().replace(tzinfo=pytz.utc)\n", diff --git a/infra/terraform/azure/README.md b/infra/terraform/azure/README.md new file mode 100644 index 0000000000..b22c870d12 --- /dev/null +++ b/infra/terraform/azure/README.md @@ -0,0 +1,36 @@ +# Terraform config for Feast on Azure + +This serves as a guide on how to deploy Feast on Azure. At the end of this guide, we will have provisioned: +1. AKS cluster +2. Feast services running on AKS +3. Azure Cache (Redis) as online store +4. Spark operator on AKS +5. 
Kafka running on HDInsight. + +# Steps + +1. Create a tfvars file, e.g. `my.tfvars`. A sample configuration is as below: + +``` +name_prefix = "feast09" +resource_group = "Feast" # pre-existing resource group +``` + +2. Configure tf state backend, e.g.: +``` +terraform { + backend "azurerm" { + storage_account_name = "" + container_name = "" + key = "" + } +} +``` + +3. Use `terraform apply -var-file="my.tfvars"` to deploy. + +Note: to get the list of Kafka brokers needed for streaming ingestion, use + +`curl -sS -u : -G https://.azurehdinsight.net/api/v1/clusters//services/KAFKA/components/KAFKA_BROKER | jq -r '["\(.host_components[].HostRoles.host_name):9092"] | join(",")'` + +where the Kafka gateway username is -kafka-gateway, the Kafka cluster name is -kafka, and the Kafka gateway password is a kubectl secret under the name feast-kafka-gateway. diff --git a/infra/terraform/azure/aks.tf b/infra/terraform/azure/aks.tf new file mode 100644 index 0000000000..c0899d49c9 --- /dev/null +++ b/infra/terraform/azure/aks.tf @@ -0,0 +1,15 @@ +resource "azurerm_kubernetes_cluster" "main" { + name = "${var.name_prefix}-aks" + location = data.azurerm_resource_group.main.location + resource_group_name = data.azurerm_resource_group.main.name + dns_prefix = var.name_prefix + default_node_pool { + name = var.name_prefix + vm_size = var.aks_machine_type + node_count = var.aks_node_count + vnet_subnet_id = azurerm_subnet.main.id + } + identity { + type = "SystemAssigned" + } +} diff --git a/infra/terraform/azure/helm.tf b/infra/terraform/azure/helm.tf new file mode 100644 index 0000000000..8c28762a43 --- /dev/null +++ b/infra/terraform/azure/helm.tf @@ -0,0 +1,101 @@ +locals { + feast_postgres_secret_name = "${var.name_prefix}-postgres-secret" + feast_helm_values = { + redis = { + enabled = false + } + + grafana = { + enabled = false + } + + kafka = { + enabled = false + } + + postgresql = { + existingSecret = local.feast_postgres_secret_name + } + + feast-core = { + postgresql = 
{ + existingSecret = local.feast_postgres_secret_name + } + } + + feast-online-serving = { + enabled = true + "application-override.yaml" = { + feast = { + core-host = "${var.name_prefix}-feast-core" + core-grpc-port = 6565 + active_store = "online_store" + stores = [ + { + name = "online_store" + type = "REDIS" + config = { + host = azurerm_redis_cache.main.hostname + port = azurerm_redis_cache.main.ssl_port + ssl = true + } + } + ] + } + } + } + + feast-jupyter = { + enabled = true + envOverrides = { + feast_redis_host = azurerm_redis_cache.main.hostname, + feast_redis_port = azurerm_redis_cache.main.ssl_port, + feast_redis_ssl = true + feast_spark_launcher = "k8s" + feast_spark_staging_location = "wasbs://${azurerm_storage_container.staging.name}@${azurerm_storage_account.main.name}.blob.core.windows.net/artifacts/" + feast_historical_feature_output_location : "wasbs://${azurerm_storage_container.staging.name}@${azurerm_storage_account.main.name}.blob.core.windows.net/out/" + feast_historical_feature_output_format : "parquet" + demo_data_location : "wasbs://${azurerm_storage_container.staging.name}@${azurerm_storage_account.main.name}.blob.core.windows.net/test-data/" + feast_azure_blob_account_name = azurerm_storage_account.main.name + feast_azure_blob_account_access_key = azurerm_storage_account.main.primary_access_key + } + } + } +} + +resource "random_password" "feast-postgres-password" { + length = 16 + special = false +} + +resource "kubernetes_secret" "feast-postgres-secret" { + metadata { + name = local.feast_postgres_secret_name + } + data = { + postgresql-password = random_password.feast-postgres-password.result + } +} + +resource "helm_release" "feast" { + depends_on = [kubernetes_secret.feast-postgres-secret] + + name = var.name_prefix + namespace = var.aks_namespace + chart = "../../charts/feast" + + values = [ + yamlencode(local.feast_helm_values) + ] +} + +resource "helm_release" "sparkop" { + name = "sparkop" + namespace = "default" + repository 
= "https://googlecloudplatform.github.io/spark-on-k8s-operator" + chart = "spark-operator" + set { + name = "serviceAccounts.spark.name" + value = "spark" + } +} diff --git a/infra/terraform/azure/kafka.tf b/infra/terraform/azure/kafka.tf new file mode 100644 index 0000000000..a7403ff709 --- /dev/null +++ b/infra/terraform/azure/kafka.tf @@ -0,0 +1,75 @@ +resource "azurerm_hdinsight_kafka_cluster" "main" { + name = "${var.name_prefix}-kafka" + location = data.azurerm_resource_group.main.location + resource_group_name = data.azurerm_resource_group.main.name + cluster_version = "4.0" + tier = "Standard" + + component_version { + kafka = "2.1" + } + + gateway { + enabled = true + username = "${var.name_prefix}-kafka-gateway" + password = random_password.feast-kafka-gateway-password.result + } + + storage_account { + is_default = true + storage_account_key = azurerm_storage_account.main.primary_access_key + storage_container_id = azurerm_storage_container.kafka.id + } + + roles { + head_node { + vm_size = var.kafka_head_vm_size + username = "${var.name_prefix}-kafka-user" + password = random_password.feast-kafka-role-password.result + subnet_id = azurerm_subnet.kafka.id + virtual_network_id = azurerm_virtual_network.main.id + } + worker_node { + vm_size = var.kafka_worker_vm_size + username = "${var.name_prefix}-kafka-user" + password = random_password.feast-kafka-role-password.result + number_of_disks_per_node = var.kafka_worker_disks_per_node + target_instance_count = var.kafka_worker_target_instance_count + subnet_id = azurerm_subnet.kafka.id + virtual_network_id = azurerm_virtual_network.main.id + } + zookeeper_node { + vm_size = var.kafka_zookeeper_vm_size + username = "${var.name_prefix}-kafka-user" + password = random_password.feast-kafka-role-password.result + subnet_id = azurerm_subnet.kafka.id + virtual_network_id = azurerm_virtual_network.main.id + } + } +} + +resource "random_password" "feast-kafka-role-password" { + length = 16 + special = false + 
min_upper = 1 + min_lower = 1 + min_numeric = 1 +} + +resource "random_password" "feast-kafka-gateway-password" { + length = 16 + special = true + min_upper = 1 + min_lower = 1 + min_special = 1 + min_numeric = 1 +} + +resource "kubernetes_secret" "feast-kafka-gateway-secret" { + metadata { + name = "feast-kafka-gateway" + } + data = { + kafka-gateway-password = random_password.feast-kafka-gateway-password.result + } +} diff --git a/infra/terraform/azure/provider.tf b/infra/terraform/azure/provider.tf new file mode 100644 index 0000000000..916c10143f --- /dev/null +++ b/infra/terraform/azure/provider.tf @@ -0,0 +1,28 @@ +provider "azurerm" { + version = "=2.40.0" + features {} +} + +provider "helm" { + version = "~> 1.3.2" + kubernetes { + host = azurerm_kubernetes_cluster.main.kube_config.0.host + username = azurerm_kubernetes_cluster.main.kube_config.0.username + password = azurerm_kubernetes_cluster.main.kube_config.0.password + client_certificate = base64decode(azurerm_kubernetes_cluster.main.kube_config.0.client_certificate) + client_key = base64decode(azurerm_kubernetes_cluster.main.kube_config.0.client_key) + cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.main.kube_config.0.cluster_ca_certificate) + load_config_file = false + } +} + +provider "kubernetes" { + version = "~> 1.13.3" + host = azurerm_kubernetes_cluster.main.kube_config.0.host + username = azurerm_kubernetes_cluster.main.kube_config.0.username + password = azurerm_kubernetes_cluster.main.kube_config.0.password + client_certificate = base64decode(azurerm_kubernetes_cluster.main.kube_config.0.client_certificate) + client_key = base64decode(azurerm_kubernetes_cluster.main.kube_config.0.client_key) + cluster_ca_certificate = base64decode(azurerm_kubernetes_cluster.main.kube_config.0.cluster_ca_certificate) + load_config_file = false +} diff --git a/infra/terraform/azure/redis.tf b/infra/terraform/azure/redis.tf new file mode 100644 index 0000000000..c6e85a4a0b --- /dev/null +++ 
b/infra/terraform/azure/redis.tf @@ -0,0 +1,12 @@ +resource "azurerm_redis_cache" "main" { + name = "${var.name_prefix}-redis" + location = data.azurerm_resource_group.main.location + resource_group_name = data.azurerm_resource_group.main.name + capacity = var.redis_capacity + family = "P" + sku_name = "Premium" + redis_configuration { + enable_authentication = false + } + subnet_id = azurerm_subnet.redis.id +} diff --git a/infra/terraform/azure/sparkop.tf b/infra/terraform/azure/sparkop.tf new file mode 100644 index 0000000000..e4aa8d7aca --- /dev/null +++ b/infra/terraform/azure/sparkop.tf @@ -0,0 +1,27 @@ +resource "kubernetes_role" "sparkop-user" { + metadata { + name = "use-spark-operator" + namespace = var.aks_namespace + } + rule { + api_groups = ["sparkoperator.k8s.io"] + resources = ["sparkapplications"] + verbs = ["create", "delete", "deletecollection", "get", "list", "update", "watch", "patch"] + } +} + +resource "kubernetes_role_binding" "sparkop-user" { + metadata { + name = "use-spark-operator" + namespace = var.aks_namespace + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = kubernetes_role.sparkop-user.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = "default" + } +} diff --git a/infra/terraform/azure/storage.tf b/infra/terraform/azure/storage.tf new file mode 100644 index 0000000000..08db2386a4 --- /dev/null +++ b/infra/terraform/azure/storage.tf @@ -0,0 +1,21 @@ +resource "azurerm_storage_account" "main" { + name = "${var.name_prefix}storage" + resource_group_name = data.azurerm_resource_group.main.name + location = data.azurerm_resource_group.main.location + account_kind = "StorageV2" + account_tier = "Standard" + account_replication_type = var.storage_account_replication_type + allow_blob_public_access = true +} + +resource "azurerm_storage_container" "staging" { + name = "staging" + storage_account_name = azurerm_storage_account.main.name + container_access_type = "blob" +} + +resource 
"azurerm_storage_container" "kafka" { + name = "kafkastorage" + storage_account_name = azurerm_storage_account.main.name + container_access_type = "blob" +} diff --git a/infra/terraform/azure/variables.tf b/infra/terraform/azure/variables.tf new file mode 100644 index 0000000000..be4e7f2c19 --- /dev/null +++ b/infra/terraform/azure/variables.tf @@ -0,0 +1,57 @@ +variable "resource_group" { + type = string +} + +variable "name_prefix" { + type = string +} + +variable "aks_machine_type" { + type = string + default = "Standard_DS2_v2" +} + +variable "aks_node_count" { + type = number + default = 2 +} + +variable "redis_capacity" { + type = number + default = 2 +} + +variable "storage_account_replication_type" { + type = string + default = "LRS" +} + +variable "aks_namespace" { + type = string + default = "default" +} + +variable "kafka_head_vm_size" { + type = string + default = "Standard_DS3_v2" +} + +variable "kafka_worker_vm_size" { + type = string + default = "A5" +} + +variable "kafka_zookeeper_vm_size" { + type = string + default = "Standard_DS3_v2" +} + +variable "kafka_worker_disks_per_node" { + type = number + default = 3 +} + +variable "kafka_worker_target_instance_count" { + type = number + default = 3 +} diff --git a/infra/terraform/azure/vnet.tf b/infra/terraform/azure/vnet.tf new file mode 100644 index 0000000000..db790991e0 --- /dev/null +++ b/infra/terraform/azure/vnet.tf @@ -0,0 +1,31 @@ +data "azurerm_resource_group" "main" { + name = var.resource_group +} + +resource "azurerm_virtual_network" "main" { + name = "${var.name_prefix}-vnet" + location = data.azurerm_resource_group.main.location + resource_group_name = data.azurerm_resource_group.main.name + address_space = ["10.1.0.0/16"] +} + +resource "azurerm_subnet" "main" { + name = "${var.name_prefix}-aks-subnet" + resource_group_name = data.azurerm_resource_group.main.name + virtual_network_name = azurerm_virtual_network.main.name + address_prefixes = ["10.1.0.0/24"] +} + +resource 
"azurerm_subnet" "redis" { + name = "${var.name_prefix}-redis-subnet" + resource_group_name = data.azurerm_resource_group.main.name + virtual_network_name = azurerm_virtual_network.main.name + address_prefixes = ["10.1.128.0/24"] +} + +resource "azurerm_subnet" "kafka" { + name = "${var.name_prefix}-kafka-subnet" + resource_group_name = data.azurerm_resource_group.main.name + virtual_network_name = azurerm_virtual_network.main.name + address_prefixes = ["10.1.64.0/24"] +}