feat: New module volumes on existing catalogs #645

Merged (28 commits) on Oct 1, 2024
41 changes: 41 additions & 0 deletions databricks-s3-volume-existing-catalog/grants.tf
@@ -0,0 +1,41 @@
locals {
volume_r_grants = flatten([
for bucket in var.volume_buckets : [
for principal in bucket.volume_r_grant_principals : {
        volume_name = bucket.volume_name
principal = principal
}
]
])

volume_rw_grants = flatten([
for bucket in var.volume_buckets : [
for principal in bucket.volume_rw_grant_principals : {
        volume_name = bucket.volume_name
principal = principal
}
]
])
}

# Read-only access grants, keyed by volume and principal so that multiple
# principals on the same volume get distinct for_each keys
resource "databricks_grant" "volume_r" {
  for_each = { for grant in local.volume_r_grants : "${grant.volume_name}-${grant.principal}" => grant }

volume = databricks_volume.volume[each.value.volume_name].id
principal = each.value.principal
privileges = ["READ_VOLUME"]

depends_on = [databricks_volume.volume]
}

# Read/write access grants
resource "databricks_grant" "volume_rw" {
  for_each = { for grant in local.volume_rw_grants : "${grant.volume_name}-${grant.principal}" => grant }

volume = databricks_volume.volume[each.value.volume_name].id
principal = each.value.principal
privileges = ["READ_VOLUME", "WRITE_VOLUME"]

depends_on = [databricks_volume.volume]
}
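
For illustration, a hypothetical volume_buckets value and the grants the locals above flatten it into (all names invented):

volume_buckets = [
  {
    volume_name                = "raw-data"
    bucket_name                = "my-raw-data-bucket"
    volume_r_grant_principals  = ["analysts"]
    volume_rw_grant_principals = ["engineers"]
  }
]

This yields databricks_grant.volume_r["raw-data-analysts"] with READ_VOLUME and databricks_grant.volume_rw["raw-data-engineers"] with READ_VOLUME and WRITE_VOLUME.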
90 changes: 90 additions & 0 deletions databricks-s3-volume-existing-catalog/iam.tf
@@ -0,0 +1,90 @@
## Databricks external location and IAM

data "aws_caller_identity" "current" {
provider = aws
}

data "aws_iam_policy_document" "volume_dbx_unity_aws_role_assume_role" {
statement {
principals {
type = "AWS"
identifiers = ["arn:aws:iam::414351767826:role/unity-catalog-prod-UCMasterRole-14S5ZJVKOTYTL"]
}

actions = ["sts:AssumeRole"]
condition {
test = "StringEquals"
variable = "sts:ExternalId"

values = ["4a2f419c-ae7a-49f1-b774-8f3113d9834d"]
}
}
statement {
principals {
type = "AWS"
identifiers = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:root"]
}

actions = ["sts:AssumeRole"]
condition {
test = "ArnEquals"
variable = "aws:PrincipalArn"
values = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${local.dbx_volume_aws_role_name}"]
}
}
}

resource "aws_iam_role" "volume_dbx_unity_aws_role" {
name = local.dbx_volume_aws_role_name
path = local.path
assume_role_policy = data.aws_iam_policy_document.volume_dbx_unity_aws_role_assume_role.json
}


### Policy document to access default volume bucket and assume role
data "aws_iam_policy_document" "volume_bucket_dbx_unity_access" {
statement {
sid = "dbxSCBucketAccess"
effect = "Allow"
actions = [
"s3:ListBucket",
"s3:GetBucketLocation",
"s3:GetLifecycleConfiguration",
"s3:PutLifecycleConfiguration"
]
resources = [
for bucket in var.volume_buckets : "arn:aws:s3:::${bucket.bucket_name}"
]
}
statement {
sid = "dbxSCObjAccess"
effect = "Allow"
actions = [
"s3:GetObject",
"s3:PutObject",
"s3:DeleteObject",
]
resources = [
for bucket in var.volume_buckets : "arn:aws:s3:::${bucket.bucket_name}/*"
]
}
statement {
sid = "databricksAssumeRole"
effect = "Allow"
actions = [
"sts:AssumeRole"
]
resources = [
"arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${local.dbx_volume_aws_role_name}"
]
}
}

resource "aws_iam_policy" "volume_dbx_unity_access_policy" {
policy = data.aws_iam_policy_document.volume_bucket_dbx_unity_access.json
}

resource "aws_iam_role_policy_attachment" "volume_dbx_unity_aws_access" {
policy_arn = aws_iam_policy.volume_dbx_unity_access_policy.arn
role = aws_iam_role.volume_dbx_unity_aws_role.name
}
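
For context: the first trust statement lets Databricks' Unity Catalog master role assume this role, gated by the sts:ExternalId condition, and the second makes the role self-assuming, which Unity Catalog requires of storage credential roles. With a hypothetical account ID 111122223333 and catalog_name "analytics", the self-assume condition and the databricksAssumeRole statement both resolve to the same ARN:

arn:aws:iam::111122223333:role/databricks/analytics-volumes-role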
54 changes: 54 additions & 0 deletions databricks-s3-volume-existing-catalog/main.tf
@@ -0,0 +1,54 @@
# Volume bucket (UC supported)

// https://docs.databricks.com/administration-guide/multiworkspace/iam-role.html#language-Your%C2%A0VPC,%C2%A0custom
locals {
dbx_volume_aws_role_name = "${var.catalog_name}-volumes-role"
path = "/databricks/"
databricks_aws_account = "414351767826" # Databricks' own AWS account, not CZI's. See https://docs.databricks.com/en/administration-guide/account-settings-e2/credentials.html#step-1-create-a-cross-account-iam-role
}

### Databricks storage credential - allows workspace to access an external location.
### NOTE: names need to be unique across an account, not just a workspace

resource "databricks_storage_credential" "volume" {
depends_on = [
    aws_iam_role.volume_dbx_unity_aws_role,
    aws_iam_role_policy_attachment.volume_dbx_unity_aws_access
]

name = "${var.catalog_name}-volumes-storage-credential"
aws_iam_role {
role_arn = aws_iam_role.volume_dbx_unity_aws_role.arn
}
comment = "Managed by Terraform - access for the volumes in ${var.catalog_name}"
}

# The storage credential sometimes takes a moment to register upstream before external locations can reference it
resource "time_sleep" "wait_30_seconds" {
depends_on = [databricks_storage_credential.volume]

create_duration = "30s"
}

resource "databricks_external_location" "volume" {
for_each = { for bucket in var.volume_buckets : bucket.volume_name => bucket }
depends_on = [time_sleep.wait_30_seconds]

name = "${each.value.volume_name}-external-location"
url = "s3://${each.value.bucket_name}"
credential_name = databricks_storage_credential.volume.name
  comment         = "Managed by Terraform - access for the volume named ${each.value.volume_name} in ${var.catalog_name}"
}

# New volume
resource "databricks_volume" "volume" {
for_each = { for bucket in var.volume_buckets : bucket.volume_name => bucket }
depends_on = [databricks_external_location.volume]
name = each.value.volume_name
catalog_name = var.catalog_name
schema_name = var.schema_name
volume_type = "EXTERNAL"
storage_location = "s3://${each.value.bucket_name}${each.value.bucket_prefix != "" ? "/${each.value.bucket_prefix}" : ""}"
owner = var.catalog_owner
  comment          = "Managed by Terraform - access for the volume named ${each.value.volume_name} in ${var.catalog_name}"
}
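
A worked example of the storage_location expression above, with hypothetical inputs:

# bucket_name = "my-bucket", bucket_prefix = ""        => "s3://my-bucket"
# bucket_name = "my-bucket", bucket_prefix = "landing" => "s3://my-bucket/landing"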
36 changes: 36 additions & 0 deletions databricks-s3-volume-existing-catalog/variables.tf
@@ -0,0 +1,36 @@
variable "catalog_name" {
description = "Name of the Databricks existing catalog to add the volume to"
type = string
}

variable "catalog_owner" {
description = "User or group name of the catalog owner"
type = string
}

variable "schema_name" {
description = "Name of the Databricks schema to add the volume to"
type = string
}

variable "volume_buckets" {
description = "List of external buckets and their corresponding groups that should have r/rw access to it"
type = list(object({
volume_name : string
bucket_name : string
bucket_prefix: optional(string, "")
volume_r_grant_principals: optional(list(string), [])
volume_rw_grant_principals: optional(list(string), [])
}))
}

variable "tags" {
description = "REQUIRED: Tags to include for this environment."
type = object({
project : string
env : string
service : string
owner : string
managedBy : string
})
}
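
A minimal sketch of how a caller might invoke this module; the source path, names, and principals are all hypothetical:

module "analytics_volumes" {
  source = "../databricks-s3-volume-existing-catalog"

  catalog_name  = "analytics"
  catalog_owner = "data-platform-admins"
  schema_name   = "default"

  volume_buckets = [
    {
      volume_name                = "raw-data"
      bucket_name                = "my-raw-data-bucket"
      bucket_prefix              = "landing"
      volume_r_grant_principals  = ["analysts"]
      volume_rw_grant_principals = ["engineers"]
    }
  ]

  tags = {
    project   = "analytics"
    env       = "prod"
    service   = "databricks"
    owner     = "data-platform"
    managedBy = "terraform"
  }
}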
11 changes: 11 additions & 0 deletions databricks-s3-volume-existing-catalog/versions.tf
@@ -0,0 +1,11 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
}
databricks = {
source = "databricks/databricks"
}
}
required_version = ">= 1.3.0"
}
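
The module pins provider sources but not their configuration, so the caller supplies both providers; a minimal sketch with hypothetical values (any supported Databricks auth method works):

provider "aws" {
  region = "us-west-2"
}

provider "databricks" {
  host = "https://example-workspace.cloud.databricks.com"
}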