Skip to content

Commit

Permalink
Upload git info with metadata file (#1) (#37802)
Browse files Browse the repository at this point in the history
## What
<!--
* Describe what the change is solving. Link all GitHub issues related to this change.
-->
Adds git commit info to the metadata file during upload.

![image.png](https://graphite-user-uploaded-assets-prod.s3.amazonaws.com/PTsI7qAmiIMkhFQg04QF/b7de4cce-ffe8-4506-a13d-027b1ba21a34.png)


Spun out of #32715 as a stack
  • Loading branch information
bnchrch authored May 9, 2024
1 parent 99ab869 commit c0492b0
Show file tree
Hide file tree
Showing 21 changed files with 1,417 additions and 487 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import base64
import hashlib
import json
import logging
import os
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

import git
import yaml
from google.cloud import storage
from google.oauth2 import service_account
Expand All @@ -23,8 +26,10 @@
METADATA_FOLDER,
)
from metadata_service.models.generated.ConnectorMetadataDefinitionV0 import ConnectorMetadataDefinitionV0
from metadata_service.models.generated.GitInfo import GitInfo
from metadata_service.models.transform import to_json_sanitized_dict
from metadata_service.validators.metadata_validator import POST_UPLOAD_VALIDATORS, ValidatorOptions, validate_and_load
from pydash import set_
from pydash.objects import get


Expand Down Expand Up @@ -172,27 +177,95 @@ def _doc_upload(
return doc_uploaded, doc_blob_id


def create_prerelease_metadata_file(metadata_file_path: Path, validator_opts: ValidatorOptions) -> Path:
metadata, error = validate_and_load(metadata_file_path, [], validator_opts)
if metadata is None:
raise ValueError(f"Metadata file {metadata_file_path} is invalid for uploading: {error}")
def _apply_prerelease_overrides(metadata_dict: dict, validator_opts: ValidatorOptions) -> dict:
"""Apply any prerelease overrides to the metadata file before uploading it to GCS."""
if validator_opts.prerelease_tag is None:
return metadata_dict

# replace any dockerImageTag references with the actual tag
# this includes metadata.data.dockerImageTag, metadata.data.registries[].dockerImageTag
# where registries is a dictionary of registry name to registry object
metadata_dict = to_json_sanitized_dict(metadata, exclude_none=True)
metadata_dict["data"]["dockerImageTag"] = validator_opts.prerelease_tag
for registry in get(metadata_dict, "data.registries", {}).values():
if "dockerImageTag" in registry:
registry["dockerImageTag"] = validator_opts.prerelease_tag

# write metadata to yaml file in system tmp folder
tmp_metadata_file_path = Path("/tmp") / metadata.data.dockerRepository / validator_opts.prerelease_tag / METADATA_FILE_NAME
tmp_metadata_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(tmp_metadata_file_path, "w") as f:
yaml.dump(metadata_dict, f)
return metadata_dict


def _commit_to_git_info(commit: git.Commit) -> GitInfo:
    """Translate a GitPython commit into the GitInfo model stored in the metadata.

    The sha, authored timestamp, author name and author email are copied
    verbatim from the commit object.
    """
    sha = commit.hexsha
    authored_at = commit.authored_datetime
    author = commit.author
    git_info_fields = {
        "commit_sha": sha,
        "commit_timestamp": authored_at,
        "commit_author": author.name,
        "commit_author_email": author.email,
    }
    return GitInfo(**git_info_fields)


def _get_git_info_for_file(original_metadata_file_path: Path) -> Optional[GitInfo]:
    """Return git details of the last commit that modified the given metadata file.

    Args:
        original_metadata_file_path: Path of the metadata file whose history is queried.

    Returns:
        A GitInfo built from the most recent commit touching the file, or None
        (after logging a warning) when no git repository is found or git has no
        commit for the file.

    Raises:
        git.exc.GitCommandError: For any git failure other than the file being
            unknown to git.
    """
    try:
        # No path is passed, so GitPython picks its default start location and
        # walks up parent directories from there to find the repository.
        repo = git.Repo(search_parent_directories=True)

        # Get the hash of the last commit that modified the metadata file.
        # "--" forces git to read the argument as a path, never as a revision.
        commit_sha = repo.git.log("-1", "--format=%H", "--", str(original_metadata_file_path)).strip()

        if not commit_sha:
            # git exits successfully with empty output when the path has no
            # history (e.g. a new, never-committed file). An empty sha must not
            # be handed to repo.commit().
            logging.warning(f"Metadata file {original_metadata_file_path} is not tracked by git, skipping author info attachment.")
            return None

        return _commit_to_git_info(repo.commit(commit_sha))
    except git.exc.InvalidGitRepositoryError:
        logging.warning(f"Metadata file {original_metadata_file_path} is not in a git repository, skipping author info attachment.")
        return None
    except git.exc.GitCommandError as e:
        if "unknown revision or path not in the working tree" in str(e):
            logging.warning(f"Metadata file {original_metadata_file_path} is not tracked by git, skipping author info attachment.")
            return None
        # Bare raise keeps the original traceback intact.
        raise


return tmp_metadata_file_path
def _apply_author_info_to_metadata_file(metadata_dict: dict, original_metadata_file_path: Path) -> dict:
    """Attach the git info of the file's last commit to the metadata before upload.

    When no git info can be determined the metadata dict is returned untouched.
    """
    commit_info = _get_git_info_for_file(original_metadata_file_path)
    if not commit_info:
        return metadata_dict

    # The target is the nested / optional field at metadata.data.generated.git
    serialized_commit_info = to_json_sanitized_dict(commit_info, exclude_none=True)
    return set_(metadata_dict, "data.generated.git", serialized_commit_info)


def _write_metadata_to_tmp_file(metadata_dict: dict) -> Path:
    """Dump the metadata as YAML into a fresh temporary ``.yaml`` file and return its path.

    The file is created with ``delete=False`` so it stays on disk after it is
    closed and the returned path remains usable by the caller.
    """
    tmp_handle = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False)
    with tmp_handle:
        yaml.dump(metadata_dict, tmp_handle)
    return Path(tmp_handle.name)


def _safe_load_metadata_file(metadata_file_path: Path) -> dict:
    """Read a metadata YAML file and return its top-level mapping.

    Args:
        metadata_file_path: Path of the YAML file to read.

    Returns:
        The parsed document as a dict.

    Raises:
        ValueError: If the file cannot be read or parsed (the underlying error
            is chained as ``__cause__``), or if the parsed document is not a
            mapping, which includes an empty document.
    """
    try:
        raw_text = metadata_file_path.read_text()
        metadata = yaml.safe_load(raw_text)
    except Exception as e:
        # Deliberately broad: every read/parse failure is reported to callers
        # as a ValueError. Chaining keeps the root cause in the traceback.
        raise ValueError(f"Validation error: Metadata file {metadata_file_path} is invalid yaml: {e}") from e

    # Checked outside the try block so this error is not caught above and
    # wrapped a second time. isinstance() already rejects None.
    if not isinstance(metadata, dict):
        raise ValueError(f"Validation error: Metadata file {metadata_file_path} is invalid yaml.")
    return metadata


def _apply_modifications_to_metadata_file(original_metadata_file_path: Path, validator_opts: ValidatorOptions) -> Path:
    """Produce the modified copy of a metadata file that gets uploaded to GCS.

    The file is loaded, prerelease overrides are applied, git author info is
    attached, and the result is written to a temporary file whose path is returned.
    """
    loaded = _safe_load_metadata_file(original_metadata_file_path)
    with_overrides = _apply_prerelease_overrides(loaded, validator_opts)
    with_git_info = _apply_author_info_to_metadata_file(with_overrides, original_metadata_file_path)

    tmp_metadata_path = _write_metadata_to_tmp_file(with_git_info)
    return tmp_metadata_path


def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator_opts: ValidatorOptions) -> MetadataUploadInfo:
Expand All @@ -209,11 +282,10 @@ def upload_metadata_to_gcs(bucket_name: str, metadata_file_path: Path, validator
Returns:
Tuple[bool, str]: Whether the metadata file was uploaded and its blob id.
"""
if validator_opts.prerelease_tag:
metadata_file_path = create_prerelease_metadata_file(metadata_file_path, validator_opts)

metadata, error = validate_and_load(metadata_file_path, POST_UPLOAD_VALIDATORS, validator_opts)
metadata_file_path = _apply_modifications_to_metadata_file(metadata_file_path, validator_opts)

metadata, error = validate_and_load(metadata_file_path, POST_UPLOAD_VALIDATORS, validator_opts)
if metadata is None:
raise ValueError(f"Metadata file {metadata_file_path} is invalid for uploading: {error}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from __future__ import annotations

from datetime import date
from datetime import date, datetime
from typing import Any, Dict, List, Optional
from uuid import UUID

Expand Down Expand Up @@ -104,6 +104,28 @@ class Config:
packageName: str = Field(..., description="The name of the package on PyPi.")


# Model for git details about the last commit that modified a metadata file.
# All four fields are optional and default to None.
# Per the field descriptions below, the values are written by CI and must not be set by hand.
# NOTE(review): this looks like datamodel-codegen output mirroring GitInfo.yaml;
# confirm before hand-editing, changes probably belong in the schema.
class GitInfo(BaseModel):
    class Config:
        # Reject any key that is not one of the fields declared below.
        extra = Extra.forbid

    commit_sha: Optional[str] = Field(
        None,
        description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_timestamp: Optional[datetime] = Field(
        None,
        description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author: Optional[str] = Field(
        None,
        description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author_email: Optional[str] = Field(
        None,
        description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )


class JobTypeResourceLimit(BaseModel):
class Config:
extra = Extra.forbid
Expand All @@ -123,6 +145,10 @@ class Config:
pypi: Optional[PyPi] = None


# Container model for the optional `generated` section of the connector metadata.
# It declares no Config of its own, unlike GitInfo which forbids extra keys.
class GeneratedFields(BaseModel):
    # Git details of the last commit that modified the metadata file; None when not provided.
    git: Optional[GitInfo] = None


class ActorDefinitionResourceRequirements(BaseModel):
class Config:
extra = Extra.forbid
Expand Down Expand Up @@ -232,7 +258,8 @@ class Config:
resourceRequirements: Optional[ActorDefinitionResourceRequirements] = None
ab_internal: Optional[AirbyteInternal] = None
remoteRegistries: Optional[RemoteRegistries] = None
supportsRefreshes: Optional[bool] = None
supportsRefreshes: Optional[bool] = False
generated: Optional[GeneratedFields] = None


class ConnectorMetadataDefinitionV0(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,4 @@ class Config:
allowedHosts: Optional[AllowedHosts] = None
releases: Optional[ConnectorReleases] = None
ab_internal: Optional[AirbyteInternal] = None
supportsRefreshes: Optional[bool] = False
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ class Config:
allowedHosts: Optional[AllowedHosts] = None
releases: Optional[ConnectorReleases] = None
ab_internal: Optional[AirbyteInternal] = None
supportsRefreshes: Optional[bool] = False


class ConnectorRegistryV0(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# generated by datamodel-codegen:
# filename: GeneratedFields.yaml

from __future__ import annotations

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Extra, Field


# Model for git details about the last commit that modified a metadata file.
# All four fields are optional and default to None.
# Per the field descriptions below, the values are written by CI and must not be set by hand.
# This module is generated by datamodel-codegen (see the file header), so
# lasting changes belong in the source schema rather than here.
class GitInfo(BaseModel):
    class Config:
        # Reject any key that is not one of the fields declared below.
        extra = Extra.forbid

    commit_sha: Optional[str] = Field(
        None,
        description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_timestamp: Optional[datetime] = Field(
        None,
        description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author: Optional[str] = Field(
        None,
        description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author_email: Optional[str] = Field(
        None,
        description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )


# Container model for the optional `generated` section of the connector metadata.
# It declares no Config of its own, unlike GitInfo which forbids extra keys.
# This module is generated by datamodel-codegen from GeneratedFields.yaml (see the file header).
class GeneratedFields(BaseModel):
    # Git details of the last commit that modified the metadata file; None when not provided.
    git: Optional[GitInfo] = None
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# generated by datamodel-codegen:
# filename: GitInfo.yaml

from __future__ import annotations

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Extra, Field


# Model for git details about the last commit that modified a metadata file.
# All four fields are optional and default to None.
# Per the field descriptions below, the values are written by CI and must not be set by hand.
# This module is generated by datamodel-codegen from GitInfo.yaml (see the file
# header), so lasting changes belong in the schema rather than here.
class GitInfo(BaseModel):
    class Config:
        # Reject any key that is not one of the fields declared below.
        extra = Extra.forbid

    commit_sha: Optional[str] = Field(
        None,
        description="The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_timestamp: Optional[datetime] = Field(
        None,
        description="The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author: Optional[str] = Field(
        None,
        description="The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
    commit_author_email: Optional[str] = Field(
        None,
        description="The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.",
    )
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from .ConnectorRegistrySourceDefinition import *
from .ConnectorRegistryV0 import *
from .ConnectorReleases import *
from .GeneratedFields import *
from .GitInfo import *
from .JobType import *
from .NormalizationDestinationDefinitionConfig import *
from .RegistryOverrides import *
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,5 @@ properties:
supportsRefreshes:
type: boolean
default: false
generated:
"$ref": GeneratedFields.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
"$schema": http://json-schema.org/draft-07/schema#
"$id": https://github.com/airbytehq/airbyte/airbyte-ci/connectors_ci/metadata_service/lib/models/src/GeneratedFields.yaml
title: GeneratedFields
description: Optional schema for fields generated at metadata upload time
type: object
properties:
git:
"$ref": GitInfo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
"$schema": http://json-schema.org/draft-07/schema#
"$id": https://github.com/airbytehq/airbyte/airbyte-ci/connectors/metadata_service/lib/metadata_service/models/src/GitInfo.yaml
title: GitInfo
description: Information about the author of the last commit that modified this file
type: object
additionalProperties: false
properties:
commit_sha:
type: string
description: The git commit sha of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
commit_timestamp:
type: string
format: date-time
description: The git commit timestamp of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
commit_author:
type: string
description: The git commit author of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
commit_author_email:
type: string
description: The git commit author email of the last commit that modified this file. DO NOT DEFINE THIS FIELD MANUALLY. It will be overwritten by the CI.
Loading

0 comments on commit c0492b0

Please sign in to comment.