Skip to content

Commit

Permalink
New DB schema for GitHub metrics script (#23606)
Browse files Browse the repository at this point in the history
### Details:
Improvements and fixes for the script which sends GitHub Workflow
metrics to a database. See also:
[23484](#23484)
  • Loading branch information
ababushk authored Mar 21, 2024
1 parent 578d692 commit 5559ee7
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 32 deletions.
90 changes: 59 additions & 31 deletions .github/scripts/collect_github_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,49 @@
import logging
import psycopg2
import dateutil
import argparse

def init_logger():
    """Configure the root logger for the script.

    The level name is read from the ``LOGLEVEL`` environment variable
    (upper-cased; ``INFO`` when the variable is unset) and handed to
    ``logging.basicConfig`` together with the message and date formats.
    """
    level_name = os.getenv('LOGLEVEL', 'INFO').upper()
    logging.basicConfig(
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m-%d-%Y %H:%M:%S',
        level=level_name,
    )

def make_parser():
    """Build the command-line parser for the metrics script.

    Both options are mandatory strings:

    * ``-r`` / ``--repository-name`` -- repository in OWNER/REPOSITORY format
    * ``--run-id`` -- workflow run ID

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser()

    # (flags, help text) for each option, in the order they are registered.
    option_specs = (
        (('-r', '--repository-name'), 'Repository name in OWNER/REPOSITORY format'),
        (('--run-id',), 'Workflow Run ID'),
    )
    for flags, help_text in option_specs:
        cli.add_argument(*flags, type=str, required=True, help=help_text)

    return cli

def create_db_tables(conn, cur):
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_runs_test(
id SERIAL,
run_id BIGINT PRIMARY KEY,
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_runs(
id SERIAL PRIMARY KEY,
run_id BIGINT,
html_url TEXT,
name VARCHAR(255),
run_started_at TIMESTAMP,
created_at TIMESTAMP,
updated_at TIMESTAMP,
triggering_actor_login VARCHAR(255),
conclusion VARCHAR(25),
run_number INT,
event VARCHAR(50),
run_attempt INT,
repository_full_name VARCHAR(255),
head_repository_full_name VARCHAR(255),
head_branch VARCHAR(255),
status VARCHAR(25),
display_title TEXT,
path TEXT
path TEXT,
total_duration_seconds INT
);
''')
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_jobs_test(
id SERIAL,
job_id BIGINT PRIMARY KEY,
parent_run_id BIGINT REFERENCES github_workflow_runs_test(run_id),
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_jobs(
id SERIAL PRIMARY KEY,
job_id BIGINT,
parent_run_id BIGINT,
html_url TEXT,
name VARCHAR(255),
created_at TIMESTAMP,
Expand All @@ -47,12 +59,14 @@ def create_db_tables(conn, cur):
runner_name VARCHAR(255),
status VARCHAR(25),
conclusion VARCHAR(25),
head_branch VARCHAR(255)
head_branch VARCHAR(255),
run_attempt INT,
workflow_name TEXT
);
''')
cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_steps_test(
cur.execute('''CREATE TABLE IF NOT EXISTS workflow_steps(
id SERIAL PRIMARY KEY,
parent_job_id BIGINT REFERENCES github_workflow_jobs_test(job_id),
parent_job_id BIGINT,
name VARCHAR(255),
conclusion VARCHAR(25),
number INT,
Expand All @@ -65,20 +79,16 @@ def create_db_tables(conn, cur):

def main():
init_logger()

parser = make_parser()
args = parser.parse_args()
logger = logging.getLogger(__name__)

github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
raise ValueError('GITHUB_TOKEN environment variable is not set!')

run_id = os.environ.get('RUN_ID')
if not run_id:
raise ValueError('RUN_ID environment variable is not set!')

repo_name = os.environ.get('GITHUB_REPOSITORY')
if not repo_name:
raise ValueError('GITHUB_REPOSITORY environment variable is not set!')
run_id = args.run_id
repo_name = args.repository_name


# this should be specified in runner's env
Expand All @@ -102,18 +112,30 @@ def main():
repo = g.get_repo(repo_name)

run = repo.get_workflow_run(int(run_id))

workflow_data_query = f'''INSERT INTO github_workflow_runs_test(
if run.status != 'completed':
logger.error('Run %s is not completed! Only completed runs should be in the database', run_id)
raise SystemExit(1)

# We rely on the following assumptions:
# - The workflow run is completed. When run.status != 'completed' we should not add it to the database
# theoretically the second attempt can be triggered right after the completion of the first one
# or while the runner which executes this script is deploying
#
# - Job's queued duration equals "job.started_at - job.created_at" if started_at > created_at.
# Otherwise the job should not be added to the database
total_duration_seconds = round(run.timing().run_duration_ms / 1000)
workflow_data_query = f'''INSERT INTO workflow_runs(
run_id, html_url, name,
run_started_at, triggering_actor_login, conclusion,
run_number, event, run_attempt, repository_full_name,
head_branch, display_title, path)
run_started_at, created_at, updated_at, triggering_actor_login, conclusion,
event, run_attempt, repository_full_name,
head_branch, display_title, path, total_duration_seconds)
VALUES(
'{run_id}', '{run.html_url}', '{run.name}', '{run.run_started_at}',
'{run.created_at}', '{run.updated_at}',
'{run.raw_data['triggering_actor']['login']}',
'{run.conclusion}', '{run.run_number}', '{run.event}',
'{run.conclusion}', '{run.event}',
'{run.run_attempt}', '{run.raw_data['repository']['full_name']}',
'{run.head_branch}', '{run.display_title}', '{run.path}'
'{run.head_branch}', '{run.display_title}', '{run.path}', '{total_duration_seconds}'
);
'''

Expand All @@ -126,6 +148,10 @@ def main():
duration_seconds = 0

job_created_at_date = dateutil.parser.parse(job.raw_data['created_at'])
if job_created_at_date > job.started_at:
logger.warning('Skipping job %s of run %s - most likely a stub \
job created after workflow restart', job.name, run_id)
continue

queued_duration_timedelta = job.started_at - job_created_at_date
queued_duration_seconds = round(queued_duration_timedelta.total_seconds())
Expand All @@ -134,17 +160,19 @@ def main():
duration_seconds = round(duration_timedelta.total_seconds())

job_data_query = f'''
INSERT INTO github_workflow_jobs_test(
INSERT INTO workflow_jobs(
job_id, parent_run_id, html_url, name,
created_at, started_at, completed_at,
queued_duration_seconds, duration_seconds,
runner_name, status, conclusion, head_branch)
runner_name, status, conclusion, head_branch,
run_attempt, workflow_name
)
VALUES(
'{job_id}', '{run_id}', '{job.html_url}', '{job.name}',
'{job.raw_data['created_at']}', '{job.started_at}', '{job.completed_at}',
'{queued_duration_seconds}', '{duration_seconds}',
'{job.raw_data['runner_name']}', '{job.status}', '{job.conclusion}',
'{job.raw_data['head_branch']}'
'{job.raw_data['head_branch']}', '{job.raw_data['run_attempt']}', '{job.raw_data['workflow_name']}'
);
'''
logger.debug('Job query: %s', job_data_query)
Expand All @@ -154,7 +182,7 @@ def main():
duration_seconds = round(duration_seconds_timedelta.total_seconds())

step_data_query = f'''
INSERT INTO github_workflow_steps_test(
INSERT INTO workflow_steps(
parent_job_id, name, conclusion,
number, started_at, completed_at,
duration_seconds)
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/send_workflows_to_opentelemetry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ jobs:
- name: Install deps
run: |
pip3 install -r .github/scripts/requirements.txt
# dependency review action has these as an exception
# yet it still complains, so install them here
pip3 install PyGithub==2.2.0 psycopg2-binary==2.9.9
- name: Send metrics to SQL database
Expand All @@ -58,6 +60,9 @@ jobs:
PGHOST: ${{ secrets.METRICS_DATABASE_HOST }}
PGUSER: ${{ secrets.METRICS_DATABASE_USERNAME }}
PGPASSWORD: ${{ secrets.METRICS_DATABASE_PASSWORD }}
PGDATABASE: ${{ secrets.METRICS_DATABASE_NAME }}
PGPORT: 5432
run: |
python3 .github/scripts/collect_github_metrics.py
python3 .github/scripts/collect_github_metrics.py \
--run-id ${{ github.event.workflow_run.id }} \
--repository-name ${GITHUB_REPOSITORY}

0 comments on commit 5559ee7

Please sign in to comment.