New DB schema for GitHub metrics script (#23606)

### Details: Improvements and fixes for the script which sends GitHub Workflow metrics to a database. See also: [23484](#23484)
openvinotoolkit · Mar 21, 2024 · 5559ee7 · 5559ee7
1 parent 578d692
commit 5559ee7
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 32 deletions.
diff --git a/.github/scripts/collect_github_metrics.py b/.github/scripts/collect_github_metrics.py
@@ -6,37 +6,49 @@
 import logging
 import psycopg2
 import dateutil
+import argparse
 
 def init_logger():
     LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
     logging.basicConfig(level=LOGLEVEL,
                         format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                         datefmt='%m-%d-%Y %H:%M:%S')
 
+def make_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-r', '--repository-name', type=str, required=True,
+                        help='Repository name in OWNER/REPOSITORY format')
+    parser.add_argument('--run-id', type=str, required=True,
+                        help='Workflow Run ID')
+
+    return parser
+
 def create_db_tables(conn, cur):
-    cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_runs_test(
-    id SERIAL,
-    run_id BIGINT PRIMARY KEY,
+    cur.execute('''CREATE TABLE IF NOT EXISTS workflow_runs(
+    id SERIAL PRIMARY KEY,
+    run_id BIGINT,
     html_url TEXT,
     name VARCHAR(255),
     run_started_at TIMESTAMP,
+    created_at TIMESTAMP,
+    updated_at TIMESTAMP,
     triggering_actor_login VARCHAR(255),
     conclusion VARCHAR(25),
-    run_number INT,
     event VARCHAR(50),
     run_attempt INT,
     repository_full_name VARCHAR(255),
     head_repository_full_name VARCHAR(255),
     head_branch VARCHAR(255),
     status VARCHAR(25),
     display_title TEXT,
-    path TEXT
+    path TEXT,
+    total_duration_seconds INT
     );
     ''')
-    cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_jobs_test(
-    id SERIAL,
-    job_id BIGINT PRIMARY KEY,
-    parent_run_id BIGINT REFERENCES github_workflow_runs_test(run_id),
+    cur.execute('''CREATE TABLE IF NOT EXISTS workflow_jobs(
+    id SERIAL PRIMARY KEY,
+    job_id BIGINT,
+    parent_run_id BIGINT,
     html_url TEXT,
     name VARCHAR(255),
     created_at TIMESTAMP,
@@ -47,12 +59,14 @@ def create_db_tables(conn, cur):
     runner_name VARCHAR(255),
     status VARCHAR(25),
     conclusion VARCHAR(25),
-    head_branch VARCHAR(255)
+    head_branch VARCHAR(255),
+    run_attempt INT,
+    workflow_name TEXT
     );
     ''')
-    cur.execute('''CREATE TABLE IF NOT EXISTS github_workflow_steps_test(
+    cur.execute('''CREATE TABLE IF NOT EXISTS workflow_steps(
     id SERIAL PRIMARY KEY,
-    parent_job_id BIGINT REFERENCES github_workflow_jobs_test(job_id),
+    parent_job_id BIGINT,
     name VARCHAR(255),
     conclusion VARCHAR(25),
     number INT,
@@ -65,20 +79,16 @@ def create_db_tables(conn, cur):
 
 def main():
     init_logger()
-
+    parser = make_parser()
+    args = parser.parse_args()
     logger = logging.getLogger(__name__)
 
     github_token = os.environ.get('GITHUB_TOKEN')
     if not github_token:
         raise ValueError('GITHUB_TOKEN environment variable is not set!')
 
-    run_id = os.environ.get('RUN_ID')
-    if not run_id:
-        raise ValueError('RUN_ID environment variable is not set!')
-
-    repo_name = os.environ.get('GITHUB_REPOSITORY')
-    if not repo_name:
-        raise ValueError('GITHUB_REPOSITORY environment variable is not set!')
+    run_id = args.run_id
+    repo_name = args.repository_name
 
 
     # this should be specified in runner's env
@@ -102,18 +112,30 @@ def main():
     repo = g.get_repo(repo_name)
 
     run = repo.get_workflow_run(int(run_id))
-
-    workflow_data_query = f'''INSERT INTO github_workflow_runs_test(
+    if run.status != 'completed':
+        logger.error('Run %s is not completed! Only completed runs should be in the database', run_id)
+        raise SystemExit(1)
+
+    # We rely on the following assumptions:
+    # - The workflow run is completed. When run.status != 'completed' we should not add it to the database
+    #   theoretically the second attempt can be triggerred right after the completion of the first one
+    #   or while the runner which executes this script is deploying
+    #
+    # - Job's queued duration equals "job.started_at - job.created_at" if started_at > created_at.
+    #   Otherwise the job should not be added to the database
+    total_duration_seconds = round(run.timing().run_duration_ms / 1000)
+    workflow_data_query = f'''INSERT INTO workflow_runs(
     run_id, html_url, name,
-    run_started_at, triggering_actor_login, conclusion,
-    run_number, event, run_attempt, repository_full_name,
-    head_branch, display_title, path)
+    run_started_at, created_at, updated_at, triggering_actor_login, conclusion,
+    event, run_attempt, repository_full_name,
+    head_branch, display_title, path, total_duration_seconds)
     VALUES(
     '{run_id}', '{run.html_url}', '{run.name}', '{run.run_started_at}',
+    '{run.created_at}', '{run.updated_at}',
     '{run.raw_data['triggering_actor']['login']}',
-    '{run.conclusion}', '{run.run_number}', '{run.event}',
+    '{run.conclusion}', '{run.event}',
     '{run.run_attempt}', '{run.raw_data['repository']['full_name']}',
-    '{run.head_branch}', '{run.display_title}', '{run.path}'
+    '{run.head_branch}', '{run.display_title}', '{run.path}', '{total_duration_seconds}'
     );
     '''
 
@@ -126,6 +148,10 @@ def main():
         duration_seconds = 0
 
         job_created_at_date = dateutil.parser.parse(job.raw_data['created_at'])
+        if job_created_at_date > job.started_at:
+            logger.warning('Skipping job %s of run %s - most likely a stub \
+            job created after workflow restart', job.name, run_id)
+            continue
 
         queued_duration_timedelta = job.started_at - job_created_at_date
         queued_duration_seconds = round(queued_duration_timedelta.total_seconds())
@@ -134,17 +160,19 @@ def main():
         duration_seconds = round(duration_timedelta.total_seconds())
 
         job_data_query = f'''
-        INSERT INTO github_workflow_jobs_test(
+        INSERT INTO workflow_jobs(
         job_id, parent_run_id, html_url, name,
         created_at, started_at, completed_at,
         queued_duration_seconds, duration_seconds,
-        runner_name, status, conclusion, head_branch)
+        runner_name, status, conclusion, head_branch,
+        run_attempt, workflow_name
+        )
         VALUES(
         '{job_id}', '{run_id}', '{job.html_url}', '{job.name}',
         '{job.raw_data['created_at']}', '{job.started_at}', '{job.completed_at}',
         '{queued_duration_seconds}', '{duration_seconds}',
         '{job.raw_data['runner_name']}', '{job.status}', '{job.conclusion}',
-        '{job.raw_data['head_branch']}'
+        '{job.raw_data['head_branch']}', '{job.raw_data['run_attempt']}', '{job.raw_data['workflow_name']}'
         );
         '''
         logger.debug('Job query: %s', job_data_query)
@@ -154,7 +182,7 @@ def main():
             duration_seconds = round(duration_seconds_timedelta.total_seconds())
 
             step_data_query = f'''
-            INSERT INTO github_workflow_steps_test(
+            INSERT INTO workflow_steps(
             parent_job_id, name, conclusion,
             number, started_at, completed_at,
             duration_seconds)

diff --git a/.github/workflows/send_workflows_to_opentelemetry.yml b/.github/workflows/send_workflows_to_opentelemetry.yml
@@ -49,6 +49,8 @@ jobs:
       - name: Install deps
         run: |
           pip3 install -r .github/scripts/requirements.txt
+          # dependency review action has these as an exception
+          # yet it still complains, so install them here
           pip3 install PyGithub==2.2.0 psycopg2-binary==2.9.9
 
       - name: Send metrics to SQL database
@@ -58,6 +60,9 @@ jobs:
           PGHOST: ${{ secrets.METRICS_DATABASE_HOST }}
           PGUSER: ${{ secrets.METRICS_DATABASE_USERNAME }}
           PGPASSWORD: ${{ secrets.METRICS_DATABASE_PASSWORD }}
+          PGDATABASE: ${{ secrets.METRICS_DATABASE_NAME }}
           PGPORT: 5432
         run: |
-          python3 .github/scripts/collect_github_metrics.py
+          python3 .github/scripts/collect_github_metrics.py \
+          --run-id ${{ github.event.workflow_run.id }} \
+          --repository-name ${GITHUB_REPOSITORY}