diff --git a/.docker-setup.sh b/.docker-setup.sh index d9c9e7f8ce..dfdc797a38 100755 --- a/.docker-setup.sh +++ b/.docker-setup.sh @@ -24,7 +24,6 @@ missingModules="" #Check everything that needs to be in the $PATH is in there. #Bash doesn't let this work if this is in an if statement for some reason it has to be chained type -P "docker" &>/dev/null && echo "docker found..." || missingModules="${missingModules} docker" -type -P "docker-compose" &>/dev/null && echo "docker-compose found..." || missingModules="${missingModules} docker-compose" type -P "ifconfig" &>/dev/null && echo "ifconfig found..." || missingModules="${missingModules} ifconfig (part of net-tools)" type -P "psql" &>/dev/null && echo "psql found..." || missingModules="${missingModules} psql" type -P "watch" &>/dev/null && echo "watch found..." || missingModules="${missingModules} watch" diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000000..c23bfd7bb3 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,31 @@ +name: "run-linting-checks" +on: + pull_request: + branches: [main, dev] + +jobs: + run-pylint: + name: runner / pylint + permissions: write-all + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: dciborow/action-pylint@0.1.0 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + reporter: github-pr-review + level: warning + glob_pattern: "**/*.py" + filter_mode: "file" + + misspell: + name: runner / misspell + runs-on: ubuntu-latest + steps: + - name: Highlight any misspellings in changes. + uses: actions/checkout@v4 + - name: misspell + uses: reviewdog/action-misspell@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + locale: "US" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2feaa98037..46e492cd47 100644 --- a/.gitignore +++ b/.gitignore @@ -4,13 +4,12 @@ env.txt docker_env.txt pyenv.txt augur_export_env.sh -.DS_Store +*DS_Store *.config.json !docker.config.json config.yml reports.yml - node_modules/ .idea/ logs/ diff --git a/.pylintrc b/.pylintrc index 0b1b7d2049..0056af873b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,7 +12,7 @@ #refactoring checker #enable=R -disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311 +disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401 # Analyse import fallback blocks. This can be used to support both Python 2 and diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000000..3b0e387327 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,37 @@ +#SPDX-License-Identifier: MIT +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 +build: + os: ubuntu-22.04 # <- add this line + tools: + python: "3.10" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/source/conf.py + +# Build documentation with MkDocs +#mkdocs: +# configuration: mkdocs.yml + +# Optionally build your docs in additional formats such as PDF and ePub +formats: all + +# Optionally set the version of Python and requirements required to build your docs +python: + install: + - method: pip + path: . + extra_requirements: + - dev + - method: setuptools + path: . 
+ +# build: +# os: ubuntu-22.04 +# tools: +# python:3.10 diff --git a/Makefile b/Makefile index f67aac4676..d9f199a867 100644 --- a/Makefile +++ b/Makefile @@ -143,16 +143,16 @@ docs-view: docs compose-run: - @ docker-compose -f docker-compose.yml up --build + @ docker compose -f docker-compose.yml up --build compose-run-database: @ echo "**************************************************************************" @ echo "Make sure there are no database credentials in docker_env.txt!" @ echo "**************************************************************************" @ echo - @ docker-compose -f docker-compose.yml -f database-compose.yml up --build + @ docker compose -f docker-compose.yml -f database-compose.yml up --build -docker-build: docker-build-backend docker-build-frontend docker-build-database +docker-build: docker-build-backend docker-build-frontend docker-build-database docker-build-rabbitmq docker-build-backend: @ docker build -t augurlabs/augur:backend -f util/docker/backend/Dockerfile . @@ -163,6 +163,8 @@ docker-build-frontend: docker-build-database: @ docker build -t augurlabs/augur:database -f util/docker/database/Dockerfile . +docker-build-rabbitmq: + @ docker build -t augurlabs/augur:rabbitmq -f util/docker/rabbitmq/Dockerfile . docker-run-backend: @ - docker stop augur_backend @@ -178,3 +180,8 @@ docker-run-database: @ - docker stop augur_database @ - docker rm augur_database docker run -p 5434:5432 --name augur_database augurlabs/augur:database + +docker-run-rabbitmq: + @ - docker stop augur_rabbitmq + @ - docker rm augur_rabbitmq + docker run -p 5434:5432 --name augur_rabbitmq augurlabs/augur:rabbitmq \ No newline at end of file diff --git a/README.md b/README.md index 811157100a..13fbe0dca3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -# Augur NEW Release v0.51.1 +# Augur NEW Release v0.62.4 + +Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! +The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io [![first-timers-only](https://img.shields.io/badge/first--timers--only-friendly-blue.svg?style=flat-square)](https://www.firsttimersonly.com/) We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy of tagging issues for first timers only, and walking one newcomer through the resolution process weekly. [You can find these issues tagged with "first timers only" on our issues list.](https://github.com/chaoss/augur/labels/first-timers-only). @@ -7,8 +10,7 @@ ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) - -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.51.1 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.62.4 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. 
- A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard @@ -95,6 +97,7 @@ Contributors - `Dawn Foster `_ - `Ivana Atanasova `_ - `Georg J.P. Link `_ +- `Gary P White `_ GSoC 2022 participants ----------------------- diff --git a/add.md b/add.md new file mode 100644 index 0000000000..eaf2a3fac6 --- /dev/null +++ b/add.md @@ -0,0 +1 @@ +dfadffd diff --git a/augur/api/metrics/README.md b/augur/api/metrics/README.md index cabcc4475a..5990291bf1 100644 --- a/augur/api/metrics/README.md +++ b/augur/api/metrics/README.md @@ -26,7 +26,8 @@ from augur.application.db.engine import engine 4. Define any queries with the structure show below ```py repo_sql = s.sql.text(""" SELECT repo.repo_name FROM repo WHERE repo.repo_id = :repo_id """) -results = pd.read_sql(repo_sql, engine, params={'repo_id': repo_id}) +with engine.connect() as conn: + results = pd.read_sql(repo_sql, conn, params={'repo_id': repo_id}) ``` 5. Return either a pandas dataframe, dict, or json. - Note: If you return a pandas dataframe or dict it will be automatically converted into json diff --git a/augur/api/metrics/commit.py b/augur/api/metrics/commit.py index c143cd9f6e..41d86abbff 100644 --- a/augur/api/metrics/commit.py +++ b/augur/api/metrics/commit.py @@ -90,8 +90,9 @@ def committers(repo_group_id, repo_id=None, begin_date=None, end_date=None, peri """ ) - results = pd.read_sql(committersSQL, engine, params={'repo_id': repo_id, - 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date, 'period':period}) + with engine.connect() as conn: + results = pd.read_sql(committersSQL, conn, params={'repo_id': repo_id, + 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date, 'period':period}) return results @@ -167,8 +168,9 @@ def annual_commit_count_ranked_by_new_repo_in_repo_group(repo_group_id, repo_id= ORDER BY YEAR ASC """.format(table, period)) - results = pd.read_sql(cdRgNewrepRankedCommitsSQL, engine, params={'repo_id': repo_id, - 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(cdRgNewrepRankedCommitsSQL, conn, params={'repo_id': repo_id, + 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -265,8 +267,9 @@ def annual_commit_count_ranked_by_repo_in_repo_group(repo_group_id, repo_id=None LIMIT 10 """) - results = pd.read_sql(cdRgTpRankedCommitsSQL, engine, params={ "repo_group_id": repo_group_id, - "repo_id": repo_id}) + with engine.connect() as conn: + results = pd.read_sql(cdRgTpRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, + "repo_id": repo_id}) return results @register_metric() @@ -296,8 +299,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY patches DESC) a """) - results = pd.read_sql(total_commits_SQL, engine, - params={'year': year, 'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(total_commits_SQL, conn, + params={'year': year, 'repo_group_id': repo_group_id}) else: total_commits_SQL = s.sql.text(""" SELECT SUM(patches)::int @@ -308,8 +312,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY patches DESC) a """) - results = pd.read_sql(total_commits_SQL, engine, - params={'year': year, 'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(total_commits_SQL, conn, + params={'year': year, 
'repo_id': repo_id}) if not results.iloc[0]['sum']: return pd.DataFrame() @@ -334,8 +339,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY commits DESC """) - results = pd.read_sql(committers_SQL, engine, - params={'year': year, 'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(committers_SQL, conn, + params={'year': year, 'repo_group_id': repo_group_id}) else: committers_SQL = s.sql.text(""" SELECT @@ -353,8 +359,9 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY commits DESC """) - results = pd.read_sql(committers_SQL, engine, - params={'year': year, 'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(committers_SQL, conn, + params={'year': year, 'repo_id': repo_id}) cumsum = 0 for i, row in results.iterrows(): diff --git a/augur/api/metrics/contributor.py b/augur/api/metrics/contributor.py index 7d255ecb46..3f25236d0f 100644 --- a/augur/api/metrics/contributor.py +++ b/augur/api/metrics/contributor.py @@ -125,8 +125,9 @@ def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end ORDER BY total DESC """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: contributorsSQL = s.sql.text(""" SELECT id::text AS user_id, @@ -211,8 +212,9 @@ def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end ORDER BY total DESC """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -281,8 +283,9 @@ def contributors_new(repo_group_id, repo_id=None, period='day', begin_date=None, GROUP BY date, repo.repo_id, repo_name """) - results = pd.read_sql(contributorsNewSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsNewSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: contributorsNewSQL = s.sql.text(""" SELECT date_trunc(:period, b.created_at::DATE) AS date, COUNT(id) AS new_contributors, repo.repo_id, repo_name @@ -330,8 +333,9 @@ def contributors_new(repo_group_id, repo_id=None, period='day', begin_date=None, GROUP BY date, repo.repo_id, repo_name """) - results = pd.read_sql(contributorsNewSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsNewSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -351,7 +355,8 @@ def lines_changed_by_author(repo_group_id, repo_id=None): GROUP BY commits.repo_id, date_trunc('week', cmt_author_date::date), cmt_author_affiliation, cmt_author_email, repo_name ORDER BY date_trunc('week', cmt_author_date::date) ASC; """) - results = pd.read_sql(linesChangedByAuthorSQL, 
engine, params={"repo_id": repo_id}) + with engine.connect() as conn: + results = pd.read_sql(linesChangedByAuthorSQL, conn, params={"repo_id": repo_id}) return results else: linesChangedByAuthorSQL = s.sql.text(""" @@ -362,7 +367,8 @@ def lines_changed_by_author(repo_group_id, repo_id=None): GROUP BY repo_id, date_trunc('week', cmt_author_date::date), cmt_author_affiliation, cmt_author_email ORDER BY date_trunc('week', cmt_author_date::date) ASC; """) - results = pd.read_sql(linesChangedByAuthorSQL, engine, params={"repo_group_id": repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(linesChangedByAuthorSQL, conn, params={"repo_group_id": repo_group_id}) return results @register_metric() @@ -420,8 +426,9 @@ def contributors_code_development(repo_group_id, repo_id=None, period='all', beg GROUP BY a.email, a.repo_id, repo_name """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: contributorsSQL = s.sql.text(""" SELECT @@ -455,6 +462,7 @@ def contributors_code_development(repo_group_id, repo_id=None, period='all', beg ORDER BY commits desc, email """) - results = pd.read_sql(contributorsSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(contributorsSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/deps.py b/augur/api/metrics/deps.py index deb5ac89fd..d92371d896 100644 --- a/augur/api/metrics/deps.py +++ b/augur/api/metrics/deps.py @@ -6,6 +6,7 @@ import sqlalchemy as s import pandas as pd from augur.api.util import register_metric +import datetime from ..server import engine @@ -45,7 +46,8 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No AND repo_dependencies.repo_id = :repo_id """) - results = pd.read_sql(depsSQL, engine) + with engine.connect() as conn: + results = pd.read_sql(depsSQL, conn) else: @@ -69,7 +71,8 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No AND repo.repo_group_id = :repo_group_id """) - results = pd.read_sql(depsSQL, engine) + with engine.connect() as conn: + results = pd.read_sql(depsSQL, conn) return results diff --git a/augur/api/metrics/insight.py b/augur/api/metrics/insight.py index 874f656f75..848161e1a8 100644 --- a/augur/api/metrics/insight.py +++ b/augur/api/metrics/insight.py @@ -29,5 +29,6 @@ def top_insights(repo_group_id, num_repos=6): LIMIT :num_repos ) """) - results = pd.read_sql(topInsightsSQL, engine, params={'repo_group_id': repo_group_id, 'num_repos': num_repos}) + with engine.connect() as conn: + results = pd.read_sql(topInsightsSQL, conn, params={'repo_group_id': repo_group_id, 'num_repos': num_repos}) return results diff --git a/augur/api/metrics/issue.py b/augur/api/metrics/issue.py index 72108bc20b..22ee2630b5 100644 --- a/augur/api/metrics/issue.py +++ b/augur/api/metrics/issue.py @@ -50,8 +50,10 @@ def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_da GROUP BY issue_date, repo_name ORDER BY issue_date """) - results = pd.read_sql(issueNewContributor, engine, params={'repo_id': repo_id, 'period': period, - 
'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + results = pd.read_sql(issueNewContributor, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: issueNewContributor = s.sql.text(""" SELECT @@ -76,9 +78,10 @@ def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_da GROUP BY repo.repo_id, issue_date ORDER BY issue_date """) - results = pd.read_sql(issueNewContributor, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issueNewContributor, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -119,8 +122,9 @@ def issues_first_time_closed(repo_group_id, repo_id=None, period='day', begin_da ) AS iss_close GROUP BY issue_date, repo_name """) - results = pd.read_sql(issuesClosedSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issuesClosedSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: issuesClosedSQL = s.sql.text(""" SELECT date_trunc(:period, new_date::DATE) AS issue_date, @@ -141,8 +145,10 @@ def issues_first_time_closed(repo_group_id, repo_id=None, period='day', begin_da ) AS iss_close GROUP BY repo_id, repo_name,issue_date """) - results = pd.read_sql(issuesClosedSQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + results = pd.read_sql(issuesClosedSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -179,8 +185,9 @@ def issues_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_d ORDER BY issues.repo_id, date """) - results = pd.read_sql(issues_new_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_new_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -198,8 +205,9 @@ def issues_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_d ORDER BY date; """) - results = pd.read_sql(issues_new_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_new_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -235,8 +243,9 @@ def issues_active(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY issues.repo_id, date """) - results = pd.read_sql(issues_active_SQL, engine, params={'repo_group_id': repo_group_id, 'period':period, - 'begin_date': begin_date, 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_active_SQL, conn, params={'repo_group_id': repo_group_id, 'period':period, + 'begin_date': begin_date, 'end_date':end_date}) else: issues_active_SQL = s.sql.text(""" @@ -254,8 +263,9 @@ def issues_active(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY date """) - 
results = pd.read_sql(issues_active_SQL, engine, params={'repo_id': repo_id, 'period':period, - 'begin_date': begin_date, 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_active_SQL, conn, params={'repo_id': repo_id, 'period':period, + 'begin_date': begin_date, 'end_date':end_date}) return results @register_metric() @@ -290,8 +300,9 @@ def issues_closed(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY issues.repo_id, date """) - results = pd.read_sql(issues_closed_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_closed_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: issues_closed_SQL = s.sql.text(""" @@ -308,8 +319,9 @@ def issues_closed(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY date; """) - results = pd.read_sql(issues_closed_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issues_closed_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -347,9 +359,10 @@ def issue_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None): ORDER BY repo_id, issue_id """) - results = pd.read_sql(issue_duration_SQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issue_duration_SQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results @@ -371,9 +384,10 @@ def issue_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None): ORDER BY issue_id; """) - results = pd.read_sql(issue_duration_SQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issue_duration_SQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results @@ -417,9 +431,10 @@ def issue_participants(repo_group_id, repo_id=None, begin_date=None, end_date=No ORDER BY issues.repo_id, issues.created_at """) - result = pd.read_sql(issue_participants_SQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + result = pd.read_sql(issue_participants_SQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date}) return result else: issue_participants_SQL = s.sql.text(""" @@ -445,9 +460,10 @@ def issue_participants(repo_group_id, repo_id=None, begin_date=None, end_date=No ORDER BY issues.created_at """) - result = pd.read_sql(issue_participants_SQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + result = pd.read_sql(issue_participants_SQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date}) return result @register_metric() @@ -468,7 +484,9 @@ def issue_backlog(repo_group_id, repo_id=None): GROUP BY issues.repo_id, repo_name ORDER BY issues.repo_id """) - result = pd.read_sql(issue_backlog_SQL, engine, 
params={'repo_group_id': repo_group_id}) + + with engine.connect() as conn: + result = pd.read_sql(issue_backlog_SQL, conn, params={'repo_group_id': repo_group_id}) return result else: @@ -481,7 +499,8 @@ def issue_backlog(repo_group_id, repo_id=None): GROUP BY repo_name """) - result = pd.read_sql(issue_backlog_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + result = pd.read_sql(issue_backlog_SQL, conn, params={'repo_id': repo_id}) return result @register_metric() @@ -509,7 +528,8 @@ def issue_throughput(repo_group_id, repo_id=None): AND table1.repo_id = repo.repo_id """) - results = pd.read_sql(issue_throughput_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(issue_throughput_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -525,7 +545,8 @@ def issue_throughput(repo_group_id, repo_id=None): WHERE table1.repo_id = repo.repo_id """) - result = pd.read_sql(issue_throughput_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + result = pd.read_sql(issue_throughput_SQL, conn, params={'repo_id': repo_id}) return result @register_metric() @@ -574,9 +595,10 @@ def issues_open_age(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY open_date DESC """) - results = pd.read_sql(openAgeSQL, engine, - params={'repo_id': repo_id, 'repo_group_id': repo_group_id, - 'period': period, 'begin_date':begin_date, 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(openAgeSQL, conn, + params={'repo_id': repo_id, 'repo_group_id': repo_group_id, + 'period': period, 'begin_date':begin_date, 'end_date':end_date}) return results @@ -634,11 +656,12 @@ def issues_closed_resolution_duration(repo_group_id, repo_id=None, period='day', ORDER BY gh_issue_number """) - results = pd.read_sql(issueSQL, engine, - params={'repo_id': repo_id, - 'repo_group_id': repo_group_id, - 'period': period, 'begin_date':begin_date, - 'end_date':end_date}) + with engine.connect() as conn: + results = pd.read_sql(issueSQL, conn, + params={'repo_id': repo_id, + 'repo_group_id': repo_group_id, + 'period': period, 'begin_date':begin_date, + 'end_date':end_date}) return results @@ -667,8 +690,9 @@ def average_issue_resolution_time(repo_group_id, repo_id=None): """) - results = pd.read_sql(avg_issue_resolution_SQL, engine, - params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(avg_issue_resolution_SQL, conn, + params={'repo_group_id': repo_group_id}) return results else: @@ -683,8 +707,9 @@ def average_issue_resolution_time(repo_group_id, repo_id=None): GROUP BY repo.repo_name """) - results = pd.read_sql(avg_issue_resolution_SQL, engine, - params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(avg_issue_resolution_SQL, conn, + params={'repo_id': repo_id}) return results @register_metric() @@ -757,7 +782,8 @@ def issues_maintainer_response_duration(repo_group_id, repo_id=None, begin_date= group by repo_id, repo_name """) - results = pd.read_sql(issuesSQL, engine, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(issuesSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) return results @@ -780,7 +806,8 @@ def open_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo_groups.rg_name ORDER BY date """) - 
results = pd.read_sql(openIssueCountSQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(openIssueCountSQL, conn, params={'repo_group_id': repo_group_id}) return results else: openIssueCountSQL = s.sql.text(""" @@ -794,7 +821,8 @@ def open_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo.repo_id ORDER BY date """) - results = pd.read_sql(openIssueCountSQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(openIssueCountSQL, conn, params={'repo_id': repo_id}) return results @@ -817,7 +845,8 @@ def closed_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo_groups.rg_name ORDER BY date """) - results = pd.read_sql(closedIssueCountSQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(closedIssueCountSQL, conn, params={'repo_group_id': repo_group_id}) return results else: closedIssueCountSQL = s.sql.text(""" @@ -831,7 +860,8 @@ def closed_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo.repo_id ORDER BY date """) - results = pd.read_sql(closedIssueCountSQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(closedIssueCountSQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -893,8 +923,9 @@ def issue_comments_mean(repo_group_id, repo_id=None, group_by='week'): else: raise ValueError("Incorrect value for 'group_by'") - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_group_id': repo_group_id}) return results else: @@ -946,8 +977,9 @@ def issue_comments_mean(repo_group_id, repo_id=None, group_by='week'): else: raise ValueError("Incorrect value for 'group_by'") - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_id': repo_id}) return results @register_metric() @@ -978,9 +1010,10 @@ def issue_comments_mean_std(repo_group_id, repo_id=None, group_by='week'): """) - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_group_id': repo_group_id, - 'group_by': group_by}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_group_id': repo_group_id, + 'group_by': group_by}) return results else: @@ -1006,8 +1039,9 @@ def issue_comments_mean_std(repo_group_id, repo_id=None, group_by='week'): ORDER BY date """) - results = pd.read_sql(issue_comments_mean_std_SQL, engine, - params={'repo_id': repo_id, 'group_by': group_by}) + with engine.connect() as conn: + results = pd.read_sql(issue_comments_mean_std_SQL, conn, + params={'repo_id': repo_id, 'group_by': group_by}) return results @register_metric() @@ -1057,6 +1091,7 @@ def abandoned_issues(repo_group_id, repo_id=None, period='day', begin_date=None, ''' ) - results = pd.read_sql(abandonedSQL, engine, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(abandonedSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/message.py b/augur/api/metrics/message.py 
index 8c36c3a4c2..9988f5a0d5 100644 --- a/augur/api/metrics/message.py +++ b/augur/api/metrics/message.py @@ -56,9 +56,9 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en """) - - results = pd.read_sql(repomessagesSQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(repomessagesSQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) else: repomessagesSQL = s.sql.text(""" @@ -85,10 +85,11 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en rg_name, message_date """) - - results = pd.read_sql(repomessagesSQL, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + results = pd.read_sql(repomessagesSQL, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/pull_request.py b/augur/api/metrics/pull_request.py index 9fbcc61757..3b1798ec01 100644 --- a/augur/api/metrics/pull_request.py +++ b/augur/api/metrics/pull_request.py @@ -10,6 +10,53 @@ from ..server import engine +@register_metric() +def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): + """ + Returns a time series of the number of new Pull Requests opened during a certain period. + + :param repo_id: The repository's id + :param repo_group_id: The repository's group id + :param period: To set the periodicity to 'day', 'week', 'month' or 'year', defaults to 'day' + :param begin_date: Specifies the begin date, defaults to '1970-1-1 00:00:01' + :param end_date: Specifies the end date, defaults to datetime.now() + :return: DataFrame of new Pull Requests/period + """ + if not begin_date: + begin_date = '1970-1-1 00:00:01' + if not end_date: + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + if repo_id: + new_pull_requests_query = s.sql.text(""" + SELECT DATE_TRUNC(:period, pr_created_at) AS created_date, + COUNT(pr_id) AS new_pull_requests + FROM pull_requests + WHERE repo_id = :repo_id + AND pr_created_at BETWEEN :begin_date AND :end_date + GROUP BY created_date + """) + + results = pd.read_sql(new_pull_requests_query, engine, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) + else: + new_pull_requests_query = s.sql.text(""" + SELECT DATE_TRUNC(:period, pr_created_at) AS created_date, + COUNT(pr_id) AS new_pull_requests + FROM pull_requests + WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) + AND pr_created_at BETWEEN :begin_date AND :end_date + GROUP BY created_date + """) + + results = pd.read_sql(new_pull_requests_query, engine, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) + + return results + @register_metric() def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): """ @@ -40,9 +87,10 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day """) - results = pd.read_sql(commitNewContributor, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(commitNewContributor, conn, params={'repo_id': repo_id, 'period': 
period, + 'begin_date': begin_date, + 'end_date': end_date}) else: commitNewContributor = s.sql.text(""" SELECT abc.repo_id, repo_name ,date_trunc(:period, new_date::DATE) as commit_date, @@ -58,11 +106,11 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day GROUP BY abc.repo_id, repo_name, commit_date """) - - results = pd.read_sql(commitNewContributor, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(commitNewContributor, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) return results @register_metric() @@ -96,9 +144,10 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg - results = pd.read_sql(closedNoMerge, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(closedNoMerge, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) else: closedNoMerge = s.sql.text(""" @@ -110,11 +159,11 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg ORDER BY closed_date """) - - results = pd.read_sql(closedNoMerge, engine, - params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(closedNoMerge, conn, + params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, + 'end_date': end_date}) return results @register_metric() @@ -151,9 +200,10 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date """) - results = pd.read_sql(reviews_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(reviews_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: @@ -171,10 +221,10 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date ORDER BY date """) - - results = pd.read_sql(reviews_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(reviews_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -211,10 +261,10 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY pull_requests.repo_id, date """) - - results = pd.read_sql(reviews_accepted_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(reviews_accepted_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) return results else: reviews_accepted_SQL = s.sql.text(""" @@ -232,9 +282,10 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY date """) - results = pd.read_sql(reviews_accepted_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = 
pd.read_sql(reviews_accepted_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -271,10 +322,10 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY pull_requests.repo_id, date """) - - results = pd.read_sql(reviews_declined_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(reviews_declined_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: reviews_declined_SQL = s.sql.text(""" @@ -292,9 +343,10 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY date """) - results = pd.read_sql(reviews_declined_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(reviews_declined_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -331,11 +383,11 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) ORDER BY pull_requests.repo_id, pull_requests.pull_request_id """) - - results = pd.read_sql(review_duration_SQL, engine, - params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(review_duration_SQL, conn, + params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results else: @@ -355,10 +407,11 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) ORDER BY pull_requests.repo_id, pull_request_id """) - results = pd.read_sql(review_duration_SQL, engine, - params={'repo_id': repo_id, - 'begin_date': begin_date, - 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(review_duration_SQL, conn, + params={'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date}) results['duration'] = results['duration'].astype(str) return results @@ -408,8 +461,9 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e ON opened.date_created = accepted.accepted_on """) - results = pd.read_sql(prAccRateSQL, engine, params={'repo_group_id': repo_group_id, 'group_by': group_by, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(prAccRateSQL, conn, params={'repo_group_id': repo_group_id, 'group_by': group_by, + 'begin_date': begin_date, 'end_date': end_date}) return results else: prAccRateSQL = s.sql.text(""" @@ -441,8 +495,9 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e ON opened.date_created = accepted.accepted_on """) - results = pd.read_sql(prAccRateSQL, engine, params={'repo_id': repo_id, 'group_by': group_by, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(prAccRateSQL, conn, params={'repo_id': repo_id, 'group_by': group_by, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -546,9 +601,10 @@ def pull_request_average_time_to_close(repo_group_id, repo_id=None, group_by='mo - pr_all = pd.read_sql(pr_all_SQL, engine, - 
params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_avg_time_to_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_close'.format(time_unit)]] else: @@ -657,10 +713,11 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 GROUP BY closed_year, closed_month, merged_status, time_between_responses.pr_closed_at, time_between_responses.average_time_between_responses """) - - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_avg_time_between_responses = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_between_responses'.format(time_unit)]] else: @@ -767,10 +824,11 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo GROUP BY closed_year, merged_status, data.pr_closed_at, data.commit_count """) - - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_avg_commit_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_commits_per_pull_request']] else: @@ -926,10 +984,11 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon ORDER BY merged_status, closed_year, closed_week, closed_day """) - - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) count_names = ['assigned_count', 'review_requested_count', 'labeled_count', 'unlabeled_count', 'subscribed_count', 'mentioned_count', 'referenced_count', 'closed_count', 'head_ref_force_pushed_count', 'head_ref_deleted_count', 'milestoned_count', 'merged_count', 'comment_count'] average_count_names = [] @@ -1050,9 +1109,10 @@ def pull_request_average_time_to_responses_and_close(repo_group_id, repo_id=None GROUP BY closed_year, merged_status, response_times.first_response_time, response_times.last_response_time, response_times.pr_created_at, response_times.pr_closed_at """) - pr_all = pd.read_sql(pr_all_SQL, engine, - params={'repo_id': repo_id, 
'repo_group_id':repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_SQL, conn, + params={'repo_id': repo_id, 'repo_group_id':repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: avg_pr_time_to_responses_and_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_first_response'.format(time_unit), 'average_{}_to_last_response'.format(time_unit), 'average_{}_to_close'.format(time_unit)]] @@ -1132,9 +1192,10 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 AND pr_closed_at::date <= :end_date ::date """) - - pr_all = pd.read_sql(pr_all_sql, engine, params={'repo_group_id': repo_group_id, - 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_all_sql, conn, params={'repo_group_id': repo_group_id, + 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) if not repo_id: pr_merged_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).count().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['pull_request_count']] diff --git a/augur/api/metrics/release.py b/augur/api/metrics/release.py index 60f7793652..5594f7ef08 100644 --- a/augur/api/metrics/release.py +++ b/augur/api/metrics/release.py @@ -50,10 +50,10 @@ def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_dat ORDER BY releases.release_published_at DESC """) - - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: @@ -80,10 +80,10 @@ def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_dat ORDER BY releases.release_published_at DESC """) - - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results @register_metric() @@ -127,10 +127,10 @@ def tag_only_releases(repo_group_id, repo_id=None, period='day', begin_date=None ORDER BY releases.release_published_at DESC """) - - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date }) + with engine.connect() as conn: + results = pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date }) return results else: @@ -150,10 +150,11 @@ def tag_only_releases(repo_group_id, repo_id=None, period='day', begin_date=None ORDER BY releases.release_published_at DESC """) - results = pd.read_sql(releases_SQL, engine, - params={'period': period, 'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = 
pd.read_sql(releases_SQL, conn, + params={'period': period, 'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results -def create_release_metrics(metrics): - add_metrics(metrics, __name__) +#def create_release_metrics(metrics): +# add_metrics(metrics, __name__) diff --git a/augur/api/metrics/repo_meta.py b/augur/api/metrics/repo_meta.py index ca4d9668e2..c5d8e1138d 100644 --- a/augur/api/metrics/repo_meta.py +++ b/augur/api/metrics/repo_meta.py @@ -46,8 +46,8 @@ def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, en ORDER BY week """) - - results = pd.read_sql(code_changes_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, + with engine.connect() as conn: + results = pd.read_sql(code_changes_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) results['week'] = results['week'].apply(lambda x: x - 1) results['date'] = results['year'].astype(str) + ' ' + results['week'].astype(str) + ' 0' @@ -68,9 +68,9 @@ def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, en ORDER BY week """) - - results = pd.read_sql(code_changes_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(code_changes_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) results['week'] = results['week'].apply(lambda x: x - 1) results['date'] = results['year'].astype(str) + ' ' + results['week'].astype(str) + ' 0' @@ -111,8 +111,9 @@ def code_changes_lines(repo_group_id, repo_id=None, period='day', begin_date=Non ORDER BY commits.repo_id, date """) - results = pd.read_sql(code_changes_lines_SQL, engine, params={'repo_group_id': repo_group_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(code_changes_lines_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -130,9 +131,9 @@ def code_changes_lines(repo_group_id, repo_id=None, period='day', begin_date=Non ORDER BY date; """) - - results = pd.read_sql(code_changes_lines_SQL, engine, params={'repo_id': repo_id, 'period': period, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(code_changes_lines_SQL, conn, params={'repo_id': repo_id, 'period': period, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -163,8 +164,9 @@ def sub_projects(repo_group_id, repo_id=None, begin_date=None, end_date=None): AND repo_added BETWEEN :begin_date AND :end_date """) - results = pd.read_sql(sub_projectsSQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(sub_projectsSQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) else: sub_projectsSQL = s.sql.text(""" SELECT COUNT(*) AS sub_project_count @@ -173,8 +175,9 @@ def sub_projects(repo_group_id, repo_id=None, begin_date=None, end_date=None): AND repo_added BETWEEN :begin_date AND :end_date """) - results = pd.read_sql(sub_projectsSQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(sub_projectsSQL, conn, params={'repo_group_id': repo_group_id, + 
'begin_date': begin_date, 'end_date': end_date}) return results @@ -194,8 +197,8 @@ def sbom_download(repo_group_id, repo_id=None): logger.debug(dosocs_SQL) params = {'repo_id': repo_id} - - return pd.read_sql(dosocs_SQL, engine, params=params) + with engine.connect() as conn: + return pd.read_sql(dosocs_SQL, conn, params=params) #return [json.dumps(license_information)] @register_metric() @@ -223,7 +226,8 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): LIMIT 1 """) - raw_df = pd.read_sql(cii_best_practices_badge_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + raw_df = pd.read_sql(cii_best_practices_badge_SQL, conn, params={'repo_id': repo_id}) if len(raw_df) == 0: return [] @@ -263,8 +267,8 @@ def forks(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - - results = pd.read_sql(forks_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(forks_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -278,8 +282,8 @@ def forks(repo_group_id, repo_id=None): ORDER BY date """) - - results = pd.read_sql(forks_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(forks_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -303,8 +307,8 @@ def fork_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - - results = pd.read_sql(fork_count_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(fork_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: fork_count_SQL = s.sql.text(""" @@ -315,8 +319,8 @@ def fork_count(repo_group_id, repo_id=None): LIMIT 1 """) - - results = pd.read_sql(fork_count_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(fork_count_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -334,7 +338,8 @@ def languages(repo_group_id, repo_id=None): WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) """) - results = pd.read_sql(languages_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(languages_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -344,8 +349,8 @@ def languages(repo_group_id, repo_id=None): WHERE repo_id = :repo_id """) - - results = pd.read_sql(languages_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(languages_SQL, conn, params={'repo_id': repo_id}) return results @register_metric(type="license") @@ -381,7 +386,8 @@ def license_files(license_id, spdx_binary, repo_group_id, repo_id=None,): b.license_id in ( 369,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482)); """) - results = pd.read_sql(license_data_SQL, engine, params={'repo_id': repo_id, 
'spdx_binary': spdx_binary, 'license_id': license_id}) + with engine.connect() as conn: + results = pd.read_sql(license_data_SQL, conn, params={'repo_id': repo_id, 'spdx_binary': spdx_binary, 'license_id': license_id}) return results @register_metric() @@ -450,7 +456,8 @@ def license_declared(repo_group_id, repo_id=None): short_name; """) - results = pd.read_sql(license_declared_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -534,7 +541,8 @@ def license_coverage(repo_group_id, repo_id=None): GROUP BY a.name, a.licensed, a.licensed, b.total """) - results = pd.read_sql(license_declared_SQL, engine, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) return results @@ -595,8 +603,8 @@ def license_count(repo_group_id, repo_id=None): GROUP BY a.name, a.number_of_license, a.licensed, b.total """) - - results = pd.read_sql(license_declared_SQL, engine, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) return results @@ -624,8 +632,8 @@ def stars(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - - results = pd.read_sql(stars_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -639,7 +647,8 @@ def stars(repo_group_id, repo_id=None): ORDER BY date """) - results = pd.read_sql(stars_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -663,8 +672,8 @@ def stars_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - - results = pd.read_sql(stars_count_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: stars_count_SQL = s.sql.text(""" @@ -675,7 +684,8 @@ def stars_count(repo_group_id, repo_id=None): LIMIT 1 """) - results = pd.read_sql(stars_count_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(stars_count_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -701,8 +711,8 @@ def watchers(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - - results = pd.read_sql(watchers_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -716,8 +726,8 @@ def watchers(repo_group_id, repo_id=None): ORDER BY date """) - - results = pd.read_sql(watchers_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -741,8 +751,8 @@ def watchers_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - - results = pd.read_sql(watchers_count_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = 
pd.read_sql(watchers_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: watchers_count_SQL = s.sql.text(""" @@ -753,8 +763,8 @@ def watchers_count(repo_group_id, repo_id=None): LIMIT 1 """) - - results = pd.read_sql(watchers_count_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(watchers_count_SQL, conn, params={'repo_id': repo_id}) return results @register_metric() @@ -798,8 +808,9 @@ def annual_lines_of_code_count_ranked_by_new_repo_in_repo_group(repo_group_id, r LIMIT 10 """) - results = pd.read_sql(cdRgNewrepRankedCommitsSQL, engine, params={ "repo_group_id": repo_group_id, - "repo_id": repo_id, "calendar_year": calendar_year}) + with engine.connect() as conn: + results = pd.read_sql(cdRgNewrepRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, + "repo_id": repo_id, "calendar_year": calendar_year}) return results @register_metric() @@ -894,9 +905,9 @@ def annual_lines_of_code_count_ranked_by_repo_in_repo_group(repo_group_id, repo_ """) - - results = pd.read_sql(cdRgTpRankedCommitsSQL, engine, params={ "repo_group_id": repo_group_id, - "repo_id": repo_id}) + with engine.connect() as conn: + results = pd.read_sql(cdRgTpRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, + "repo_id": repo_id}) return results @register_metric() @@ -948,8 +959,8 @@ def lines_of_code_commit_counts_by_calendar_year_grouped(repo_url, calendar_year GROUP BY week """) - - results = pd.read_sql(cdRepTpIntervalLocCommitsSQL, engine, params={"repourl": '%{}%'.format(repo_url), 'calendar_year': calendar_year}) + with engine.connect() as conn: + results = pd.read_sql(cdRepTpIntervalLocCommitsSQL, conn, params={"repourl": '%{}%'.format(repo_url), 'calendar_year': calendar_year}) return results @register_metric() @@ -969,9 +980,9 @@ def average_weekly_commits(repo_group_id=None, repo_id=None, calendar_year=None) ORDER BY repo_name """.format(extra_and)) - - results = pd.read_sql(average_weekly_commits_sql, engine, params={"repo_group_id": repo_group_id, - "repo_id": repo_id, "calendar_year": calendar_year}) + with engine.connect() as conn: + results = pd.read_sql(average_weekly_commits_sql, conn, params={"repo_group_id": repo_group_id, + "repo_id": repo_id, "calendar_year": calendar_year}) return results @register_metric() @@ -1054,8 +1065,9 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non ) commit_data """) - results = pd.read_sql(summarySQL, engine, params={'repo_group_id': repo_group_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(summarySQL, conn, params={'repo_group_id': repo_group_id, + 'begin_date': begin_date, 'end_date': end_date}) return results else: summarySQL = s.sql.text(""" @@ -1123,6 +1135,7 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non ) commit_data """) - results = pd.read_sql(summarySQL, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(summarySQL, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/toss.py b/augur/api/metrics/toss.py index 122cb35679..d3e91ad405 100644 --- a/augur/api/metrics/toss.py +++ b/augur/api/metrics/toss.py @@ -57,8 +57,9 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g ) opened ON merged.repo_id = opened.repo_id """) - results = 
pd.read_sql(pr_acceptance_rate_sql, engine, params={'repo_id': repo_id, 'group_by': group_by, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(pr_acceptance_rate_sql, conn, params={'repo_id': repo_id, 'group_by': group_by, + 'begin_date': begin_date, 'end_date': end_date}) return results @@ -89,8 +90,9 @@ def toss_review_duration(repo_id, begin_date=None, end_date=None): AND :end_date """) - results = pd.read_sql(pr_acceptance_rate_sql, engine, params={'repo_id': repo_id, - 'begin_date': begin_date, 'end_date': end_date}) + with engine.connect() as conn: + results = pd.read_sql(pr_acceptance_rate_sql, conn, params={'repo_id': repo_id, + 'begin_date': begin_date, 'end_date': end_date}) if results.iloc[0]['duration'] is None: results.iloc[0]['duration'] = -1 else: @@ -120,5 +122,6 @@ def toss_repo_info(repo_id): LIMIT 1; """) - results = pd.read_sql(license_file_sql, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + results = pd.read_sql(license_file_sql, conn, params={'repo_id': repo_id}) return results diff --git a/augur/api/routes/__init__.py b/augur/api/routes/__init__.py index 5e601f54e8..03c2e2fa71 100644 --- a/augur/api/routes/__init__.py +++ b/augur/api/routes/__init__.py @@ -11,3 +11,4 @@ from .user import * from .dei import * from .util import * +from .complexity import * diff --git a/augur/api/routes/collection_status.py b/augur/api/routes/collection_status.py index 58e17311fe..8afd8eb2da 100644 --- a/augur/api/routes/collection_status.py +++ b/augur/api/routes/collection_status.py @@ -25,7 +25,9 @@ def commit_collection_status(): # TODO: make this name automatic - wrapper? AND c.facade_status = 'Success'; """) - results = pd.read_sql(commit_collection_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(commit_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -86,7 +88,9 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? 
) D WHERE d.issues_enabled = 'true'; """) - results = pd.read_sql(issue_collection_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(issue_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) @@ -156,7 +160,9 @@ def pull_request_collection_status(): # TODO: make this name automatic - wrappe ORDER BY ratio_abs; """) - results = pd.read_sql(pull_request_collection_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(pull_request_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) diff --git a/augur/api/routes/complexity.py b/augur/api/routes/complexity.py index 81045720a1..bee39eb923 100644 --- a/augur/api/routes/complexity.py +++ b/augur/api/routes/complexity.py @@ -6,32 +6,113 @@ import os import requests -AUGUR_API_VERSION = 'api/unstable' +from augur.api.routes import AUGUR_API_VERSION +from ..server import app, engine -def create_routes(server): - @server.app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_languages(): - project_languages_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.programming_language, - e.code_lines, - e.files +@app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_languages(): + project_languages_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.programming_language, + e.code_lines, + e.files + FROM + augur_data.repo, + (SELECT + d.repo_id, + d.programming_language, + SUM(d.code_lines) AS code_lines, + COUNT(*)::int AS files + FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.programming_language, + augur_data.repo_labor.code_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id, d.programming_language) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = pd.read_sql(project_languages_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + +@app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_files(): + project_files_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.files + FROM + augur_data.repo, + (SELECT + d.repo_id, + count(*) AS files FROM - augur_data.repo, - (SELECT + (SELECT + augur_data.repo_labor.repo_id + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = 
pd.read_sql(project_files_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + +@app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_lines(): + project_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.total_lines, + e.average_lines + FROM + augur_data.repo, + (SELECT d.repo_id, - d.programming_language, - SUM(d.code_lines) AS code_lines, - COUNT(*)::int AS files + SUM(d.total_lines) AS total_lines, + AVG(d.total_lines)::INT AS average_lines FROM (SELECT augur_data.repo_labor.repo_id, - augur_data.repo_labor.programming_language, - augur_data.repo_labor.code_lines + augur_data.repo_labor.total_lines FROM augur_data.repo_labor, ( SELECT @@ -43,113 +124,80 @@ def get_project_languages(): WHERE augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id, d.programming_language) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_languages_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) - @server.app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_files(): - project_files_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.files - FROM - augur_data.repo, - (SELECT - d.repo_id, - count(*) AS files - FROM - (SELECT - augur_data.repo_labor.repo_id - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_files_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + with engine.connect() as conn: + results = pd.read_sql(project_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_lines(): - project_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.total_lines, - e.average_lines +@app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_comment_lines(): + comment_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.comment_lines, + e.avg_comment_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.comment_lines) AS comment_lines, + AVG(d.comment_lines)::INT AS avg_comment_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - 
SUM(d.total_lines) AS total_lines, - AVG(d.total_lines)::INT AS average_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.total_lines - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_lines_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.comment_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = pd.read_sql(comment_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_comment_lines(): - comment_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.comment_lines, - e.avg_comment_lines +@app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_blank_lines(): + blank_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.blank_lines, + e.avg_blank_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.blank_lines) AS blank_lines, + AVG(d.blank_lines)::int AS avg_blank_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.comment_lines) AS comment_lines, - AVG(d.comment_lines)::INT AS avg_comment_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.comment_lines - FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.blank_lines + FROM augur_data.repo_labor, ( SELECT augur_data.repo_labor.repo_id, @@ -161,93 +209,57 @@ def get_project_comment_lines(): augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id """) - results = pd.read_sql(comment_lines_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") - @server.app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_blank_lines(): - blank_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.blank_lines, - 
e.avg_blank_lines - FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.blank_lines) AS blank_lines, - AVG(d.blank_lines)::int AS avg_blank_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.blank_lines - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(blank_lines_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") - + with engine.connect() as conn: + results = pd.read_sql(blank_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + - @server.app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_file_complexity(): - project_file_complexity_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.sum_code_complexity, - e.average_code_complexity +@app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_file_complexity(): + project_file_complexity_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.sum_code_complexity, + e.average_code_complexity + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.code_complexity) AS sum_code_complexity, + AVG(d.code_complexity)::int AS average_code_complexity FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.code_complexity) AS sum_code_complexity, - AVG(d.code_complexity)::int AS average_code_complexity - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.code_complexity - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - results = pd.read_sql(project_file_complexity_sql, server.engine) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.code_complexity + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + with engine.connect() as conn: + results = pd.read_sql(project_file_complexity_sql, conn) + data = results.to_json(orient="records", 
date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + diff --git a/augur/api/routes/contributor_reports.py b/augur/api/routes/contributor_reports.py index 896e00fc0e..c600e81416 100644 --- a/augur/api/routes/contributor_reports.py +++ b/augur/api/routes/contributor_reports.py @@ -293,7 +293,9 @@ def new_contributor_data_collection(repo_id, required_contributions): WHERE RANK IN {rank_tuple} """) - df = pd.read_sql(contributor_query, engine) + + with engine.connect() as conn: + df = pd.read_sql(contributor_query, conn) df = df.loc[~df['full_name'].str.contains('bot', na=False)] df = df.loc[~df['login'].str.contains('bot', na=False)] @@ -334,7 +336,9 @@ def months_data_collection(start_date, end_date): FROM generate_series (TIMESTAMP '{start_date}', TIMESTAMP '{end_date}', INTERVAL '1 month' ) created_month ) d ) x ) y """) - months_df = pd.read_sql(months_query, engine) + + with engine.connect() as conn: + months_df = pd.read_sql(months_query, conn) # add yearmonths to months_df months_df[['year', 'month']] = months_df[['year', 'month']].astype(float).astype(int).astype(str) diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index da724197df..82324a8d62 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -17,7 +17,7 @@ from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig -from augur.tasks.util.collection_util import start_block_of_repos, get_enabled_phase_names_from_config, core_task_success_util +from augur.tasks.util.collection_util import CollectionRequest,AugurTaskRoutine, get_enabled_phase_names_from_config, core_task_success_util from augur.tasks.start_tasks import prelim_phase, primary_repo_collect_phase from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.init.redis_connection import redis_connection as redis @@ -52,7 +52,7 @@ def dei_track_repo(application: ClientApplication): return jsonify({"status": "Repo already exists"}) frontend_repo_group: RepoGroup = session.query(RepoGroup).filter(RepoGroup.rg_name == FRONTEND_REPO_GROUP_NAME).first() - repo_id = Repo.insert(session, repo_url, frontend_repo_group.repo_group_id, "API.DEI", repo_type="") + repo_id = Repo.insert_github_repo(session, repo_url, frontend_repo_group.repo_group_id, "API.DEI", repo_type="") if not repo_id: return jsonify({"status": "Error adding repo"}) @@ -96,7 +96,13 @@ def core_task_success_util_gen(repo_git): record = BadgingDEI(**record) session.add(record) - start_block_of_repos(logger, session, [repo_url], primary_enabled_phases, "new") + + deiHook = CollectionRequest("core",primary_enabled_phases) + deiHook.repo_list = [repo_url] + + singleRoutine = AugurTaskRoutine(session,[deiHook]) + singleRoutine.start_data_collection() + #start_block_of_repos(logger, session, [repo_url], primary_enabled_phases, "new") session.close() diff --git a/augur/api/routes/metadata.py b/augur/api/routes/metadata.py index 389a3d9d18..f49dbb88f8 100644 --- a/augur/api/routes/metadata.py +++ b/augur/api/routes/metadata.py @@ -47,7 +47,9 @@ def get_repo_info(): ORDER BY repo.repo_name; """) - results = pd.read_sql(repo_info_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) return Response(response=data, @@ -61,7 +63,9 @@ def contributions_count(): group by repo_git order by contributions desc; """) - 
results = pd.read_sql(repo_info_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) return Response(response=data, @@ -75,7 +79,9 @@ def contributors_count(): group by repo_git order by contributors desc; """) - results = pd.read_sql(repo_info_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) return Response(response=data, diff --git a/augur/api/routes/pull_request_reports.py b/augur/api/routes/pull_request_reports.py index 02f6e235cd..9e65779542 100644 --- a/augur/api/routes/pull_request_reports.py +++ b/augur/api/routes/pull_request_reports.py @@ -53,7 +53,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, first_response_time, last_response_time, - average_time_between_responses, + EXTRACT ( EPOCH FROM average_time_between_responses), assigned_count, review_requested_count, labeled_count, @@ -62,15 +62,15 @@ def pull_request_data_collection(repo_id, start_date, end_date): referenced_count, closed_count, head_ref_force_pushed_count, - merged_count, + merged_count::INT, milestoned_count, unlabeled_count, head_ref_deleted_count, comment_count, - lines_added, - lines_removed, + COALESCE(lines_added, 0), + COALESCE(lines_removed, 0), commit_count, - file_count + COALESCE(file_count, 0) FROM repo, repo_groups, @@ -87,46 +87,47 @@ def pull_request_data_collection(repo_id, start_date, end_date): count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, - count(*) FILTER (WHERE action = 'merged') AS merged_count, - MIN(message.msg_timestamp) AS first_response_time, - COUNT(DISTINCT message.msg_timestamp) AS comment_count, - MAX(message.msg_timestamp) AS last_response_time, - (MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp) AS average_time_between_responses - FROM pull_request_events, pull_requests, repo, pull_request_message_ref, message - WHERE repo.repo_id = {repo_id} - AND repo.repo_id = pull_requests.repo_id - AND pull_requests.pull_request_id = pull_request_events.pull_request_id - AND pull_requests.pull_request_id = pull_request_message_ref.pull_request_id - AND pull_request_message_ref.msg_id = message.msg_id + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM pull_requests + LEFT OUTER JOIN pull_request_events on pull_requests.pull_request_id = pull_request_events.pull_request_id + JOIN repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN pull_request_message_ref on pull_requests.pull_request_id = 
pull_request_message_ref.pull_request_id + LEFT OUTER JOIN message on pull_request_message_ref.msg_id = message.msg_id + WHERE repo.repo_id = 1 GROUP BY pull_requests.pull_request_id ) response_times ON pull_requests.pull_request_id = response_times.pull_request_id - LEFT OUTER JOIN ( - SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count FROM pull_request_commits, pull_requests, pull_request_meta + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM pull_request_commits, pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha AND pr_cmt_sha <> pull_request_meta.pr_sha GROUP BY pull_request_commits.pull_request_id ) all_commit_counts ON pull_requests.pull_request_id = all_commit_counts.pull_request_id - LEFT OUTER JOIN ( + LEFT JOIN ( SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label FROM pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND pr_head_or_base = 'base' GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label ) base_labels ON base_labels.pull_request_id = all_commit_counts.pull_request_id - LEFT OUTER JOIN ( + LEFT JOIN ( SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count FROM pull_request_commits, commits, pull_requests, pull_request_meta WHERE cmt_commit_hash = pr_cmt_sha AND pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND commits.repo_id = pull_requests.repo_id AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha AND commits.cmt_commit_hash <> pull_request_meta.pr_sha @@ -136,11 +137,13 @@ def pull_request_data_collection(repo_id, start_date, end_date): WHERE repo.repo_group_id = repo_groups.repo_group_id AND repo.repo_id = pull_requests.repo_id - AND repo.repo_id = {repo_id} + AND repo.repo_id = 1 ORDER BY merged_count DESC """) - pr_all = pd.read_sql(pr_query, engine) + + with engine.connect() as conn: + pr_all = pd.read_sql(pr_query, conn) pr_all[['assigned_count', 'review_requested_count', diff --git a/augur/api/routes/user.py b/augur/api/routes/user.py index dfaeb81f7f..62bc44068a 100644 --- a/augur/api/routes/user.py +++ b/augur/api/routes/user.py @@ -227,7 +227,7 @@ def add_user_repo(): repo = request.args.get("repo_url") group_name = request.args.get("group_name") - result = current_user.add_repo(group_name, repo) + result = current_user.add_github_repo(group_name, repo) return jsonify(result[1]) @@ -260,7 +260,7 @@ def add_user_org(): org = request.args.get("org_url") group_name = request.args.get("group_name") - result = current_user.add_org(group_name, org) + result = current_user.add_github_org(group_name, org) return jsonify(result[1]) diff --git a/augur/api/routes/util.py b/augur/api/routes/util.py index cd6a8ad3bc..71d3526b96 100644 --- a/augur/api/routes/util.py +++ b/augur/api/routes/util.py @@ -1,10 +1,11 @@ #SPDX-License-Identifier: MIT +from 
augur.api.routes import AUGUR_API_VERSION +from ..server import app, engine import base64 import sqlalchemy as s import pandas as pd import json from flask import Response -import logging from augur.application.db.session import DatabaseSession from augur.application.logs import AugurLogger @@ -12,10 +13,6 @@ logger = AugurLogger("augur").get_logger() -from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine - - @app.route('/{}/repo-groups'.format(AUGUR_API_VERSION)) def get_all_repo_groups(): #TODO: make this name automatic - wrapper? repoGroupsSQL = s.sql.text(""" @@ -23,7 +20,9 @@ def get_all_repo_groups(): #TODO: make this name automatic - wrapper? FROM repo_groups ORDER BY rg_name """) - results = pd.read_sql(repoGroupsSQL, engine) + + with engine.connect() as conn: + results = pd.read_sql(repoGroupsSQL, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -52,13 +51,15 @@ def get_all_repos(): (select * from api_get_all_repos_issues) b on repo.repo_id = b.repo_id - left outer join - (select * from api_get_all_repo_prs) c - on repo.repo_id=c.repo_id + left outer join + (select * from api_get_all_repo_prs) c + on repo.repo_id=c.repo_id JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id order by repo_name """) - results = pd.read_sql(get_all_repos_sql, engine) + + with engine.connect() as conn: + results = pd.read_sql(get_all_repos_sql, conn) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) b64_urls = [] @@ -91,21 +92,65 @@ def get_repos_in_repo_group(repo_group_id): (select * from api_get_all_repos_issues) b on repo.repo_id = b.repo_id - left outer join - (select * from api_get_all_repo_prs) c - on repo.repo_id=c.repo_id + left outer join + (select * from api_get_all_repo_prs) c + on repo.repo_id=c.repo_id JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id WHERE repo_groups.repo_group_id = :repo_group_id ORDER BY repo.repo_git """) - results = pd.read_sql(repos_in_repo_groups_SQL, engine, params={'repo_group_id': repo_group_id}) + with engine.connect() as conn: + results = pd.read_sql(repos_in_repo_groups_SQL, conn, params={'repo_group_id': repo_group_id}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, mimetype="application/json") +@app.route('/{}/repos/<repo_id>'.format(AUGUR_API_VERSION)) +def get_repo_by_id(repo_id: int) -> Response: + repo_by_id_SQL = s.sql.text(""" + SELECT + repo.repo_id, + repo.repo_name, + repo.description, + repo.repo_git AS url, + a.commits_all_time, + b.issues_all_time, + c.pull_requests_all_time, + rg_name, + repo.repo_group_id + FROM + repo + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repos_commits) a + ON repo.repo_id = a.repo_id + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repos_issues) b + ON repo.repo_id = b.repo_id + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repo_prs) c + ON repo.repo_id = c.repo_id + JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id + WHERE + repo.repo_id = :id + """) + + results = pd.read_sql(repo_by_id_SQL, engine, params={"id": repo_id}) + results["url"] = results["url"].apply(lambda datum: datum.split("//")[1]) # cut "https://" off the URL + results["base64_url"] = [base64.b64encode(results.at[i, "url"].encode()) for i in results.index] + data = results.to_json(orient="records", date_format="iso", date_unit="ms") + + if not data or data == "[]": + return Response(response='{"status": 
"Repository ' + str(repo_id) + ' does not exist"}', + status=400, + mimetype="application/json") + + return Response(response=data[1:-1], # cut off brackets at each end, turns list of length 1 into single value + status=200, + mimetype="application/json") + @app.route('/{}/owner//repo/'.format(AUGUR_API_VERSION)) def get_repo_by_git_name(owner, repo): @@ -116,7 +161,8 @@ def get_repo_by_git_name(owner, repo): GROUP BY repo_id, rg_name """) - results = pd.read_sql(get_repo_by_git_name_sql, engine, params={'owner': '%{}_'.format(owner), 'repo': repo,}) + with engine.connect() as conn: + results = pd.read_sql(get_repo_by_git_name_sql, conn, params={'owner': '%{}%'.format(owner), 'repo': repo,}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -132,7 +178,9 @@ def get_repo_by_name(rg_name, repo_name): AND LOWER(rg_name) = LOWER(:rg_name) AND LOWER(repo_name) = LOWER(:repo_name) """) - results = pd.read_sql(get_repo_by_name_sql, engine, params={'rg_name': rg_name, 'repo_name': repo_name}) + + with engine.connect() as conn: + results = pd.read_sql(get_repo_by_name_sql, conn, params={'rg_name': rg_name, 'repo_name': repo_name}) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -146,7 +194,9 @@ def get_group_by_name(rg_name): FROM repo_groups WHERE lower(rg_name) = lower(:rg_name) """) - results = pd.read_sql(groupSQL, engine, params={'rg_name': rg_name}) + + with engine.connect() as conn: + results = pd.read_sql(groupSQL, conn, params={'rg_name': rg_name}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -160,7 +210,8 @@ def get_repos_for_dosocs(): WHERE a.setting='repo_directory' """) - results = pd.read_sql(get_repos_for_dosocs_SQL, engine) + with engine.connect() as conn: + results = pd.read_sql(get_repos_for_dosocs_SQL, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, @@ -188,7 +239,9 @@ def get_issues(repo_group_id, repo_id=None): GROUP BY issues.issue_id ORDER by OPEN_DAY DESC """) - results = pd.read_sql(get_issues_sql, engine, params={'repo_group_id': repo_group_id}) + + with engine.connect() as conn: + results = pd.read_sql(get_issues_sql, conn, params={'repo_group_id': repo_group_id}) else: get_issues_sql = s.sql.text(""" SELECT issue_title, @@ -208,7 +261,9 @@ def get_issues(repo_group_id, repo_id=None): GROUP BY issues.issue_id, repo_name ORDER by OPEN_DAY DESC """) - results = pd.read_sql(get_issues_sql, engine, params={'repo_id': repo_id}) + + with engine.connect() as conn: + results = pd.read_sql(get_issues_sql, conn, params={'repo_id': repo_id}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, status=200, diff --git a/augur/api/server.py b/augur/api/server.py index e3c9663650..d3e92ad99e 100644 --- a/augur/api/server.py +++ b/augur/api/server.py @@ -10,24 +10,30 @@ import base64 import logging import importlib +import graphene from typing import Optional, List, Any, Tuple from pathlib import Path -from flask import Flask, request, Response, redirect +from flask import Flask, request, Response, redirect, jsonify from flask_cors import CORS import pandas as pd from beaker.util import parse_cache_config_options from beaker.cache import CacheManager, Cache from sqlalchemy import 
create_engine from sqlalchemy.pool import StaticPool +from flask_graphql import GraphQLView +from graphene_sqlalchemy import SQLAlchemyObjectType + from augur.application.logs import AugurLogger from augur.application.config import AugurConfig from augur.application.db.session import DatabaseSession from augur.application.db.engine import get_database_string, create_database_engine from metadata import __version__ as augur_code_version +from augur.application.db.models import Repo, Issue, PullRequest, Message, PullRequestReview, Commit, IssueAssignee, PullRequestAssignee, PullRequestCommit, PullRequestFile, Contributor, IssueLabel, PullRequestLabel, ContributorsAlias, Release, ClientApplication + # from augur.api.routes import AUGUR_API_VERSION AUGUR_API_VERSION = "api/unstable" @@ -327,6 +333,335 @@ def get_server_cache(config, cache_manager) -> Cache: db_session = DatabaseSession(logger, engine) augur_config = AugurConfig(logger, db_session) + +def get_connection(table, cursor_field_name, connection_class, after, limit, extra_condition=False): + + cursor_field = getattr(table, cursor_field_name) + query = db_session.query(table).order_by(cursor_field) + + if after: + cursor_id = after + query = query.filter(cursor_field > cursor_id) + + if extra_condition: + field = getattr(table, extra_condition["field_name"]) + query = query.filter(field == extra_condition["value"]) + + # get one more item to determine if there is a next page + items = query.limit(limit + 1).all() + has_next_page = len(items) > limit + items = items[:limit] + + + if items: + next_cursor = getattr(items[-1], cursor_field_name) + else: + next_cursor = None + + return connection_class(items=items, page_info=PageInfoType(next_cursor=next_cursor, has_next_page=has_next_page)) + + + + +########### Repo Types ################## +class RepoType(SQLAlchemyObjectType): + class Meta: + model = Repo + use_connection = True + + issues = graphene.Field(lambda: IssueConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + prs = graphene.Field(lambda: PullRequestConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + messages = graphene.Field(lambda: MessageConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + releases = graphene.List(lambda: ReleaseType) + cursor = graphene.String() + + def resolve_cursor(self, info): + return str(self.repo_id) + + def resolve_issues(self, info, after=None, limit=None): + condition = {"field_name": "repo_id", "value": self.repo_id} + issue_connection = get_connection(Issue, "issue_id", IssueConnection, after, limit, condition) + return issue_connection + + def resolve_prs(self, info, after=None, limit=None): + condition = {"field_name": "repo_id", "value": self.repo_id} + pr_connection = get_connection(PullRequest, "pull_request_id", PullRequestConnection, after, limit, condition) + return pr_connection + + def resolve_messages(self, info, after=None, limit=None): + condition = {"field_name": "repo_id", "value": self.repo_id} + messages_connection = get_connection(Message, "msg_id", MessageConnection, after, limit,condition) + return messages_connection + + def resolve_releases(self, info): + return self.releases + +class ReleaseType(SQLAlchemyObjectType): + + class Meta: + model = Release + use_connection = True + + +############### Issue Objects ############# +class IssueType(SQLAlchemyObjectType): + class Meta: + model = Issue + use_connection = True + + repo = graphene.Field(RepoType) + messages = graphene.List(lambda: MessageType) + 
labels = graphene.List(lambda: IssueLabelType) + assignees = graphene.List(lambda: IssueAssigneeType) + cursor = graphene.String() + + def resolve_cursor(self, info): + return str(self.issue_id) + + def resolve_repo(self, info): + return self.repo + + def resolve_messages(self, info): + messages = [ref.message for ref in self.message_refs] + return messages + + def resolve_labels(self, info): + return self.labels + + def resolve_assignees(self, info): + return self.assignees + +class IssueAssigneeType(SQLAlchemyObjectType): + + class Meta: + model = IssueAssignee + use_connection = True + +class IssueLabelType(SQLAlchemyObjectType): + + class Meta: + model = IssueLabel + use_connection = True + + +################ Pull Request Objects ############ +class PullRequestType(SQLAlchemyObjectType): + class Meta: + model = PullRequest + use_connection = True + + repo = graphene.Field(RepoType) + messages = graphene.List(lambda: MessageType) + reviews = graphene.List(lambda: PullRequestReviewType) + labels = graphene.List(lambda: PrLabelType) + assignees = graphene.List(lambda: PullRequestAssigneeType) + files = graphene.List(lambda: PullRequestFileType) + cursor = graphene.String() + + def resolve_cursor(self, info): + return str(self.pull_request_id) + + def resolve_repo(self, info): + return self.repo + + def resolve_messages(self, info): + messages = [ref.message for ref in self.message_refs] + return messages + + def resolve_reviews(self, info): + return self.reviews + + def resolve_labels(self, info): + return self.labels + + def resolve_assignees(self, info): + return self.assignees + + def resolve_files(self, info): + return self.files + +class PullRequestAssigneeType(SQLAlchemyObjectType): + + class Meta: + model = PullRequestAssignee + use_connection = True + +class PullRequestReviewType(SQLAlchemyObjectType): + + class Meta: + model = PullRequestReview + use_connection = True + +class PrLabelType(SQLAlchemyObjectType): + + class Meta: + model = PullRequestLabel + use_connection = True + + +class PullRequestFileType(SQLAlchemyObjectType): + + class Meta: + model = PullRequestFile + use_connection = True + +class PullRequestCommitType(SQLAlchemyObjectType): + + class Meta: + model = PullRequestCommit + use_connection = True + + + +########### Contributor Types ############# +class ContributorType(SQLAlchemyObjectType): + + class Meta: + model = Contributor + use_connection = True + + issues_opened = graphene.List(lambda: IssueType) + pull_requests = graphene.List(lambda: PullRequestType) + pull_request_reviews = graphene.List(lambda: PullRequestReviewType) + commits = graphene.List(lambda: CommitType) + cursor = graphene.String() + + def resolve_cursor(self, info): + return str(self.cntrb_id) + + def resolve_issues_opened(self, info): + return self.issues_opened + + def resolve_pull_requests(self, info): + return self.pull_requests + + def resolve_pull_request_reviews(self, info): + return self.pull_request_reviews + + def resolve_commits(self, info): + return self.commits + +class ContributorAliasType(SQLAlchemyObjectType): + + class Meta: + model = ContributorsAlias + use_connection = True + + + +########### Other Types ################ +class MessageType(SQLAlchemyObjectType): + + class Meta: + model = Message + use_connection = True + + def resolve_repo(self, info): + return self.repo + + cursor = graphene.String() + + def resolve_cursor(self, info): + return str(self.msg_id) + +class CommitType(SQLAlchemyObjectType): + + class Meta: + model = Commit + use_connection = True + + 
messages = graphene.List(MessageType) + + def resolve_repo(self, info): + return self.repo + +class PageInfoType(graphene.ObjectType): + next_cursor = graphene.String() + has_next_page = graphene.Boolean() + + + + +########### Connection Objects ############# +class GenericConnection(graphene.ObjectType): + page_info = graphene.Field(PageInfoType) + +class RepoConnection(GenericConnection): + items = graphene.List(RepoType) + +class IssueConnection(GenericConnection): + items = graphene.List(IssueType) + +class PullRequestConnection(GenericConnection): + items = graphene.List(PullRequestType) + +class CommitConnection(GenericConnection): + items = graphene.List(CommitType) + +class ContributorConnection(GenericConnection): + items = graphene.List(ContributorType) + +class MessageConnection(GenericConnection): + items = graphene.List(MessageType) + + +############### Base Query object ############## +class Query(graphene.ObjectType): + + repos = graphene.Field(RepoConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + repo = graphene.Field(RepoType, id=graphene.Int()) + + issues = graphene.Field(IssueConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + issue = graphene.Field(IssueType, id=graphene.Int()) + + prs = graphene.Field(PullRequestConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + pr = graphene.List(PullRequestType, id=graphene.Int()) + + messages = graphene.Field(MessageConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + commits = graphene.Field(CommitConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + + contributors = graphene.Field(ContributorConnection, after=graphene.String(), limit=graphene.Int(default_value=10)) + contributor = graphene.Field(ContributorType, id=graphene.UUID()) + + def resolve_repos(self, info, after=None, limit=None): + repo_connection = get_connection(Repo, "repo_id", RepoConnection, after, limit) + return repo_connection + + def resolve_repo(self, info, id): + return db_session.query(Repo).filter(Repo.repo_id==id).first() + + def resolve_issues(self, info, after=None, limit=None): + issue_connection = get_connection(Issue, "issue_id", IssueConnection, after, limit) + return issue_connection + + def resolve_issue(self, info, id): + return db_session.query(Issue).filter(Issue.issue_id==id).first() + + def resolve_prs(self, info, after=None, limit=None): + pr_connection = get_connection(PullRequest, "pull_request_id", PullRequestConnection, after, limit) + return pr_connection + + def resolve_pr(self, info, id): + return db_session.query(PullRequest).filter(PullRequest.pull_request_id==id).first() + + def resolve_messages(self, info, after=None, limit=None): + messages_connection = get_connection(Message, "msg_id", MessageConnection, after, limit) + return messages_connection + + def resolve_commits(self, info, after=None, limit=None): + commit_connection = get_connection(Commit, "cmt_id", CommitConnection, after, limit) + return commit_connection + + def resolve_contributors(self, info, after=None, limit=None): + contributors_connection = get_connection(Contributor, "cntrb_id", ContributorConnection, after, limit) + return contributors_connection + + def resolve_contributor(self, info, id): + return db_session.query(Contributor).filter(Contributor.cntrb_id==id).first() + + + + template_dir = str(Path(__file__).parent.parent / "templates") static_dir = str(Path(__file__).parent.parent / "static") @@ -371,6 +706,25 @@ def status(): 
status=200, mimetype="application/json") +schema = graphene.Schema(query=Query) + +class AuthenticatedGraphQLView(GraphQLView): + def dispatch_request(self): + + api_key = request.headers.get('x-api-key') + + client_applications = db_session.query(ClientApplication).all() + api_keys = [app.api_key for app in client_applications] + + if not api_key or api_key not in api_keys: + return jsonify(error="Invalid or missing API key"), 403 + + return super().dispatch_request() + +schema = graphene.Schema(query=Query) + +app.add_url_rule(f'/{app.augur_api_version}/graphql', view_func=AuthenticatedGraphQLView.as_view('graphql', schema=schema, graphiql=True)) + from .routes import * # import frontend routes diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 287b079436..598c0cdb6d 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -102,7 +102,18 @@ def av_add_user_repo(): if rg_obj: # add the orgs repos to the group add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + org_name, repo_name = Repo.parse_github_repo_url(url) + repo_git = f"https://gitlab.com/{org_name}/{repo_name}" + + # TODO: gitlab ensure the whole repo git is inserted so it can be found here + repo_obj = Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + else: invalid_urls.append(url) diff --git a/augur/api/view/routes.py b/augur/api/view/routes.py index 8a9fc0597c..72164a9291 100644 --- a/augur/api/view/routes.py +++ b/augur/api/view/routes.py @@ -1,4 +1,8 @@ +""" +Defines the api routes for the augur views +""" import logging +import math from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash from sqlalchemy.orm.exc import NoResultFound from .utils import * @@ -37,9 +41,9 @@ def root(path=""): def logo(brand=None): if brand is None: return redirect(url_for('static', filename='img/augur_logo.png')) - elif "augur" in brand: + if "augur" in brand: return logo(None) - elif "chaoss" in brand: + if "chaoss" in brand: return redirect(url_for('static', filename='img/Chaoss_Logo_white.png')) return "" @@ -75,10 +79,16 @@ def repo_table_view(): if current_user.is_authenticated: data = current_user.get_repos(page = page, sort = sorting, direction = direction, search=query)[0] - page_count = (current_user.get_repo_count(search = query)[0] or 0) // pagination_offset + repos_count = (current_user.get_repo_count(search = query)[0] or 0) else: data = get_all_repos(page = page, sort = sorting, direction = direction, search=query)[0] - page_count = (get_all_repos_count(search = query)[0] or 0) // pagination_offset + repos_count = (get_all_repos_count(search = query)[0] or 0) + + page_count = math.ceil(repos_count / pagination_offset) - 1 + + if not data: + data = None + return render_module("repos-table", title="Repos", repos=data, query_key=query, activePage=page, pages=page_count, offset=pagination_offset, PS="repo_table_view", reverse = rev, sorting = sorting) diff --git a/augur/api/view/utils.py b/augur/api/view/utils.py index 2289355743..298e9950ae 100644 --- a/augur/api/view/utils.py +++ b/augur/api/view/utils.py @@ -1,10 +1,24 @@ +""" +Defines utility functions used by the augur api views +""" from pathlib import Path from concurrent.futures import ThreadPoolExecutor from flask import 
render_template, flash, url_for, Flask +from .init import init_logging from .init import * from ..server import app, db_session from augur.application.config import AugurConfig -import urllib.request, urllib.error, json, os, math, yaml, urllib3, time, logging, re +import urllib.request, urllib.error, json, os, math, yaml, urllib3, time, logging, re, math + +from augur.application.db.session import DatabaseSession +from augur.application.db.engine import DatabaseEngine +from augur.application.db.models import User, Repo, RepoGroup, UserGroup, UserRepo +from sqlalchemy import Column, Table, Integer, MetaData, or_ +from sqlalchemy.sql.operators import ilike_op, distinct_op +from sqlalchemy.sql.functions import coalesce +from augur.application.db.models.base import Base + +from sqlalchemy.orm import Query init_logging() @@ -66,6 +80,8 @@ def getSetting(key, section = "View"): loadSettings() +#version_check(settings) + """ ---------------------------------------------------------------- """ def loadReports(): @@ -298,3 +314,6 @@ def render_message(messageTitle, messageBody = None, title = None, redirect = No def render_module(module, **args): args.setdefault("body", module) return render_template('index.j2', **args) + +""" ---------------------------------------------------------------- +""" diff --git a/augur/application/cli/_multicommand.py b/augur/application/cli/_multicommand.py index 2df6e8b115..c0d8b1a967 100644 --- a/augur/application/cli/_multicommand.py +++ b/augur/application/cli/_multicommand.py @@ -27,7 +27,7 @@ def get_command(self, ctx, name): try: module = importlib.import_module('.' + name, 'augur.application.cli') return module.cli - except ModuleNotFoundError: + except ModuleNotFoundError as e: pass @click.command(cls=AugurMultiCommand, context_settings=CONTEXT_SETTINGS) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 548c1eeff4..9b6894a7dd 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -19,7 +19,8 @@ from datetime import datetime from augur import instance_id -from augur.tasks.start_tasks import augur_collection_monitor, CollectionState, create_collection_status_records +from augur.tasks.util.collection_state import CollectionState +from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.tasks.init.redis_connection import redis_connection @@ -91,9 +92,12 @@ def start(disable_collection, development, port): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - celery_beat_process = None - celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) + with DatabaseSession(logger) as db_session: + config = AugurConfig(logger, db_session) + log_level = config.get_value("Logging", "log_level") + celery_beat_process = None + celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) if not disable_collection: @@ -177,14 +181,14 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.2, 25) + secondary_num_processes = determine_worker_processes(.25, 25) 
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) sleep_time += 6 #15% of estimate, Maximum value of 20 - facade_num_processes = determine_worker_processes(.2, 20) + facade_num_processes = determine_worker_processes(.15, 20) logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index f09aaabbd2..42d57ecc6b 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -99,7 +99,7 @@ def add_repo_groups(filename): """ Create new repo groups in Augur's database """ - with DatabaseEngine() as engine, engine.connect() as connection: + with DatabaseEngine() as engine, engine.begin() as connection: df = pd.read_sql( s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), @@ -248,7 +248,7 @@ def update_api_key(api_key): """ ) - with DatabaseEngine() as engine, engine.connect() as connection: + with DatabaseEngine() as engine, engine.begin() as connection: connection.execute(update_api_key_sql, api_key=api_key) logger.info(f"Updated Augur API key to: {api_key}") diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index db31943ff1..b4bec994eb 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -36,8 +36,8 @@ def start(): secondary_worker_process = None scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=14 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=5 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) @@ -92,4 +92,4 @@ def clear(): else: logger.error("Invalid input") - \ No newline at end of file + diff --git a/augur/application/config.py b/augur/application/config.py index c9aff085b1..7cf1eca3fb 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -68,7 +68,7 @@ def get_development_flag(): }, "Celery": { "worker_process_vmem_cap": 0.25, - "refresh_materialized_views_interval_in_days": 7 + "refresh_materialized_views_interval_in_days": 1 }, "Redis": { "cache_group": 0, diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index abdc6de54c..7562181398 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -37,8 +37,63 @@ def extract_needed_pr_label_data(labels: List[dict], repo_id: int, tool_source: return label_dicts -# retrieve only the needed data for pr assignees from the api response + +def 
extract_needed_mr_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + labels: List of dictionaries of label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + 'pr_src_id': label['id'], + 'pr_src_node_id': None, + 'pr_src_url': None, + 'pr_src_description': label['name'], + 'pr_src_color': label['color'], + # TODO: Populate this by making an api call for each label + 'pr_src_default_bool': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'repo_id': repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr assignees from the api response + + Arguments: + assignees: List of dictionaries of asignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed asignee dicts + """ if len(assignees) == 0: return [] @@ -48,7 +103,6 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so for assignee in assignees: assignee_dict = { - # store the pr_url data on in the pr assignee data for now so we can relate it back to a pr later 'contrib_id': assignee["cntrb_id"], 'pr_assignee_src_id': int(assignee['id']), 'tool_source': tool_source, @@ -61,8 +115,59 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so return assignee_dicts -# retrieve only the needed data for pr reviewers from the api response +def extract_needed_merge_request_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for merge request assignees from the api response + + Arguments: + assignees: List of dictionaries of asignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed asignee dicts + """ + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + 'contrib_id': None, + 'repo_id': repo_id, + # TODO: Temporarily setting this to id which the id of the contributor, unitl we can get the contrib_id set and create a unique on the contrib_id and the pull_request_id + 'pr_assignee_src_id': assignee["id"], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + + + def extract_needed_pr_reviewer_data(reviewers: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr reviewers from the api response + + Arguments: + reviewers: List of dictionaries of reviewer data + 
repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed reviewer dicts + """ if len(reviewers) == 0: return [] @@ -247,6 +352,42 @@ def extract_needed_issue_assignee_data(assignees: List[dict], repo_id: int, tool return assignee_dicts +def extract_needed_gitlab_issue_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for gitlab issue assignees from the api response + + Arguments: + assignees: List of dictionaries of gitlab assignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed assignee dicts + """ + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + "cntrb_id": None, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "issue_assignee_src_id": assignee['id'], + "issue_assignee_src_node": None, + "repo_id": repo_id + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + # retrieve only the needed data for pr labels from the api response @@ -277,9 +418,62 @@ def extract_needed_issue_label_data(labels: List[dict], repo_id: int, tool_sourc return label_dicts +def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for gitlab issue labels from the api response + + Arguments: + labels: List of dictionaries of gitlab issue label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + "label_text": label["name"], + "label_description": label.get("description", None), + "label_color": label['color'], + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "label_src_id": label['id'], + "label_src_node_id": None, + "repo_id": repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + -# retrieve only the needed data for pr labels from the api response def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr labels from the api response + + Arguments: + message: Message data dict + issue_id: id of the issue + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict of message ref data. 
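As a quick illustration of how the GitLab parsers added above are meant to be used, the label helper maps a raw API payload straight onto Augur's label columns. A minimal sketch with a made-up sample payload and sample tool_source/tool_version/data_source values (none of this is taken from the patch itself):

    from augur.application.db.data_parse import extract_needed_gitlab_issue_label_data

    # Shape of a single label object as returned by GitLab's labels API (illustrative only).
    raw_labels = [
        {"id": 101, "name": "bug", "color": "#d73a4a", "description": "Something is broken"},
    ]

    rows = extract_needed_gitlab_issue_label_data(
        raw_labels,
        repo_id=1,
        tool_source="Gitlab issue task",   # sample provenance values
        tool_version="2.0",
        data_source="Gitlab API",
    )

    # Each row now carries label_text/label_color plus the repo id and provenance
    # columns, ready for the usual bulk-insert path.
    assert rows[0]["label_text"] == "bug" and rows[0]["repo_id"] == 1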
+ """ message_ref_dict = { 'issue_id': issue_id, @@ -311,9 +505,21 @@ def extract_needed_pr_message_ref_data(comment: dict, pull_request_id: int, repo def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): + """ + Retrieve only the needed data for the pr api response + + Arguments: + pr: PR data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + + Returns: + Parsed pr dict + """ - pr_dict = { + pr = { 'repo_id': repo_id, 'pr_url': pr['url'], # 1-22-2022 inconsistent casting; sometimes int, sometimes float in bulk_insert @@ -367,9 +573,23 @@ def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): 'data_source': 'GitHub API' } - return pr_dict + return pr def extract_needed_issue_data(issue: dict, repo_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Retrieve only the needed data for the issue api response + + Arguments: + issue: Issue data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed issue dict + """ dict_data = { 'cntrb_id': None, # this the contributor who closed the issue @@ -513,8 +733,438 @@ def extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, return review_row +def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, tool_version): + """ + Retrieve only the needed data for the pr gitlab api response - + Arguments: + pr: PR data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + + + Returns: + Parsed pr dict + """ + + pr_dict = { + 'repo_id': repo_id, + 'pr_url': pr['web_url'], + 'pr_src_id': pr['id'], + 'pr_src_node_id': None, + 'pr_html_url': pr['web_url'], + 'pr_diff_url': None, + 'pr_patch_url': None, + 'pr_issue_url': None, + 'pr_augur_issue_id': None, + 'pr_src_number': pr['iid'], + 'pr_src_state': pr['state'], + 'pr_src_locked': pr['discussion_locked'], + 'pr_src_title': pr['title'], + # TODO: Add contributor logic for gitlab + 'pr_augur_contributor_id': None, + 'pr_body': pr['description'], + 'pr_created_at': pr['created_at'], + 'pr_updated_at': pr['updated_at'], + 'pr_closed_at': pr['closed_at'], + 'pr_merged_at': pr['merged_at'], + 'pr_merge_commit_sha': pr['merge_commit_sha'], + 'pr_teams': None, + 'pr_milestone': pr['milestone'].get('title') if pr['milestone'] else None, + 'pr_commits_url': None, + 'pr_review_comments_url': None, + 'pr_review_comment_url': None, + 'pr_comments_url': None, + 'pr_statuses_url': None, + 'pr_meta_head_id': None, + 'pr_meta_base_id': None, + 'pr_src_issue_url': None, + 'pr_src_comments_url': None, + 'pr_src_review_comments_url': None, + 'pr_src_commits_url': None, + 'pr_src_statuses_url': None, + 'pr_src_author_association': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': 'Gitlab API' + } + + return pr_dict + + +def extract_needed_issue_data_from_gitlab_issue(issue: dict, repo_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Retrieve only the needed data for the issue gitlab api response + + Arguments: + issue: Issue data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that 
processed the data + data_source: platform source + + Returns: + Parsed issue dict + """ + + issue_dict = { + "repo_id": repo_id, + "reporter_id": None, + "pull_request": None, + "pull_request_id": None, + "created_at": issue['created_at'], + "issue_title": issue['title'], + "issue_body": issue['description'] if 'description' in issue else None, + "comment_count": issue['user_notes_count'], + "updated_at": issue['updated_at'], + "closed_at": issue['closed_at'], + "repository_url": issue['_links']['project'], + "issue_url": issue['_links']['self'], + "labels_url": None, + "comments_url": issue['_links']['notes'], + "events_url": None, + "html_url": issue['_links']['self'], + "issue_state": issue['state'], + "issue_node_id": None, + "gh_issue_id": issue['id'], + "gh_issue_number": issue['iid'], + "gh_user_id": issue['author']['id'], + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_dict + + + +def extract_gitlab_mr_event_data(event: dict, pr_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + """ + Retrieve only the needed data for the mr event gitlab api response + + Arguments: + event: Event data dict + pr_id: id of the pr + platform_id: id of the platform + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed event dict + """ + + mr_event = { + 'pull_request_id': pr_id, + 'cntrb_id': None, + 'action': event['action_name'], + 'action_commit_hash': None, + 'created_at': event['created_at'], + 'issue_event_src_id': event['target_id'], + 'repo_id': repo_id, + 'platform_id': platform_id, + 'node_id': None, + 'node_url': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return mr_event + +def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + """ + Retrieve only the needed data for the issue event gitlab api response + + Arguments: + event: Event data dict + issue_id: id of the issue + platform_id: id of the platform + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed event dict + """ + + issue_event = { + "issue_event_src_id": event['target_id'], + "issue_id": issue_id, + "node_id": None, + "node_url": None, + "cntrb_id": None, + "created_at": event['created_at'], + "action": event["action_name"], + "action_commit_hash": None, + "platform_id": platform_id, + "repo_id" : repo_id, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_event + + +def extract_needed_mr_reviewer_data(data: List[dict], pull_request_id, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr reviewers from the api response + + Arguments: + data: List of dictionaries that contain mr reviewer data to parse + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of extracted relevant data from needed mr reviwer data + """ + + if len(data) == 0: + 
return [] + + reviewer_dicts = [] + for x in data: + + for _ in x["suggested_approvers"]: + + reviewer_dict = { + 'pull_request_id': pull_request_id, + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + reviewer_dicts.append(reviewer_dict) + + return reviewer_dicts + + +def extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr commit data from the api response + + Arguments: + commit: commit data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dictionary of the extracted commit data + """ + + commit = { + 'pull_request_id': pull_request_id, + 'pr_cmt_sha': commit['id'], + 'pr_cmt_node_id': None, + 'pr_cmt_message': commit['message'], + 'repo_id': repo_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + return commit + + +def extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr file data from the api response + Arguments: + gitlab_file_data: file data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of dicts of parsed gitlab file changes + """ + files = [] + + changes = gitlab_file_data["changes"] + for file_changes in changes: + try: + deletes = int(file_changes['diff'].split('@@')[1].strip().split(' ')[0].split(',')[1]) + adds = int(file_changes['diff'].split('@@')[1].strip().split(' ')[1].split(',')[1]) + except Exception: + deletes = 0 + adds = 0 + + file_dict = { + 'pull_request_id': pull_request_id, + 'repo_id': repo_id, + 'pr_file_additions': adds, + 'pr_file_deletions': deletes, + 'pr_file_path': file_changes['old_path'], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + files.append(file_dict) + + return files + + +def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr metadata from the api response + + Arguments: + mr_dict: mr data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of dicts of parsed mr metadata + """ + head = {'sha': mr_dict['diff_refs']['head_sha'], + 'ref': mr_dict['target_branch'], + 'label': str(mr_dict['target_project_id']) + ':' + mr_dict['target_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['target_project_id']) + } + + base = {'sha': mr_dict['diff_refs']['base_sha'], + 'ref': mr_dict['source_branch'], + 'label': str(mr_dict['source_project_id']) + ':' + mr_dict['source_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['source_project_id']) + } + + pr_meta_dict = { + 'head': head, + 'base': base + } + all_meta = [] + for pr_side, pr_meta_data in pr_meta_dict.items(): + pr_meta = { + 'pull_request_id': 
pull_request_id, + 'repo_id': repo_id, + 'pr_head_or_base': pr_side, + 'pr_src_meta_label': pr_meta_data['label'], + 'pr_src_meta_ref': pr_meta_data['ref'], + 'pr_sha': pr_meta_data['sha'], + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + all_meta.append(pr_meta) + + return all_meta + + +def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Extract the message id for a given message on an issue from an api response + and connect it to the relevant repo id. + + Arguments: + message: message data dict + issue_id: id of the issue + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing the message ref id as well as the repo id. + """ + + message_ref_dict = { + 'issue_id': issue_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'issue_msg_ref_src_comment_id': int(message['id']), + 'issue_msg_ref_src_node_id': None, + 'repo_id': repo_id + } + + return message_ref_dict + + +def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Extract specific metadata for a comment from an api response + and connect it to the relevant platform id. + + Arguments: + comment: comment data dict + platform_id: augur id of the relevant platform + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing parsed comment text and metadata + """ + + comment_dict = { + "pltfrm_id": platform_id, + "msg_text": comment['body'], + "msg_timestamp": comment['created_at'], + "cntrb_id": None, + "platform_msg_id": int(comment['id']), + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return comment_dict + +def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr labels from the api response + + Arguments: + comment: comment data dict + pull_request_id: id of the PR + repo_id: augur id of the repository + platform_id: augur id of the relevant platform + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing the comment, pr and repo id of the parsed comment data. 
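The additions/deletions logic in extract_needed_mr_file_data above leans on the unified-diff hunk header that GitLab returns in each change's diff field. A small worked example with an invented header shows what the chained split calls pick out; the two numbers are the line spans of the old-side and new-side hunk ranges, which the patch treats as deletions and additions (falling back to 0 when the header has no comma):

    # A GitLab change diff typically starts with a hunk header such as:
    #   @@ -12,5 +12,9 @@ def some_function():
    sample_diff = "@@ -12,5 +12,9 @@ def some_function():\n-old line\n+new line\n"

    header = sample_diff.split('@@')[1].strip()   # "-12,5 +12,9"
    old_range, new_range = header.split(' ')      # "-12,5", "+12,9"

    deletes = int(old_range.split(',')[1])        # 5 -> line count of the old-side range
    adds = int(new_range.split(',')[1])           # 9 -> line count of the new-side range

    print(deletes, adds)                          # 5 9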
+ """ + + pr_msg_ref = { + 'pull_request_id': pull_request_id, + 'pr_message_ref_src_comment_id': comment['id'], + 'repo_id': repo_id, + 'pr_message_ref_src_node_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return pr_msg_ref diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 676a71deec..7f97e4bbdc 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -259,8 +259,15 @@ class Contributor(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) + issues_opened = relationship("Issue", primaryjoin="Issue.reporter_id == Contributor.cntrb_id", back_populates="reporter") + pull_requests = relationship("PullRequest", back_populates="cntrb") + pull_request_reviews = relationship("PullRequestReview", back_populates="cntrb") + commits = relationship("Commit", primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", back_populates="contributor") + alias = relationship("ContributorsAlias", back_populates="cntrb") + @classmethod def from_github(cls, contributor, tool_source, tool_version, data_source): + from augur.tasks.util.AugurUUID import GithubUUID cntrb_id = GithubUUID() cntrb_id["user"] = contributor["id"] @@ -557,6 +564,8 @@ class RepoGroup(Base): data_source = Column(String) data_collection_date = Column(TIMESTAMP(precision=0)) + repo = relationship("Repo", back_populates="repo_group") + @staticmethod def is_valid_repo_group_id(session, repo_group_id: int) -> bool: """Deterime is repo_group_id exists. @@ -794,7 +803,7 @@ class ContributorsAlias(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - cntrb = relationship("Contributor") + cntrb = relationship("Contributor", back_populates="alias") class Repo(Base): @@ -859,9 +868,14 @@ class Repo(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - repo_group = relationship("RepoGroup") - user_repo = relationship("UserRepo") + repo_group = relationship("RepoGroup", back_populates="repo") + user_repo = relationship("UserRepo", back_populates="repo") collection_status = relationship("CollectionStatus", back_populates="repo") + issues = relationship("Issue", back_populates="repo") + prs = relationship("PullRequest", back_populates="repo") + messages = relationship("Message", back_populates="repo") + commits = relationship("Commit", back_populates="repo") + releases = relationship("Release", back_populates="repo") @staticmethod def get_by_id(session, repo_id): @@ -915,6 +929,44 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return False, {"status": f"Github Error: {data['message']}"} return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + + @staticmethod + def is_valid_gitlab_repo(gl_session, url: str) -> bool: + """Determine whether a GitLab repo URL is valid. 
+ + Args: + gl_session: GitLab session object with API key + url: Repository URL + + Returns: + True if repo URL is valid, False otherwise + """ + from augur.tasks.github.util.github_paginator import hit_api + + REPO_ENDPOINT = "https://gitlab.com/api/v4/projects/{}/" + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + return False, {"status": "Invalid repo URL"} + + # Encode namespace and project name for the API request + project_identifier = f"{owner}%2F{repo}" + url = REPO_ENDPOINT.format(project_identifier) + + attempts = 0 + while attempts < 10: + response = hit_api(gl_session.oauths, url, logger) + + if response.status_code == 404: + return False, {"status": "Invalid repo"} + + if response.status_code == 200: + return True, {"status": "Valid repo"} + + attempts += 1 + + return False, {"status": "Failed to validate repo after multiple attempts"} + @staticmethod def parse_github_repo_url(url: str) -> tuple: @@ -934,6 +986,29 @@ def parse_github_repo_url(url: str) -> tuple: capturing_groups = result.groups() + owner = capturing_groups[0] + repo = capturing_groups[1] + + return owner, repo + + @staticmethod + def parse_gitlab_repo_url(url: str) -> tuple: + """ Gets the owner and repo from a gitlab url. + + Args: + url: Gitlab url + + Returns: + Tuple of owner and repo. Or a tuple of None and None if the url is invalid. + """ + + result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url) + + if not result: + return None, None + + capturing_groups = result.groups() + owner = capturing_groups[0] repo = capturing_groups[1] @@ -960,7 +1035,7 @@ def parse_github_org_url(url): return result.groups()[0] @staticmethod - def insert(session, url: str, repo_group_id: int, tool_source, repo_type): + def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): """Add a repo to the repo table. Args: @@ -971,6 +1046,54 @@ def insert(session, url: str, repo_group_id: int, tool_source, repo_type): If repo row exists then it will update the repo_group_id if param repo_group_id is not a default. If it does not exist is will simply insert the repo. """ + if not isinstance(url, str) or not isinstance(repo_group_id, int) or not isinstance(tool_source, str): + return None + + if not RepoGroup.is_valid_repo_group_id(session, repo_group_id): + return None + + if url.endswith("/"): + url = url[:-1] + + url = url.lower() + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + return None + + repo_data = { + "repo_group_id": repo_group_id, + "repo_git": url, + "repo_path": f"gitlab.com/{owner}/", + "repo_name": repo, + "repo_type": None, + "tool_source": tool_source, + "tool_version": "1.0", + "data_source": "Git" + } + + repo_unique = ["repo_git"] + return_columns = ["repo_id"] + result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + + if not result: + return None + + return result[0]["repo_id"] + + @staticmethod + def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type): + """Add a repo to the repo table. + + Args: + url: repo url + repo_group_id: group to assign repo to + repo_type: github or gitlab + + Note: + If repo row exists then it will update the repo_group_id if param repo_group_id is not a default. If it does not exist is will simply insert the repo. 
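For context on the %2F in is_valid_gitlab_repo above: GitLab's projects endpoint expects the namespace/project pair as a single URL-encoded path segment rather than separate owner and repo parts. A standard-library sketch of that encoding (the helper name is invented for illustration):

    from urllib.parse import quote

    def gitlab_project_endpoint(owner: str, repo: str) -> str:
        # Encode "owner/repo" as one path segment, e.g. "chaoss%2Faugur".
        project_identifier = quote(f"{owner}/{repo}", safe="")
        return f"https://gitlab.com/api/v4/projects/{project_identifier}/"

    print(gitlab_project_endpoint("chaoss", "augur"))
    # https://gitlab.com/api/v4/projects/chaoss%2Faugur/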
+ """ + if not isinstance(url, str) or not isinstance(repo_group_id, int) or not isinstance(tool_source, str) or not isinstance(repo_type, str): return None @@ -1194,12 +1317,10 @@ class Commit(Base): contributor = relationship( "Contributor", primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", + back_populates="commits" ) - contributor1 = relationship( - "Contributor", - primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", - ) - repo = relationship("Repo") + repo = relationship("Repo", back_populates="commits") + message_ref = relationship("CommitCommentRef", back_populates="cmt") class Issue(Base): @@ -1259,12 +1380,14 @@ class Issue(Base): ) cntrb = relationship( - "Contributor", primaryjoin="Issue.cntrb_id == Contributor.cntrb_id" - ) - repo = relationship("Repo") + "Contributor", primaryjoin="Issue.cntrb_id == Contributor.cntrb_id") + repo = relationship("Repo", back_populates="issues") reporter = relationship( - "Contributor", primaryjoin="Issue.reporter_id == Contributor.cntrb_id" + "Contributor", primaryjoin="Issue.reporter_id == Contributor.cntrb_id", back_populates="issues_opened" ) + message_refs = relationship("IssueMessageRef", back_populates="issue") + assignees = relationship("IssueAssignee", back_populates="issue") + labels = relationship("IssueLabel", back_populates="issue") # @classmethod # def from_github(cls): @@ -1408,8 +1531,11 @@ class Message(Base): cntrb = relationship("Contributor") pltfrm = relationship("Platform") - repo = relationship("Repo") + repo = relationship("Repo", back_populates="messages") rgls = relationship("RepoGroupsListServe") + pr_message_ref = relationship("PullRequestMessageRef", back_populates="message") + issue_message_ref = relationship("IssueMessageRef", back_populates="message") + commit_message_ref = relationship("CommitCommentRef", back_populates="msg") # @classmethod # def from_github(cls): @@ -1582,8 +1708,13 @@ class PullRequest(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - pr_augur_contributor = relationship("Contributor") - repo = relationship("Repo") + cntrb = relationship("Contributor", back_populates="pull_requests") + repo = relationship("Repo", back_populates="prs") + message_refs = relationship("PullRequestMessageRef", back_populates="pr") + reviews = relationship("PullRequestReview", back_populates="pr") + labels = relationship("PullRequestLabel", back_populates="pull_request") + assignees = relationship("PullRequestAssignee", back_populates="pull_request") + files = relationship("PullRequestFile", back_populates="") @classmethod def from_github(cls, pr, repo_id, tool_source, tool_version): @@ -1661,7 +1792,7 @@ class Release(Base): TIMESTAMP(precision=6), server_default=text("CURRENT_TIMESTAMP") ) - repo = relationship("Repo") + repo = relationship("Repo", back_populates="releases") class RepoBadging(Base): @@ -2136,7 +2267,7 @@ class CommitCommentRef(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - cmt = relationship("Commit") + cmt = relationship("Commit", back_populates="message_ref") msg = relationship("Message") @@ -2236,7 +2367,7 @@ class IssueAssignee(Base): ) cntrb = relationship("Contributor") - issue = relationship("Issue") + issue = relationship("Issue", back_populates="assignees") repo = relationship("Repo") @classmethod @@ -2379,7 +2510,7 @@ class IssueLabel(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - issue = relationship("Issue") + issue = relationship("Issue", 
back_populates="labels") repo = relationship("Repo") @classmethod @@ -2456,8 +2587,8 @@ class IssueMessageRef(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - issue = relationship("Issue") - msg = relationship("Message") + issue = relationship("Issue", back_populates="message_refs") + message = relationship("Message", back_populates="issue_message_ref") repo = relationship("Repo") @@ -2683,7 +2814,7 @@ class PullRequestAssignee(Base): ) contrib = relationship("Contributor") - pull_request = relationship("PullRequest") + pull_request = relationship("PullRequest", back_populates="assignees") repo = relationship("Repo") @classmethod @@ -2896,7 +3027,7 @@ class PullRequestFile(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - pull_request = relationship("PullRequest") + pull_request = relationship("PullRequest", back_populates="files") repo = relationship("Repo") # @classmethod @@ -2945,7 +3076,7 @@ class PullRequestLabel(Base): ) - pull_request = relationship("PullRequest") + pull_request = relationship("PullRequest", back_populates="labels") repo = relationship("Repo") @classmethod @@ -3013,8 +3144,8 @@ class PullRequestMessageRef(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - msg = relationship("Message") - pull_request = relationship("PullRequest") + message = relationship("Message", back_populates="pr_message_ref") + pr = relationship("PullRequest", back_populates="message_refs") repo = relationship("Repo") @@ -3209,9 +3340,9 @@ class PullRequestReview(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - cntrb = relationship("Contributor") + cntrb = relationship("Contributor", back_populates="pull_request_reviews") platform = relationship("Platform") - pull_request = relationship("PullRequest") + pr = relationship("PullRequest", back_populates="reviews") repo = relationship("Repo") # @classmethod diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index a2e3a6c4d8..47f28b12f2 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -57,7 +57,7 @@ def retrieve_owner_repos(session, owner: str) -> List[str]: # collect repo urls for the given owner repos = [] - for page_data in GithubPaginator(url, session.oauths, logger).iter_pages(): + for page_data, _ in GithubPaginator(url, session.oauths, logger).iter_pages(): if page_data is None: break @@ -271,9 +271,9 @@ class User(Base): {"schema": "augur_operations"} ) - groups = relationship("UserGroup") - tokens = relationship("UserSessionToken") - applications = relationship("ClientApplication") + groups = relationship("UserGroup", back_populates="user") + tokens = relationship("UserSessionToken", back_populates="user") + applications = relationship("ClientApplication", back_populates="user") _is_authenticated = False _is_active = True @@ -449,17 +449,30 @@ def remove_group(self, group_name): return result - def add_repo(self, group_name, repo_url): + def add_github_repo(self, group_name, repo_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, self.user_id, group_name) + result = UserRepo.add_github_repo(session, repo_url, self.user_id, group_name) except NoValidKeysError: return False, {"status": "No valid keys"} return result + + 
def add_gitlab_repo(self, group_name, repo_url): + + from augur.tasks.gitlab.gitlab_task_session import GitlabTaskSession + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + try: + with GitlabTaskSession(logger) as session: + result = UserRepo.add_gitlab_repo(session, repo_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} + + return result + def remove_repo(self, group_name, repo_id): @@ -468,14 +481,14 @@ def remove_repo(self, group_name, repo_id): return result - def add_org(self, group_name, org_url): + def add_github_org(self, group_name, org_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + result = UserRepo.add_github_org_repos(session, org_url, self.user_id, group_name) except NoValidKeysError: return False, {"status": "No valid keys"} @@ -628,8 +641,8 @@ class UserGroup(Base): {"schema": "augur_operations"} ) - user = relationship("User") - repos = relationship("UserRepo") + user = relationship("User", back_populates="groups") + repos = relationship("UserRepo", back_populates="group") @staticmethod def insert(session, user_id:int, group_name:str) -> dict: @@ -739,8 +752,8 @@ class UserRepo(Base): ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) - repo = relationship("Repo") - group = relationship("UserGroup") + repo = relationship("Repo", back_populates="user_repo") + group = relationship("UserGroup", back_populates="repos") @staticmethod def insert(session, repo_id: int, group_id:int = 1) -> bool: @@ -769,9 +782,69 @@ def insert(session, repo_id: int, group_id:int = 1) -> bool: return False return data[0]["group_id"] == group_id and data[0]["repo_id"] == repo_id + + @staticmethod + def add_gitlab_repo(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_group_id=None) -> dict: + """Add repo to the user repo table + + Args: + urls: list of repo urls + user_id: id of user_id from users table + group_name: name of group to add repo to. 
+ group_id: id of the group + valid_repo: boolean that indicates whether the repo has already been validated + + Note: + Either the group_name or group_id can be passed not both + + Returns: + Dict that contains the key "status" and additional useful data + """ + + if group_name and group_id: + return False, {"status": "Pass only the group name or group id not both"} + + if not group_name and not group_id: + return False, {"status": "Need group name or group id to add a repo"} + + if group_id is None: + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False, {"status": "Invalid group name"} + + if not from_org_list: + result = Repo.is_valid_gitlab_repo(session, url) + if not result[0]: + return False, {"status": result[1]["status"], "repo_url": url} + + # if no repo_group_id is passed then assign the repo to the frontend repo group + if repo_group_id is None: + + frontend_repo_group = session.query(RepoGroup).filter(RepoGroup.rg_name == FRONTEND_REPO_GROUP_NAME).first() + if not frontend_repo_group: + return False, {"status": "Could not find repo group with name 'Frontend Repos'", "repo_url": url} + + repo_group_id = frontend_repo_group.repo_group_id + + + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend") + if not repo_id: + return False, {"status": "Repo insertion failed", "repo_url": url} + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False, {"status": "repo_user insertion failed", "repo_url": url} + + #collection_status records are now only added during collection -IM 5/1/23 + #status = CollectionStatus.insert(session, repo_id) + #if not status: + # return False, {"status": "Failed to create status for repo", "repo_url": url} + + return True, {"status": "Repo Added", "repo_url": url} @staticmethod - def add(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_type=None, repo_group_id=None) -> dict: + def add_github_repo(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_type=None, repo_group_id=None) -> dict: """Add repo to the user repo table Args: @@ -820,7 +893,7 @@ def add(session, url: List[str], user_id: int, group_name=None, group_id=None, f repo_group_id = frontend_repo_group.repo_group_id - repo_id = Repo.insert(session, url, repo_group_id, "Frontend", repo_type) + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} @@ -862,7 +935,7 @@ def delete(session, repo_id:int, user_id:int, group_name:str) -> dict: return True, {"status": "Repo Removed"} @staticmethod - def add_org_repos(session, url: List[str], user_id: int, group_name: int): + def add_github_org_repos(session, url: List[str], user_id: int, group_name: int): """Add list of orgs and their repos to a users repos. 
Args: @@ -911,7 +984,7 @@ def add_org_repos(session, url: List[str], user_id: int, group_name: int): failed_repos = [] for repo in repos: - result = UserRepo.add(session, repo, user_id, group_id=group_id, from_org_list=True, repo_type=type, repo_group_id=repo_group_id) + result = UserRepo.add_github_repo(session, repo, user_id, group_id=group_id, from_org_list=True, repo_type=type, repo_group_id=repo_group_id) # keep track of all the repos that failed if not result[0]: @@ -949,9 +1022,9 @@ class UserSessionToken(Base): application_id = Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey"), nullable=False) created_at = Column(BigInteger) - user = relationship("User") - application = relationship("ClientApplication") - refresh_tokens = relationship("RefreshToken") + user = relationship("User", back_populates="tokens") + application = relationship("ClientApplication", back_populates="sessions") + refresh_tokens = relationship("RefreshToken", back_populates="user_session") @staticmethod def create(session, user_id, application_id, seconds_to_expire=86400): @@ -991,9 +1064,9 @@ class ClientApplication(Base): redirect_url = Column(String, nullable=False) api_key = Column(String, nullable=False) - user = relationship("User") + user = relationship("User", back_populates="applications") sessions = relationship("UserSessionToken") - subscriptions = relationship("Subscription") + subscriptions = relationship("Subscription", back_populates="application") def __eq__(self, other): return isinstance(other, ClientApplication) and str(self.id) == str(other.id) @@ -1013,8 +1086,8 @@ class Subscription(Base): application_id = Column(ForeignKey("augur_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) type_id = Column(ForeignKey("augur_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) - application = relationship("ClientApplication") - type = relationship("SubscriptionType") + application = relationship("ClientApplication", back_populates="subscriptions") + type = relationship("SubscriptionType", back_populates="subscriptions") class SubscriptionType(Base): __tablename__ = "subscription_types" @@ -1027,7 +1100,7 @@ class SubscriptionType(Base): id = Column(BigInteger, primary_key=True) name = Column(String, nullable=False) - subscriptions = relationship("Subscription") + subscriptions = relationship("Subscription", back_populates="type") class RefreshToken(Base): @@ -1040,7 +1113,7 @@ class RefreshToken(Base): id = Column(String, primary_key=True) user_session_token = Column(ForeignKey("augur_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) - user_session = relationship("UserSessionToken") + user_session = relationship("UserSessionToken", back_populates="refresh_tokens") @staticmethod def create(session, user_session_token_id): @@ -1159,16 +1232,28 @@ def insert(session, repo_id): repo_git = repo.repo_git collection_status_unique = ["repo_id"] + pr_issue_count = 0 + github_weight = 0 + if "github" in repo_git: - try: - pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) - #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") - github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) - except Exception as e: - pr_issue_count = None - github_weight = None - session.logger.error( - ''.join(traceback.format_exception(None, e, 
e.__traceback__))) + try: + pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) + else: + try: + pr_issue_count = 0 + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) record = { @@ -1178,6 +1263,7 @@ def insert(session, repo_id): "secondary_weight": github_weight, "ml_weight": github_weight } + result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) diff --git a/augur/application/db/session.py b/augur/application/db/session.py index 2212c1fdc1..22379ad050 100644 --- a/augur/application/db/session.py +++ b/augur/application/db/session.py @@ -85,7 +85,7 @@ def __del__(self): def execute_sql(self, sql_text): - with self.engine.connect() as connection: + with self.engine.begin() as connection: return_data = connection.execute(sql_text) @@ -93,10 +93,10 @@ def execute_sql(self, sql_text): def fetchall_data_from_sql_text(self,sql_text): - with self.engine.connect() as connection: + with self.engine.begin() as connection: - result = connection.execute(sql_text).fetchall() - return [dict(zip(row.keys(), row)) for row in result] + result = connection.execute(sql_text) + return [dict(row) for row in result.mappings()] def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: @@ -174,7 +174,9 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s while attempts < 10: try: - with EngineConnection(self.engine) as connection: + #begin keyword is needed for sqlalchemy 2.x + #this is because autocommit support was removed in 2.0 + with self.engine.begin() as connection: connection.execute(stmnt) break except OperationalError as e: @@ -191,14 +193,16 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s raise e except Exception as e: - if(len(data) == 1): + #self.logger.info(e) + if len(data) == 1: raise e - else: - first_half = data[:len(data)//2] - second_half = data[len(data)//2:] + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] - self.insert_data(first_half, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) else: self.logger.error("Unable to insert data in 10 attempts") @@ -213,8 +217,8 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s # othewise it gets the requested return columns and returns them as a list of dicts while attempts < 10: try: - with EngineConnection(self.engine) as connection: - return_data_tuples = connection.execute(stmnt).fetchall() + with 
self.engine.begin() as connection: + return_data_tuples = connection.execute(stmnt) break except OperationalError as e: if isinstance(e.orig, DeadlockDetected): @@ -228,14 +232,15 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s raise e except Exception as e: - if(len(data) == 1): + if len(data) == 1: raise e - else: - first_half = data[:len(data)//2] - second_half = data[len(data)//2:] + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] - self.insert_data(first_half, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) else: self.logger.error("Unable to insert and return data in 10 attempts") @@ -244,9 +249,11 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s if deadlock_detected is True: self.logger.error("Made it through even though Deadlock was detected") - return_data = [] - for data_tuple in return_data_tuples: - return_data.append(dict(data_tuple)) + return_data = [dict(row) for row in return_data_tuples.mappings()] + + #no longer working in sqlalchemy 2.x + #for data_tuple in return_data_tuples: + # return_data.append(dict(data_tuple)) # using on confilict do nothing does not return the # present values so this does gets the return values diff --git a/augur/application/schema/alembic/env.py b/augur/application/schema/alembic/env.py index d170ef243f..94127a43be 100644 --- a/augur/application/schema/alembic/env.py +++ b/augur/application/schema/alembic/env.py @@ -5,7 +5,9 @@ from alembic import context from augur.application.db.models.base import Base -from augur.application.db.engine import DatabaseEngine +from augur.application.db.engine import DatabaseEngine, get_database_string +from sqlalchemy import create_engine, event +from sqlalchemy.pool import NullPool # this is the Alembic Config object, which provides # access to the values within the .ini file in use. @@ -59,8 +61,20 @@ def run_migrations_online(): and associate a connection with the context. 
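A note on the engine.begin() and .mappings() changes in augur/application/db/session.py above: SQLAlchemy 2.x removes implicit autocommit and its Row objects are no longer directly dict-convertible, which is what the new idiom accounts for. A minimal sketch under those assumptions (the DSN and query are illustrative):

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg2://augur:password@localhost:5432/augur")  # illustrative DSN

    # engine.begin() opens a transaction and commits on successful exit,
    # replacing the implicit-autocommit behaviour removed in SQLAlchemy 2.0.
    with engine.begin() as connection:
        result = connection.execute(text("SELECT repo_id, repo_name FROM augur_data.repo LIMIT 5"))
        # Rows are not plain dicts in 2.x; go through .mappings() first.
        rows = [dict(row) for row in result.mappings()]

    print(rows)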
""" + url = get_database_string() + engine = create_engine(url) - with DatabaseEngine() as connectable, connectable.connect() as connection: + @event.listens_for(engine, "connect", insert=True) + def set_search_path(dbapi_connection, connection_record): + existing_autocommit = dbapi_connection.autocommit + dbapi_connection.autocommit = True + cursor = dbapi_connection.cursor() + cursor.execute("SET SESSION search_path=public,augur_data,augur_operations,spdx") + cursor.close() + dbapi_connection.autocommit = existing_autocommit + + + with engine.connect() as connection: context.configure( connection=connection, target_metadata=target_metadata, diff --git a/augur/application/schema/alembic/versions/1_augur_new_changes.py b/augur/application/schema/alembic/versions/1_augur_new_changes.py index 0be3780a36..2e8440294f 100644 --- a/augur/application/schema/alembic/versions/1_augur_new_changes.py +++ b/augur/application/schema/alembic/versions/1_augur_new_changes.py @@ -300,8 +300,9 @@ def change_cntrb_id_to_uuid_5(upgrade=True): """ INSERT INTO "augur_data"."contributors"("cntrb_id", "cntrb_login", "cntrb_email", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:cntrb_uuid, 'not-provided', NULL, NULL, '2019-06-13 11:33:39', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1, 'nobody', 'http://fake.me', 'http://fake.me', 'x', 'http://fake.me', NULL, 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', 'http://fake.me', NULL, NULL, NULL, NULL, NULL, NULL, '2019-06-13 16:35:25'); """ - ), - cntrb_uuid=UnresolvableUUID().to_UUID() + ).bindparams( + cntrb_uuid=UnresolvableUUID().to_UUID() + ) ) conn.execute( @@ -309,8 +310,9 @@ def change_cntrb_id_to_uuid_5(upgrade=True): """ INSERT INTO "augur_data"."contributors" ("cntrb_id", "cntrb_login", "cntrb_email", "cntrb_full_name", "cntrb_company", "cntrb_created_at", "cntrb_type", "cntrb_fake", "cntrb_deleted", "cntrb_long", "cntrb_lat", "cntrb_country_code", "cntrb_state", "cntrb_city", "cntrb_location", "cntrb_canonical", "cntrb_last_used", "gh_user_id", "gh_login", "gh_url", "gh_html_url", "gh_node_id", "gh_avatar_url", "gh_gravatar_id", "gh_followers_url", "gh_following_url", "gh_gists_url", "gh_starred_url", "gh_subscriptions_url", "gh_organizations_url", "gh_repos_url", "gh_events_url", "gh_received_events_url", "gh_type", "gh_site_admin", "gl_web_url", "gl_avatar_url", "gl_state", "gl_username", "gl_full_name", "gl_id", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (:cntrb_uuid, 'nan', 'kannayoshihiro@gmail.com', 'KANNA Yoshihiro', 'UTMC', '2009-04-17 12:43:58', NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, NULL, 'kannayoshihiro@gmail.com', '2021-01-28 21:56:10-06', 74832, 'nan', 'https://api.github.com/users/nan', 'https://github.com/nan', 'MDQ6VXNlcjc0ODMy', 'https://avatars.githubusercontent.com/u/74832?v=4', '', 'https://api.github.com/users/nan/followers', 'https://api.github.com/users/nan/following{/other_user}', 
'https://api.github.com/users/nan/gists{/gist_id}', 'https://api.github.com/users/nan/starred{/owner}{/repo}', 'https://api.github.com/users/nan/subscriptions', 'https://api.github.com/users/nan/orgs', 'https://api.github.com/users/nan/repos', 'https://api.github.com/users/nan/events{/privacy}', 'https://api.github.com/users/nan/received_events', 'User', 'false', NULL, NULL, NULL, NULL, NULL, NULL, 'GitHub API Worker', '1.0.0', 'GitHub API', '2021-10-28 15:23:46'); """ - ), - cntrb_uuid=GithubUUID().to_UUID() + ).bindparams( + cntrb_uuid=GithubUUID().to_UUID() + ) ) else: diff --git a/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py b/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py new file mode 100644 index 0000000000..28b58756cd --- /dev/null +++ b/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py @@ -0,0 +1,188 @@ +"""Fix Keys and materialized view + +Revision ID: 22 +Revises: 21 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. +revision = '22' +down_revision = '21' +branch_labels = None +depends_on = None + + +def upgrade(): + + add_fix_keys_22() + +def downgrade(): + + upgrade=False + + add_fix_keys_22(upgrade) + +def add_fix_keys_22(upgrade=True): + + if upgrade: + + conn = op.get_bind() + conn.execute(text(""" + alter TABLE + augur_data.commits DROP CONSTRAINT if exists fk_commits_contributors_3, + DROP CONSTRAINT if exists fk_commits_contributors_4; + alter TABLE augur_data.contributors + DROP CONSTRAINT if exists "GH-UNIQUE-C", + DROP CONSTRAINT if exists + "GL-cntrb-LOGIN-UNIQUE";""")) + + conn = op.get_bind() + conn.execute(text(""" + drop materialized view if exists augur_data.explorer_contributor_actions; """)) + + conn = op.get_bind() + conn.execute(text(""" + create materialized view augur_data.explorer_contributor_actions as + SELECT + A.ID AS cntrb_id, + A.created_at, + A.repo_id, + A.ACTION, + repo.repo_name, + A.LOGIN, + DENSE_RANK() OVER(PARTITION BY A.ID, A.repo_id ORDER BY A.created_at) AS RANK + FROM ( + select + commits.cmt_ght_author_id AS ID, + commits.cmt_author_timestamp AS created_at, + commits.repo_id, + 'commit' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + ( augur_data.commits LEFT JOIN augur_data.contributors ON ( ( ( contributors.cntrb_id ) :: TEXT = ( commits.cmt_ght_author_id ) :: TEXT ) ) ) + GROUP BY + commits.cmt_commit_hash, + commits.cmt_ght_author_id, + commits.repo_id, + commits.cmt_author_timestamp, + 'commit' :: TEXT, + contributors.cntrb_login + UNION all + SELECT + issues.reporter_id AS ID, + issues.created_at, + issues.repo_id, + 'issue_opened' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + ( augur_data.issues LEFT JOIN augur_data.contributors ON ( ( contributors.cntrb_id = issues.reporter_id ) ) ) + WHERE + ( issues.pull_request IS NULL ) + UNION ALL + SELECT + pull_request_events.cntrb_id AS ID, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_closed' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.pull_requests, + ( augur_data.pull_request_events LEFT JOIN augur_data.contributors ON ( ( contributors.cntrb_id = pull_request_events.cntrb_id ) ) ) + WHERE + pull_requests.pull_request_id = pull_request_events.pull_request_id + AND pull_requests.pr_merged_at IS NULL + AND ( ( pull_request_events.ACTION ) :: TEXT = 'closed' :: TEXT ) + UNION ALL + 
SELECT + pull_request_events.cntrb_id AS ID, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_merged' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.pull_requests, + ( augur_data.pull_request_events LEFT JOIN augur_data.contributors ON ( ( contributors.cntrb_id = pull_request_events.cntrb_id ) ) ) + WHERE + pull_requests.pull_request_id = pull_request_events.pull_request_id + AND ( ( pull_request_events.ACTION ) :: TEXT = 'merged' :: TEXT ) + UNION ALL + SELECT + issue_events.cntrb_id AS ID, + issue_events.created_at, + issues.repo_id, + 'issue_closed' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.issues, + augur_data.issue_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id + WHERE + issues.issue_id = issue_events.issue_id + AND issues.pull_request IS NULL + AND ( ( issue_events.ACTION ) :: TEXT = 'closed' :: TEXT ) + UNION ALL + SELECT + pull_request_reviews.cntrb_id AS ID, + pull_request_reviews.pr_review_submitted_at AS created_at, + pull_requests.repo_id, + ( 'pull_request_review_' :: TEXT || ( pull_request_reviews.pr_review_state ) :: TEXT ) AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.pull_requests, + augur_data.pull_request_reviews + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_reviews.cntrb_id + WHERE + pull_requests.pull_request_id = pull_request_reviews.pull_request_id + UNION ALL + SELECT + pull_requests.pr_augur_contributor_id AS ID, + pull_requests.pr_created_at AS created_at, + pull_requests.repo_id, + 'pull_request_open' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.pull_requests + LEFT JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id + UNION ALL + SELECT + message.cntrb_id AS ID, + message.msg_timestamp AS created_at, + pull_requests.repo_id, + 'pull_request_comment' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.pull_requests, + augur_data.pull_request_message_ref, + augur_data.message + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + WHERE + pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + AND pull_request_message_ref.msg_id = message.msg_id + UNION ALL + SELECT + issues.reporter_id AS ID, + message.msg_timestamp AS created_at, + issues.repo_id, + 'issue_comment' :: TEXT AS ACTION, + contributors.cntrb_login AS LOGIN + FROM + augur_data.issues, + augur_data.issue_message_ref, + augur_data.message + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + WHERE + issue_message_ref.msg_id = message.msg_id + AND issues.issue_id = issue_message_ref.issue_id + AND issues.closed_at != message.msg_timestamp + ) A, + augur_data.repo + WHERE + A.repo_id = repo.repo_id + ORDER BY + A.created_at DESC""")) + diff --git a/augur/application/schema/alembic/versions/23_add_index_ghlogin.py b/augur/application/schema/alembic/versions/23_add_index_ghlogin.py new file mode 100644 index 0000000000..48a96eb3b4 --- /dev/null +++ b/augur/application/schema/alembic/versions/23_add_index_ghlogin.py @@ -0,0 +1,45 @@ +"""add index + +Revision ID: 23 +Revises: 22 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. 
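For context on the explorer_contributor_actions view recreated in revision 22 above: the DENSE_RANK() window numbers each contributor's actions per repository in chronological order, so rank = 1 selects a contributor's earliest recorded action in a repo. A query sketch that is not part of any migration (illustrative DSN):

    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg2://augur:password@localhost:5432/augur")  # illustrative DSN

    new_contributor_sql = text("""
        SELECT login, repo_name, action, created_at
        FROM augur_data.explorer_contributor_actions
        WHERE rank = 1              -- earliest recorded action per (contributor, repo)
        ORDER BY created_at DESC
        LIMIT 20
    """)

    with engine.begin() as connection:
        for row in connection.execute(new_contributor_sql).mappings():
            print(dict(row))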
+revision = '23' +down_revision = '22' +branch_labels = None +depends_on = None + + +def upgrade(): + + gh_loginindex() + +def downgrade(): + + upgrade=False + + gh_loginindex(upgrade) + +def gh_loginindex(upgrade=True): + + if upgrade: + + conn = op.get_bind() + conn.execute(text(""" + CREATE INDEX if not exists "gh_login" ON "augur_data"."contributors" USING btree ( + "gh_login" ASC NULLS FIRST);""")) + + else: + + + conn = op.get_bind() + conn.execute(text(""" + DROP INDEX if exists + "augur_data"."gh_login";""")) diff --git a/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py b/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py new file mode 100644 index 0000000000..719aeabe75 --- /dev/null +++ b/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py @@ -0,0 +1,49 @@ +"""Alter repo labor unique + +Revision ID: 24 +Revises: 23 +Create Date: 2023-08-25 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy.sql import text +import re + +# revision identifiers, used by Alembic. +revision = '24' +down_revision = '23' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + + conn = op.get_bind() + + #Remove constraint being initially deferred. + conn.execute(text(f""" + ALTER TABLE "augur_data"."repo_labor" + DROP CONSTRAINT IF EXISTS "rl-unique", + ADD CONSTRAINT "rl-unique" UNIQUE ("repo_id", "rl_analysis_date", "file_path", "file_name"); + """)) + """ + + """ + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + conn = op.get_bind() + + #Make unique initially deferred + conn.execute(text(f""" + ALTER TABLE "augur_data"."repo_labor" + DROP CONSTRAINT IF EXISTS "rl-unique", + ADD CONSTRAINT "rl-unique" UNIQUE ("repo_id", "rl_analysis_date", "file_path", "file_name") DEFERRABLE INITIALLY DEFERRED; + """)) + + # ### end Alembic commands ### diff --git a/augur/application/schema/alembic/versions/25_unique_on_mataview.py b/augur/application/schema/alembic/versions/25_unique_on_mataview.py new file mode 100644 index 0000000000..0480ae45ef --- /dev/null +++ b/augur/application/schema/alembic/versions/25_unique_on_mataview.py @@ -0,0 +1,466 @@ +""" THIS WILL TAKE LONGER ON A LARGE SET OF REPOSITORIES : a unique index on a materialized view allows it to be refreshed concurrently, preventing blocking behavior + +Revision ID: 25 +Revises: 24 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. 
+revision = '25' +down_revision = '24' +branch_labels = None +depends_on = None + + +def upgrade(): + + add_fix_keys_25() + +def downgrade(): + + upgrade=False + + add_fix_keys_25(upgrade) + +def add_fix_keys_25(upgrade=True): + + if upgrade: + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.api_get_all_repo_prs(repo_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.api_get_all_repos_commits(repo_id); """)) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.api_get_all_repos_issues(repo_id); """)) + conn.execute(text("""COMMIT;""")) + + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.explorer_commits_and_committers_daily_count( repo_id, cmt_committer_date); """)) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.explorer_entry_list(repo_id); """)) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + drop MATERIALIZED VIEW if exists augur_data.explorer_libyear_all; + drop MATERIALIZED VIEW if exists augur_data.explorer_libyear_detail; + drop MATERIALIZED VIEW if exists augur_data.explorer_libyear_summary; + drop MATERIALIZED VIEW if exists augur_data.explorer_contributor_actions; + + + ---- + DROP MATERIALIZED VIEW if exists "augur_data"."augur_new_contributors";""")) + + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + create MATERIALIZED VIEW "augur_data"."augur_new_contributors" + AS + SELECT a.id AS cntrb_id, + a.created_at, + a.repo_id, + a.action, + repo.repo_name, + a.login, + row_number() OVER (PARTITION BY a.id, a.repo_id ORDER BY a.created_at DESC) AS rank + FROM ( SELECT commits.cmt_ght_author_id AS id, + commits.cmt_author_timestamp AS created_at, + commits.repo_id, + 'commit'::text AS action, + contributors.cntrb_login AS login + FROM (augur_data.commits + LEFT JOIN augur_data.contributors ON (((contributors.cntrb_id)::text = (commits.cmt_ght_author_id)::text))) + GROUP BY commits.cmt_commit_hash, commits.cmt_ght_author_id, commits.repo_id, commits.cmt_author_timestamp, 'commit'::text, contributors.cntrb_login + UNION ALL + SELECT issues.reporter_id AS id, + issues.created_at, + issues.repo_id, + 'issue_opened'::text AS action, + contributors.cntrb_login AS login + FROM (augur_data.issues + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = issues.reporter_id))) + WHERE (issues.pull_request IS NULL) + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + (augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = pull_request_events.cntrb_id))) + WHERE ((pull_requests.pull_request_id = pull_request_events.pull_request_id) AND (pull_requests.pr_merged_at IS NULL) AND ((pull_request_events.action)::text = 'closed'::text)) + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_merged'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + (augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = pull_request_events.cntrb_id))) + WHERE ((pull_requests.pull_request_id 
= pull_request_events.pull_request_id) AND ((pull_request_events.action)::text = 'merged'::text)) + UNION ALL + SELECT issue_events.cntrb_id AS id, + issue_events.created_at, + issues.repo_id, + 'issue_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + (augur_data.issue_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = issue_events.cntrb_id))) + WHERE ((issues.issue_id = issue_events.issue_id) AND (issues.pull_request IS NULL) AND ((issue_events.action)::text = 'closed'::text)) + UNION ALL + SELECT pull_request_reviews.cntrb_id AS id, + pull_request_reviews.pr_review_submitted_at AS created_at, + pull_requests.repo_id, + ('pull_request_review_'::text || (pull_request_reviews.pr_review_state)::text) AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + (augur_data.pull_request_reviews + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = pull_request_reviews.cntrb_id))) + WHERE (pull_requests.pull_request_id = pull_request_reviews.pull_request_id) + UNION ALL + SELECT pull_requests.pr_augur_contributor_id AS id, + pull_requests.pr_created_at AS created_at, + pull_requests.repo_id, + 'pull_request_open'::text AS action, + contributors.cntrb_login AS login + FROM (augur_data.pull_requests + LEFT JOIN augur_data.contributors ON ((pull_requests.pr_augur_contributor_id = contributors.cntrb_id))) + UNION ALL + SELECT message.cntrb_id AS id, + message.msg_timestamp AS created_at, + pull_requests.repo_id, + 'pull_request_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_message_ref, + (augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + WHERE ((pull_request_message_ref.pull_request_id = pull_requests.pull_request_id) AND (pull_request_message_ref.msg_id = message.msg_id)) + UNION ALL + SELECT issues.reporter_id AS id, + message.msg_timestamp AS created_at, + issues.repo_id, + 'issue_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_message_ref, + (augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + WHERE ((issue_message_ref.msg_id = message.msg_id) AND (issues.issue_id = issue_message_ref.issue_id) AND (issues.closed_at <> message.msg_timestamp))) a, + augur_data.repo + WHERE (a.repo_id = repo.repo_id) + ORDER BY a.created_at DESC; + + ALTER MATERIALIZED VIEW "augur_data"."augur_new_contributors" OWNER TO "augur"; + + ---- + create materialized view augur_data.explorer_contributor_actions as + SELECT a.id AS cntrb_id, + a.created_at, + a.repo_id, + a.action, + repo.repo_name, + a.login, + row_number() OVER (PARTITION BY a.id, a.repo_id ORDER BY a.created_at desc) AS rank + FROM ( SELECT commits.cmt_ght_author_id AS id, + commits.cmt_author_timestamp AS created_at, + commits.repo_id, + 'commit'::text AS action, + contributors.cntrb_login AS login + FROM (augur_data.commits + LEFT JOIN augur_data.contributors ON (((contributors.cntrb_id)::text = (commits.cmt_ght_author_id)::text))) + GROUP BY commits.cmt_commit_hash, commits.cmt_ght_author_id, commits.repo_id, commits.cmt_author_timestamp, 'commit'::text, contributors.cntrb_login + UNION ALL + SELECT issues.reporter_id AS id, + issues.created_at, + issues.repo_id, + 'issue_opened'::text AS action, + contributors.cntrb_login AS login + FROM (augur_data.issues + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = 
issues.reporter_id))) + WHERE (issues.pull_request IS NULL) + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + (augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = pull_request_events.cntrb_id))) + WHERE ((pull_requests.pull_request_id = pull_request_events.pull_request_id) AND (pull_requests.pr_merged_at IS NULL) AND ((pull_request_events.action)::text = 'closed'::text)) + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_merged'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + (augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = pull_request_events.cntrb_id))) + WHERE ((pull_requests.pull_request_id = pull_request_events.pull_request_id) AND ((pull_request_events.action)::text = 'merged'::text)) + UNION ALL + SELECT issue_events.cntrb_id AS id, + issue_events.created_at, + issues.repo_id, + 'issue_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + (augur_data.issue_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = issue_events.cntrb_id))) + WHERE ((issues.issue_id = issue_events.issue_id) AND (issues.pull_request IS NULL) AND ((issue_events.action)::text = 'closed'::text)) + UNION ALL + SELECT pull_request_reviews.cntrb_id AS id, + pull_request_reviews.pr_review_submitted_at AS created_at, + pull_requests.repo_id, + ('pull_request_review_'::text || (pull_request_reviews.pr_review_state)::text) AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + (augur_data.pull_request_reviews + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = pull_request_reviews.cntrb_id))) + WHERE (pull_requests.pull_request_id = pull_request_reviews.pull_request_id) + UNION ALL + SELECT pull_requests.pr_augur_contributor_id AS id, + pull_requests.pr_created_at AS created_at, + pull_requests.repo_id, + 'pull_request_open'::text AS action, + contributors.cntrb_login AS login + FROM (augur_data.pull_requests + LEFT JOIN augur_data.contributors ON ((pull_requests.pr_augur_contributor_id = contributors.cntrb_id))) + UNION ALL + SELECT message.cntrb_id AS id, + message.msg_timestamp AS created_at, + pull_requests.repo_id, + 'pull_request_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_message_ref, + (augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + WHERE ((pull_request_message_ref.pull_request_id = pull_requests.pull_request_id) AND (pull_request_message_ref.msg_id = message.msg_id)) + UNION ALL + SELECT issues.reporter_id AS id, + message.msg_timestamp AS created_at, + issues.repo_id, + 'issue_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_message_ref, + (augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + WHERE ((issue_message_ref.msg_id = message.msg_id) AND (issues.issue_id = issue_message_ref.issue_id) AND (issues.closed_at <> message.msg_timestamp))) a, + augur_data.repo + WHERE (a.repo_id = repo.repo_id) + ORDER BY a.created_at DESC; + + update augur_operations.config set value='1' where 
setting_name = 'refresh_materialized_views_interval_in_days';""")) + + conn.execute(text("""COMMIT;""")) + + conn.execute(text(""" drop materialized view if exists augur_data.explorer_new_contributors; + create materialized view augur_data.explorer_new_contributors + AS + SELECT x.cntrb_id, + x.created_at, + x.month, + x.year, + x.repo_id, + x.repo_name, + x.full_name, + x.login, + x.rank + FROM ( SELECT b.cntrb_id, + b.created_at, + b.month, + b.year, + b.repo_id, + b.repo_name, + b.full_name, + b.login, + b.action, + b.rank + FROM ( SELECT a.id AS cntrb_id, + a.created_at, + date_part('month'::text, (a.created_at)::date) AS month, + date_part('year'::text, (a.created_at)::date) AS year, + a.repo_id, + repo.repo_name, + a.full_name, + a.login, + a.action, + row_number() OVER (PARTITION BY a.id, a.repo_id ORDER BY a.created_at desc) AS rank + FROM ( SELECT canonical_full_names.canonical_id AS id, + issues.created_at, + issues.repo_id, + 'issue_opened'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM ((augur_data.issues + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = issues.reporter_id))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE ((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + WHERE (issues.pull_request IS NULL) + GROUP BY canonical_full_names.canonical_id, issues.repo_id, issues.created_at, contributors.cntrb_full_name, contributors.cntrb_login + UNION ALL + SELECT canonical_full_names.canonical_id AS id, + to_timestamp((commits.cmt_author_date)::text, 'YYYY-MM-DD'::text) AS created_at, + commits.repo_id, + 'commit'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM ((augur_data.commits + LEFT JOIN augur_data.contributors ON (((contributors.cntrb_canonical)::text = (commits.cmt_author_email)::text))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE ((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + GROUP BY commits.repo_id, canonical_full_names.canonical_email, canonical_full_names.canonical_id, commits.cmt_author_date, contributors.cntrb_full_name, contributors.cntrb_login + UNION ALL + SELECT message.cntrb_id AS id, + commit_comment_ref.created_at, + commits.repo_id, + 'commit_comment'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM augur_data.commit_comment_ref, + augur_data.commits, + ((augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + 
contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE ((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + WHERE ((commits.cmt_id = commit_comment_ref.cmt_id) AND (commit_comment_ref.msg_id = message.msg_id)) + GROUP BY message.cntrb_id, commits.repo_id, commit_comment_ref.created_at, contributors.cntrb_full_name, contributors.cntrb_login + UNION ALL + SELECT issue_events.cntrb_id AS id, + issue_events.created_at, + issues.repo_id, + 'issue_closed'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM augur_data.issues, + ((augur_data.issue_events + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = issue_events.cntrb_id))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE ((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + WHERE ((issues.issue_id = issue_events.issue_id) AND (issues.pull_request IS NULL) AND (issue_events.cntrb_id IS NOT NULL) AND ((issue_events.action)::text = 'closed'::text)) + GROUP BY issue_events.cntrb_id, issues.repo_id, issue_events.created_at, contributors.cntrb_full_name, contributors.cntrb_login + UNION ALL + SELECT pull_requests.pr_augur_contributor_id AS id, + pull_requests.pr_created_at AS created_at, + pull_requests.repo_id, + 'open_pull_request'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM ((augur_data.pull_requests + LEFT JOIN augur_data.contributors ON ((pull_requests.pr_augur_contributor_id = contributors.cntrb_id))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE ((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + GROUP BY pull_requests.pr_augur_contributor_id, pull_requests.repo_id, pull_requests.pr_created_at, contributors.cntrb_full_name, contributors.cntrb_login + UNION ALL + SELECT message.cntrb_id AS id, + message.msg_timestamp AS created_at, + pull_requests.repo_id, + 'pull_request_comment'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_message_ref, + ((augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE 
((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + WHERE ((pull_request_message_ref.pull_request_id = pull_requests.pull_request_id) AND (pull_request_message_ref.msg_id = message.msg_id)) + GROUP BY message.cntrb_id, pull_requests.repo_id, message.msg_timestamp, contributors.cntrb_full_name, contributors.cntrb_login + UNION ALL + SELECT issues.reporter_id AS id, + message.msg_timestamp AS created_at, + issues.repo_id, + 'issue_comment'::text AS action, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_message_ref, + ((augur_data.message + LEFT JOIN augur_data.contributors ON ((contributors.cntrb_id = message.cntrb_id))) + LEFT JOIN ( SELECT DISTINCT ON (contributors_1.cntrb_canonical) contributors_1.cntrb_full_name, + contributors_1.cntrb_canonical AS canonical_email, + contributors_1.data_collection_date, + contributors_1.cntrb_id AS canonical_id + FROM augur_data.contributors contributors_1 + WHERE ((contributors_1.cntrb_canonical)::text = (contributors_1.cntrb_email)::text) + ORDER BY contributors_1.cntrb_canonical) canonical_full_names ON (((canonical_full_names.canonical_email)::text = (contributors.cntrb_canonical)::text))) + WHERE ((issue_message_ref.msg_id = message.msg_id) AND (issues.issue_id = issue_message_ref.issue_id) AND (issues.pull_request_id = NULL::bigint)) + GROUP BY issues.reporter_id, issues.repo_id, message.msg_timestamp, contributors.cntrb_full_name, contributors.cntrb_login) a, + augur_data.repo + WHERE ((a.id IS NOT NULL) AND (a.repo_id = repo.repo_id)) + GROUP BY a.id, a.repo_id, a.action, a.created_at, repo.repo_name, a.full_name, a.login + ORDER BY a.id) b + WHERE (b.rank = ANY (ARRAY[(1)::bigint, (2)::bigint, (3)::bigint, (4)::bigint, (5)::bigint, (6)::bigint, (7)::bigint]))) x; + + ALTER MATERIALIZED VIEW augur_data.explorer_new_contributors OWNER TO augur;""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.augur_new_contributors( cntrb_id, created_at, repo_id, repo_name, login, rank); """)) + conn.execute(text("""COMMIT;""")) + + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.explorer_contributor_actions(cntrb_id,created_at,repo_id, action, repo_name,login, rank); """)) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.explorer_new_contributors(cntrb_id, created_at, month, year, repo_id, full_name, repo_name, login, rank); """)) + conn.execute(text("""COMMIT;""")) + diff --git a/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py b/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py new file mode 100644 index 0000000000..f381ec48ef --- /dev/null +++ b/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py @@ -0,0 +1,245 @@ +""" Updating materialized views and associated indices + +Revision ID: 26 +Revises: 25 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. 
+revision = '26' +down_revision = '25' +branch_labels = None +depends_on = None + + +def upgrade(): + + mview_keys_26() + +def downgrade(): + + upgrade=False + + mview_keys_26(upgrade) + +def mview_keys_26(upgrade=True): + + if upgrade: + conn = op.get_bind() + conn.execute(text(""" + drop materialized view if exists augur_data.explorer_pr_assignments; + drop materialized view if exists augur_data.explorer_user_repos; + drop materialized view if exists augur_data.explorer_pr_response_times; + drop materialized view if exists augur_data.explorer_pr_response; + drop materialized view if exists augur_data.explorer_issue_assignments;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_pr_assignments as + SELECT + pr.pull_request_id, + pr.repo_id AS ID, + pr.pr_created_at AS created, + pr.pr_closed_at AS closed, + pre.created_at AS assign_date, + pre.ACTION AS assignment_action, + pre.cntrb_id AS assignee, + pre.node_id AS node_id + FROM + ( + augur_data.pull_requests pr + LEFT JOIN augur_data.pull_request_events pre ON ( + ( + ( pr.pull_request_id = pre.pull_request_id ) + AND ( + ( pre.ACTION ) :: TEXT = ANY ( ARRAY [ ( 'unassigned' :: CHARACTER VARYING ) :: TEXT, ( 'assigned' :: CHARACTER VARYING ) :: TEXT ] ) + ) + ) + ) + );""")) + conn.execute(text(""" + create materialized view augur_data.explorer_pr_response as + SELECT pr.pull_request_id, + pr.repo_id AS id, + pr.pr_augur_contributor_id AS cntrb_id, + m.msg_timestamp, + m.msg_cntrb_id, + pr.pr_created_at, + pr.pr_closed_at + FROM (augur_data.pull_requests pr + LEFT JOIN ( SELECT prr.pull_request_id, + m_1.msg_timestamp, + m_1.cntrb_id AS msg_cntrb_id + FROM augur_data.pull_request_review_message_ref prrmr, + augur_data.pull_requests pr_1, + augur_data.message m_1, + augur_data.pull_request_reviews prr + WHERE ((prrmr.pr_review_id = prr.pr_review_id) AND (prrmr.msg_id = m_1.msg_id) AND (prr.pull_request_id = pr_1.pull_request_id)) + UNION + SELECT prmr.pull_request_id, + m_1.msg_timestamp, + m_1.cntrb_id AS msg_cntrb_id + FROM augur_data.pull_request_message_ref prmr, + augur_data.pull_requests pr_1, + augur_data.message m_1 + WHERE ((prmr.pull_request_id = pr_1.pull_request_id) AND (prmr.msg_id = m_1.msg_id))) m ON ((m.pull_request_id = pr.pull_request_id)));""")) + + + + conn.execute(text(""" + create materialized view augur_data.explorer_user_repos as + SELECT a.login_name, + a.user_id, + b.group_id, + c.repo_id + FROM augur_operations.users a, + augur_operations.user_groups b, + augur_operations.user_repos c + WHERE ((a.user_id = b.user_id) AND (b.group_id = c.group_id)) + ORDER BY a.user_id;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_pr_response_times as + SELECT repo.repo_id, + pull_requests.pr_src_id, + repo.repo_name, + pull_requests.pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at, + pull_requests.pr_closed_at, + date_part('year'::text, (pull_requests.pr_created_at)::date) AS created_year, + date_part('month'::text, (pull_requests.pr_created_at)::date) AS created_month, + date_part('year'::text, (pull_requests.pr_closed_at)::date) AS closed_year, + date_part('month'::text, (pull_requests.pr_closed_at)::date) AS closed_month, + base_labels.pr_src_meta_label, + base_labels.pr_head_or_base, + ((EXTRACT(epoch FROM pull_requests.pr_closed_at) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_close, + ((EXTRACT(epoch FROM 
pull_requests.pr_closed_at) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_close, + ((EXTRACT(epoch FROM response_times.first_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_first_response, + ((EXTRACT(epoch FROM response_times.first_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_first_response, + ((EXTRACT(epoch FROM response_times.last_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_last_response, + ((EXTRACT(epoch FROM response_times.last_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_last_response, + response_times.first_response_time, + response_times.last_response_time, + response_times.average_time_between_responses, + response_times.assigned_count, + response_times.review_requested_count, + response_times.labeled_count, + response_times.subscribed_count, + response_times.mentioned_count, + response_times.referenced_count, + response_times.closed_count, + response_times.head_ref_force_pushed_count, + response_times.merged_count, + response_times.milestoned_count, + response_times.unlabeled_count, + response_times.head_ref_deleted_count, + response_times.comment_count, + master_merged_counts.lines_added, + master_merged_counts.lines_removed, + all_commit_counts.commit_count, + master_merged_counts.file_count + FROM augur_data.repo, + augur_data.repo_groups, + ((((augur_data.pull_requests + LEFT JOIN ( SELECT pull_requests_1.pull_request_id, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'assigned'::text)) AS assigned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'review_requested'::text)) AS review_requested_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'labeled'::text)) AS labeled_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'unlabeled'::text)) AS unlabeled_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'subscribed'::text)) AS subscribed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'mentioned'::text)) AS mentioned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'referenced'::text)) AS referenced_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'closed'::text)) AS closed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'head_ref_force_pushed'::text)) AS head_ref_force_pushed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'head_ref_deleted'::text)) AS head_ref_deleted_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'milestoned'::text)) AS milestoned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'merged'::text)) AS merged_count, + min(message.msg_timestamp) AS first_response_time, + count(DISTINCT message.msg_timestamp) AS comment_count, + max(message.msg_timestamp) AS last_response_time, + ((max(message.msg_timestamp) - min(message.msg_timestamp)) / (count(DISTINCT message.msg_timestamp))::double precision) AS average_time_between_responses + FROM augur_data.pull_request_events, + augur_data.pull_requests pull_requests_1, + augur_data.repo repo_1, + augur_data.pull_request_message_ref, + augur_data.message + WHERE ((repo_1.repo_id = pull_requests_1.repo_id) AND (pull_requests_1.pull_request_id = pull_request_events.pull_request_id) AND (pull_requests_1.pull_request_id = 
pull_request_message_ref.pull_request_id) AND (pull_request_message_ref.msg_id = message.msg_id)) + GROUP BY pull_requests_1.pull_request_id) response_times ON ((pull_requests.pull_request_id = response_times.pull_request_id))) + LEFT JOIN ( SELECT pull_request_commits.pull_request_id, + count(DISTINCT pull_request_commits.pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, + augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE ((pull_requests_1.pull_request_id = pull_request_commits.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND ((pull_request_commits.pr_cmt_sha)::text <> (pull_requests_1.pr_merge_commit_sha)::text) AND ((pull_request_commits.pr_cmt_sha)::text <> (pull_request_meta.pr_sha)::text)) + GROUP BY pull_request_commits.pull_request_id) all_commit_counts ON ((pull_requests.pull_request_id = all_commit_counts.pull_request_id))) + LEFT JOIN ( SELECT max(pull_request_meta.pr_repo_meta_id) AS max, + pull_request_meta.pull_request_id, + pull_request_meta.pr_head_or_base, + pull_request_meta.pr_src_meta_label + FROM augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE ((pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND ((pull_request_meta.pr_head_or_base)::text = 'base'::text)) + GROUP BY pull_request_meta.pull_request_id, pull_request_meta.pr_head_or_base, pull_request_meta.pr_src_meta_label) base_labels ON ((base_labels.pull_request_id = all_commit_counts.pull_request_id))) + LEFT JOIN ( SELECT sum(commits.cmt_added) AS lines_added, + sum(commits.cmt_removed) AS lines_removed, + pull_request_commits.pull_request_id, + count(DISTINCT commits.cmt_filename) AS file_count + FROM augur_data.pull_request_commits, + augur_data.commits, + augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE (((commits.cmt_commit_hash)::text = (pull_request_commits.pr_cmt_sha)::text) AND (pull_requests_1.pull_request_id = pull_request_commits.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND (commits.repo_id = pull_requests_1.repo_id) AND ((commits.cmt_commit_hash)::text <> (pull_requests_1.pr_merge_commit_sha)::text) AND ((commits.cmt_commit_hash)::text <> (pull_request_meta.pr_sha)::text)) + GROUP BY pull_request_commits.pull_request_id) master_merged_counts ON ((base_labels.pull_request_id = master_merged_counts.pull_request_id))) + WHERE ((repo.repo_group_id = repo_groups.repo_group_id) AND (repo.repo_id = pull_requests.repo_id)) + ORDER BY response_times.merged_count DESC;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_issue_assignments as + SELECT + i.issue_id, + i.repo_id AS ID, + i.created_at AS created, + i.closed_at AS closed, + ie.created_at AS assign_date, + ie.ACTION AS assignment_action, + ie.cntrb_id AS assignee, + ie.node_id as node_id + FROM + ( + augur_data.issues i + LEFT JOIN augur_data.issue_events ie ON ( + ( + ( i.issue_id = ie.issue_id ) + AND ( + ( ie.ACTION ) :: TEXT = ANY ( ARRAY [ ( 'unassigned' :: CHARACTER VARYING ) :: TEXT, ( 'assigned' :: CHARACTER VARYING ) :: TEXT ] ) + ) + ) + ) + );""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_user_repos(login_name,user_id,group_id,repo_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_response_times(repo_id, pr_src_id, pr_src_meta_label);""")) + 
conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_assignments(pull_request_id, id, node_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_issue_assignments(issue_id, id, node_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_response(pull_request_id, id, cntrb_id, msg_cntrb_id, msg_timestamp);""")) + conn.execute(text("""COMMIT;""")) \ No newline at end of file diff --git a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py b/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py index 8d75b7a709..0d9c6d744a 100644 --- a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py +++ b/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py @@ -85,9 +85,9 @@ def upgrade(): table_changes = """ - ALTER TABLE user_repos + ALTER TABLE augur_operations.user_repos ADD COLUMN group_id BIGINT, - ADD CONSTRAINT user_repos_group_id_fkey FOREIGN KEY (group_id) REFERENCES user_groups(group_id), + ADD CONSTRAINT user_repos_group_id_fkey FOREIGN KEY (group_id) REFERENCES augur_operations.user_groups(group_id), DROP COLUMN user_id, ADD PRIMARY KEY (group_id, repo_id); """ diff --git a/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py b/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py index 288f584cf5..52a6e017db 100644 --- a/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py +++ b/augur/application/schema/alembic/versions/9_add_repo_group_for_frontend_repos.py @@ -25,7 +25,7 @@ def upgrade(): conn = op.get_bind() result = conn.execute(text(f"""SELECT * FROM "augur_data"."repo_groups" WHERE rg_name='{repo_group_name}';""")).fetchall() if len(result) == 0: - conn.execute(f"""INSERT INTO "augur_data"."repo_groups" ("rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ('{repo_group_name}', 'DO NOT DELETE OR FRONTEND REPOS WILL BREAK', '', 0, '2023-02-17 15:00:00', NULL, NULL, NULL, NULL, NULL);""") + conn.execute(text(f"""INSERT INTO "augur_data"."repo_groups" ("rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES ('{repo_group_name}', 'DO NOT DELETE OR FRONTEND REPOS WILL BREAK', '', 0, '2023-02-17 15:00:00', NULL, NULL, NULL, NULL, NULL);""")) # ### end Alembic commands ### diff --git a/augur/application/util.py b/augur/application/util.py index 1915abdeb0..03e591df98 100644 --- a/augur/application/util.py +++ b/augur/application/util.py @@ -25,6 +25,3 @@ def get_all_repos_count(**kwargs): result = controller.get_repo_count(source="all", **kwargs) return result - - - diff --git a/augur/static/css/dashboard.css b/augur/static/css/dashboard.css index 1e998a10a4..ef111c32a4 100644 --- a/augur/static/css/dashboard.css +++ b/augur/static/css/dashboard.css @@ -26,12 +26,30 @@ body { } .nav-pills .nav-link.active, .nav-pills .show > .nav-link { - background-color: var(--color-accent); + background-color: var(--color-accent) } .dashboard-sidebar { - width: 280px; background-color: var(--color-bg-light) !important; + color: var(--color-fg) !important; + max-height: 100vh; +} + +.nav-link { + color: 
var(--color-fg); +} + +.nav-pills li:has(a:not(.active)) :hover { + color: var(--color-notice); +} + +.nav-pills li { + width: 100%; +} + +.nav-pills li a { + padding-left: 10px !important; + padding-right: 10px !important; } .dashboard-form-control { @@ -44,6 +62,20 @@ body { color: #bcd0f7; } +.circle-opaque { + border-radius: 50%; /* Make it a circle */ + display: inline-block; + position: absolute; /* Able to position it, overlaying the other image */ + left:0px; /* Customise the position, but make sure it */ + top:0px; /* is the same as .circle-transparent */ + z-index: -1; /* Makes the image sit *behind* .circle-transparent */ +} + +.circle-opaque img { + border-radius: 50%; /* Make it a circle */ + z-index: -1; +} + table { background-color: var(--color-fg); color: var(--color-fg-contrast); diff --git a/augur/static/css/stylesheet.css b/augur/static/css/stylesheet.css index 7ebcfc6430..59bbf07857 100644 --- a/augur/static/css/stylesheet.css +++ b/augur/static/css/stylesheet.css @@ -125,6 +125,10 @@ body { overflow: auto; } +.display-table th { + word-wrap: normal; +} + .paginationActive { background-color: var(--color-accent-dark); border-color: var(--color-accent-dark); diff --git a/augur/static/img/tswiftjet.png b/augur/static/img/tswiftjet.png new file mode 100644 index 0000000000..1d7a96ee8a Binary files /dev/null and b/augur/static/img/tswiftjet.png differ diff --git a/augur/tasks/data_analysis/clustering_worker/setup.py b/augur/tasks/data_analysis/clustering_worker/setup.py index 9a1b425f90..78fb0b4b50 100644 --- a/augur/tasks/data_analysis/clustering_worker/setup.py +++ b/augur/tasks/data_analysis/clustering_worker/setup.py @@ -29,11 +29,11 @@ def read(filename): 'psycopg2-binary==2.9.3', #'sklearn==0.0.0', 'scikit-learn==1.1.3', - 'numpy==1.22.0', + 'numpy==1.26.0', 'nltk==3.6.6', 'seaborn==0.11.1', - 'pandas==1.3.5', - 'matplotlib==3.5.1' + 'pandas==1.5.3', + 'matplotlib>=3.5.1' ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index 2d4f4973de..c102e6c227 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -116,7 +116,9 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: """ ) # result = db.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) - msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, engine, params={"repo_id": repo_id}) + + with engine.connect() as conn: + msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, conn, params={"repo_id": repo_id}) logger.info(msg_df_cur_repo.head()) logger.debug(f"Repo message df size: {len(msg_df_cur_repo.index)}") @@ -303,7 +305,9 @@ def visualize_labels_PCA(features, labels, annotations, num_components, title): AND prmr.msg_id=m.msg_id """ ) - msg_df_all = pd.read_sql(get_messages_sql, engine, params={}) + + with engine.connect() as conn: + msg_df_all = pd.read_sql(get_messages_sql, conn, params={}) # select only highly active repos logger.debug("Selecting highly active repos") diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 44a6761cf9..4521a722e2 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -26,16 +26,40 @@ def 
contributor_breadth_model() -> None: tool_version = '0.0.1' data_source = 'GitHub API' - + # This version of the query pulls contributors who have not had any data collected yet + # To the top of the list cntrb_login_query = s.sql.text(""" - SELECT DISTINCT gh_login, cntrb_id - FROM augur_data.contributors - WHERE gh_login IS NOT NULL + SELECT DISTINCT + gh_login, + cntrb_id + FROM + ( + SELECT DISTINCT + gh_login, + cntrb_id, + data_collection_date + FROM + ( + SELECT DISTINCT + contributors.gh_login, + contributors.cntrb_id, + contributor_repo.data_collection_date :: DATE + FROM + contributor_repo + RIGHT OUTER JOIN contributors ON contributors.cntrb_id = contributor_repo.cntrb_id + AND contributors.gh_login IS NOT NULL + ORDER BY + contributor_repo.data_collection_date :: DATE NULLS FIRST + ) A + ORDER BY + data_collection_date DESC NULLS FIRST + ) b """) - result = engine.execute(cntrb_login_query) + with engine.connect() as connection: + result = connection.execute(cntrb_login_query) - current_cntrb_logins = [dict(row) for row in result] + current_cntrb_logins = [dict(row) for row in result.mappings()] cntrb_newest_events_query = s.sql.text(""" @@ -45,8 +69,10 @@ def contributor_breadth_model() -> None: GROUP BY c.gh_login; """) - cntrb_newest_events_list = engine.execute(cntrb_newest_events_query) - cntrb_newest_events_list = [dict(row) for row in cntrb_newest_events_list] + with engine.connect() as connection: + cntrb_newest_events_list = connection.execute(cntrb_newest_events_query) + + cntrb_newest_events_list = [dict(row) for row in cntrb_newest_events_list.mappings()] cntrb_newest_events_map = {} for cntrb_event in cntrb_newest_events_list: diff --git a/augur/tasks/data_analysis/discourse_analysis/setup.py b/augur/tasks/data_analysis/discourse_analysis/setup.py index 9a4e91c018..37d6557ec5 100644 --- a/augur/tasks/data_analysis/discourse_analysis/setup.py +++ b/augur/tasks/data_analysis/discourse_analysis/setup.py @@ -28,13 +28,13 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy==1.7.3', + 'scipy>=1.10.0', 'nltk==3.6.6', - 'pandas==1.3.5', + 'pandas==1.5.3', 'scikit-learn==1.1.3', 'textblob==0.15.3', - 'python-crfsuite==0.9.8', - 'sklearn-crfsuite==0.3.6', + 'python-crfsuite>=0.9.8', + 'sklearn-crfsuite>=0.3.6', 'tabulate==0.8.9' ], # python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6 tabulate-0.8.9 entry_points={ diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 2febe86360..5a9941679c 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -72,7 +72,9 @@ def discourse_analysis_model(repo_git: str,logger,engine) -> None: """) # result = db.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) - msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, engine, params={"repo_id": repo_id}) + + with engine.connect() as conn: + msg_df_cur_repo = pd.read_sql(get_messages_for_repo_sql, conn, params={"repo_id": repo_id}) msg_df_cur_repo = msg_df_cur_repo.sort_values(by=['thread_id']).reset_index(drop=True) logger.info(msg_df_cur_repo.head()) diff --git a/augur/tasks/data_analysis/insight_worker/setup.py b/augur/tasks/data_analysis/insight_worker/setup.py index 0eb35d8a78..1ee6e8a4bd 100644 --- a/augur/tasks/data_analysis/insight_worker/setup.py +++ b/augur/tasks/data_analysis/insight_worker/setup.py @@ -29,9 +29,9 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 
'click==8.0.3', - 'scipy>=1.7.3', + 'scipy>=1.10.0', 'sklearn==0.0', - 'numpy==1.22.0', + 'numpy==1.26.0', ], entry_points={ 'console_scripts': [ diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/augur/tasks/data_analysis/insight_worker/tasks.py index 7f506c8d12..37ae5f484c 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/augur/tasks/data_analysis/insight_worker/tasks.py @@ -134,13 +134,16 @@ def insight_model(repo_git: str,logger,engine,session) -> None: WHERE repo_insights.ri_metric = to_delete.ri_metric AND repo_insights.ri_field = to_delete.ri_field """) - result = engine.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) + + with engine.connect() as conn: + result = conn.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) # get table values to check for dupes later on table_values_sql = s.sql.text("""SELECT * FROM repo_insights_records WHERE repo_id={}""".format(repo_id)) - insight_table_values = pd.read_sql(table_values_sql, engine, params={}) + with engine.connect() as conn: + insight_table_values = pd.read_sql(table_values_sql, conn, params={}) to_model_columns = df.columns[0:len(metrics) + 1] @@ -257,7 +260,7 @@ def classify_anomalies(df, metric): repo_insight_record_obj.ri_id)) # Send insight to Jonah for slack bot - send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean), logger) + send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean), logger,engine) insight_count += 1 else: @@ -526,8 +529,8 @@ def send_insight(insight, units_from_mean, logger, engine): FROM repo, repo_groups WHERE repo_id = {} """.format(insight['repo_id'])) - - repo = pd.read_sql(repoSQL, engine, params={}).iloc[0] + with engine.connect() as conn: + repo = pd.read_sql(repoSQL, conn, params={}).iloc[0] begin_date = datetime.datetime.now() - datetime.timedelta(days=anomaly_days) dict_date = insight['ri_date'].strftime("%Y-%m-%d %H:%M:%S") @@ -565,7 +568,8 @@ def clear_insights(repo_id, new_endpoint, new_field, logger): AND ri_field = '{}' """.format(repo_id, new_endpoint, new_field) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) @@ -582,7 +586,8 @@ def clear_insights(repo_id, new_endpoint, new_field, logger): AND ri_field = '{}' """.format(repo_id, new_endpoint, new_field) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) @@ -602,7 +607,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): AND ri_field = '{}' ORDER BY ri_score DESC """.format(repo_id, new_metric, new_field)) - rec = json.loads(pd.read_sql(recordSQL, engine, params={}).to_json(orient='records')) + with engine.connect() as conn: + rec = json.loads(pd.read_sql(recordSQL, conn, params={}).to_json(orient='records')) logger.info("recordsql: {}, \n{}".format(recordSQL, rec)) # If new score is higher, continue with deletion if len(rec) > 0: @@ -623,7 +629,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): AND ri_field = '{}' """.format(record['repo_id'], record['ri_metric'], record['ri_field']) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) else: @@ -637,7 +644,8 @@ def 
clear_insight(repo_id, new_score, new_metric, new_field, logger): WHERE repo_id = {} ORDER BY ri_score ASC """.format(repo_id)) - ins = json.loads(pd.read_sql(insightSQL, engine, params={}).to_json(orient='records')) + with engine.connect() as conn: + ins = json.loads(pd.read_sql(insightSQL, conn, params={}).to_json(orient='records')) logger.info("This repos insights: {}".format(ins)) # Determine if inisghts need to be deleted based on if there are more insights than we want stored, @@ -675,7 +683,8 @@ def clear_insight(repo_id, new_score, new_metric, new_field, logger): AND ri_metric = '{}' """.format(insight['repo_id'], insight['ri_metric']) try: - result = engine.execute(deleteSQL) + with engine.connect() as conn: + result = conn.execute(deleteSQL) except Exception as e: logger.info("Error occured deleting insight slot: {}".format(e)) @@ -744,7 +753,9 @@ def filter_duplicates(cols, tables, og_data, logger, engine): colSQL = s.sql.text(""" SELECT {} FROM {} """.format(col, table_str)) - values = pd.read_sql(colSQL, engine, params={}) + + with engine.connect() as conn: + values = pd.read_sql(colSQL, conn, params={}) for obj in og_data: if values.isin([obj[cols[col]]]).any().any(): diff --git a/augur/tasks/data_analysis/message_insights/setup.py b/augur/tasks/data_analysis/message_insights/setup.py index f1ac484fc1..a4f6a30c43 100644 --- a/augur/tasks/data_analysis/message_insights/setup.py +++ b/augur/tasks/data_analysis/message_insights/setup.py @@ -30,22 +30,22 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy==1.7.3', + 'scipy>=1.10.0', 'scikit-learn==1.1.3', #0.24.2', - 'numpy==1.22.0', + 'numpy==1.26.0', 'nltk==3.6.6', - 'pandas==1.3.5', + 'pandas==1.5.3', 'emoji==1.2.0', - 'Keras', #<2.9.0rc0', - 'Keras-Preprocessing', #==1.1.2', - 'tensorflow', #==2.8.0', - #'h5py~=3.6.0', + 'keras>=2.15.0', + 'Keras-Preprocessing', + 'tensorflow==2.15.0', + 'h5py==3.10.0', 'scikit-image==0.19.1', - 'joblib==1.0.1', + 'joblib==1.2.0', 'xgboost', 'bs4==0.0.1', 'xlrd==2.0.1', - 'gensim==4.2.0' + 'gensim>=4.2.0' ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/augur/tasks/data_analysis/message_insights/tasks.py index 1acec976c3..4727d3def7 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/augur/tasks/data_analysis/message_insights/tasks.py @@ -59,7 +59,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: repo_exists_SQL = s.sql.text(""" SELECT exists (SELECT 1 FROM augur_data.message_analysis_summary WHERE repo_id = :repo_id LIMIT 1)""") - df_rep = pd.read_sql_query(repo_exists_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_rep = pd.read_sql_query(repo_exists_SQL, conn, params={'repo_id': repo_id}) #full_train = not(df_rep['exists'].iloc[0]) logger.info(f'Full Train: {full_train}') @@ -84,7 +85,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: where message.repo_id = :repo_id """) - df_past = pd.read_sql_query(past_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_past = pd.read_sql_query(past_SQL, conn, params={'repo_id': repo_id}) df_past['msg_timestamp'] = pd.to_datetime(df_past['msg_timestamp']) df_past = df_past.sort_values(by='msg_timestamp') @@ -124,7 +126,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where message.repo_id = 
:repo_id""") - df_message = pd.read_sql_query(join_SQL, engine, params={'repo_id': repo_id, 'begin_date': begin_date}) + with engine.connect() as conn: + df_message = pd.read_sql_query(join_SQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date}) logger.info(f'Messages dataframe dim: {df_message.shape}') logger.info(f'Value 1: {df_message.shape[0]}') @@ -159,7 +162,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") - df_past = pd.read_sql_query(merge_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_past = pd.read_sql_query(merge_SQL, conn, params={'repo_id': repo_id}) df_past = df_past.loc[df_past['novelty_flag'] == 0] rec_errors = df_past['reconstruction_error'].tolist() threshold = threshold_otsu(np.array(rec_errors)) @@ -345,7 +349,8 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: FROM message_analysis_summary WHERE repo_id=:repo_id""") - df_past = pd.read_sql_query(message_analysis_query, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_past = pd.read_sql_query(message_analysis_query, conn, params={'repo_id': repo_id}) # df_past = get_table_values(cols=['period', 'positive_ratio', 'negative_ratio', 'novel_count'], # tables=['message_analysis_summary'], @@ -414,12 +419,13 @@ def send_insight(repo_id, insights, logger, engine): WHERE repo_id = {} """.format(repo_id)) - repo = pd.read_sql(repoSQL, engine, params={}).iloc[0] + with engine.connect() as conn: + repo = pd.read_sql(repoSQL, conn, params={}).iloc[0] to_send = { 'message_insight': True, 'repo_git': repo['repo_git'], - 'insight_begin_date': begin_date.strftime("%Y-%m-%d %H:%M:%S"), + 'insight_begin_date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), # date from when insights are calculated 'sentiment': insights[0], # sentiment insight dict 'novelty': insights[1], # novelty insight dict @@ -449,13 +455,14 @@ def get_max_id(table, column, logger, engine, default=25150): SELECT max({0}.{1}) AS {1} FROM {0} """.format(table, column)) - rs = pd.read_sql(max_id_sql, engine, params={}) + + with engine.connect() as conn: + rs = pd.read_sql(max_id_sql, conn, params={}) if rs.iloc[0][column] is not None: max_id = int(rs.iloc[0][column]) + 1 logger.info("Found max id for {} column in the {} table: {}\n".format(column, table, max_id)) else: max_id = default - logger.warning("Could not find max id for {} column in the {} table... " + - "using default set to: {}\n".format(column, table, max_id)) + logger.warning(f"Could not find max id for {column} column in the {table} table... 
using default set to: {max_id}\n") return max_id diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py index dc13c94bf9..3341f24ff1 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py @@ -29,12 +29,12 @@ def read(filename): 'psycopg2-binary==2.9.3', 'sklearn==0.0', 'nltk==3.6.6', - 'numpy==1.22.0', - 'pandas==1.3.5', + 'numpy==1.26.0', + 'pandas==1.5.3', 'emoji==1.2.0', - 'joblib==1.0.1', + 'joblib==1.2.0', 'xgboost==1.4.2', - 'scipy==1.7.3' + 'scipy>=1.10.0' ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index c2816bed8c..9d6d5be78e 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -74,8 +74,8 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: and pull_requests.repo_id = :repo_id and pr_src_state like 'open' """) - - df_pr = pd.read_sql_query(pr_SQL, engine, params={'begin_date': begin_date, 'repo_id': repo_id}) + with engine.connect() as conn: + df_pr = pd.read_sql_query(pr_SQL, conn, params={'begin_date': begin_date, 'repo_id': repo_id}) logger.info(f'PR Dataframe dim: {df_pr.shape}\n') @@ -106,15 +106,16 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: select message.msg_id, msg_timestamp, msg_text, message.cntrb_id from augur_data.message left outer join augur_data.issue_message_ref on message.msg_id = issue_message_ref.msg_id left outer join augur_data.issues on issue_message_ref.issue_id = issues.issue_id where issue_message_ref.repo_id = :repo_id""") - - df_message = pd.read_sql_query(messages_SQL, engine, params={'repo_id': repo_id}) + with engine.connect() as conn: + df_message = pd.read_sql_query(messages_SQL, conn, params={'repo_id': repo_id}) logger.info(f'Mapping messages to PR, find comment & participants counts') # Map PR to its corresponding messages pr_ref_sql = s.sql.text("select * from augur_data.pull_request_message_ref") - df_pr_ref = pd.read_sql_query(pr_ref_sql, engine) + with engine.connect() as conn: + df_pr_ref = pd.read_sql_query(pr_ref_sql, conn) df_merge = pd.merge(df_pr, df_pr_ref, on='pull_request_id', how='left') df_merge = pd.merge(df_merge, df_message, on='msg_id', how='left') df_merge = df_merge.dropna(subset=['msg_id'], axis=0) @@ -167,7 +168,9 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: SELECT repo_id, pull_requests_merged, pull_request_count,watchers_count, last_updated FROM augur_data.repo_info where repo_id = :repo_id """) - df_repo = pd.read_sql_query(repo_sql, engine, params={'repo_id': repo_id}) + + with engine.connect() as conn: + df_repo = pd.read_sql_query(repo_sql, conn, params={'repo_id': repo_id}) df_repo = df_repo.loc[df_repo.groupby('repo_id').last_updated.idxmax(), :] df_repo = df_repo.drop(['last_updated'], axis=1) diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index 53b29ddbd2..f04d01552b 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -6,27 +6,181 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession +from augur.application.logs import AugurLogger 
@celery.task def refresh_materialized_views(): + #self.logger = AugurLogger("data_collection_jobs").get_logger() + from augur.tasks.init.celery_app import engine logger = logging.getLogger(refresh_materialized_views.__name__) + #self.logger = logging.getLogger(refresh_materialized_views.__name__) + + mv1_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repo_prs with data; + COMMIT; + """) + + mv2_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repos_commits with data; + COMMIT; + """) + + mv3_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.api_get_all_repos_issues with data; + COMMIT; + """) + + mv4_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.augur_new_contributors with data; + COMMIT; + """) + mv5_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_commits_and_committers_daily_count with data; + COMMIT; + """) + + mv6_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_new_contributors with data; + COMMIT; + """) + + mv7_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_entry_list with data; + COMMIT; + """) + + mv8_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_contributor_actions with data; + COMMIT; + """) + + mv9_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_user_repos with data; + COMMIT; + """) + + mv10_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response_times with data; + COMMIT; + """) - refresh_view_query = s.sql.text(""" - REFRESH MATERIALIZED VIEW augur_data.api_get_all_repos_issues with data; - REFRESH MATERIALIZED VIEW augur_data.explorer_commits_and_committers_daily_count with data; - REFRESH MATERIALIZED VIEW augur_data.api_get_all_repos_commits with data; - REFRESH MATERIALIZED VIEW augur_data.augur_new_contributors with data; - REFRESH MATERIALIZED VIEW augur_data.explorer_contributor_actions with data; - REFRESH MATERIALIZED VIEW augur_data.explorer_libyear_all with data; - REFRESH MATERIALIZED VIEW augur_data.explorer_libyear_detail with data; - REFRESH MATERIALIZED VIEW augur_data.explorer_new_contributors with data; - REFRESH MATERIALIZED VIEW augur_data.explorer_libyear_summary with data; + mv11_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_assignments with data; + COMMIT; + """) + + mv12_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_issue_assignments with data; + COMMIT; + """) + + mv13_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response with data; + COMMIT; """) - with DatabaseSession(logger, engine) as session: + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv1_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv2_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv3_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv4_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with 
DatabaseSession(logger, engine) as session: + session.execute_sql(mv5_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv6_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv7_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv8_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv9_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv10_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv11_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv12_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv13_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + + + - session.execute_sql(refresh_view_query) \ No newline at end of file diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index b8eb8b203c..fffd79d330 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -30,15 +30,15 @@ def add_org_repo_list(user_id, group_name, urls): valid_repos = [] for url in urls: - # matches https://github.com/{org}/ or htts://github.com/{org} + # matches https://github.com/{org}/ or http://github.com/{org} if Repo.parse_github_org_url(url): - added = user.add_org(group_name, url)[0] + added = user.add_github_org(group_name, url)[0] if added: valid_orgs.append(url) - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + # matches https://github.com/{org}/{repo}/ or http://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: - added = user.add_repo(group_name, url)[0] + added = user.add_github_repo(group_name, url)[0] if added: valid_repos.append(url) @@ -46,7 +46,7 @@ def add_org_repo_list(user_id, group_name, urls): elif (match := parse_org_and_repo_name(url)): org, repo = match.groups() repo_url = f"https://github.com/{org}/{repo}/" - added = user.add_repo(group_name, repo_url)[0] + added = user.add_github_repo(group_name, repo_url)[0] if added: valid_repos.append(url) @@ -54,9 +54,17 @@ def add_org_repo_list(user_id, group_name, urls): elif (match := parse_org_name(url)): org = match.group(1) org_url = f"https://github.com/{org}/" - added = user.add_org(group_name, org_url)[0] + added = user.add_github_org(group_name, org_url)[0] if added: valid_orgs.append(url) + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + added = user.add_gitlab_repo(group_name, url)[0] + if added: + valid_repos.append(url) + else: invalid_urls.append(url) @@ -66,24 +74,25 @@ def add_org_repo_list(user_id, group_name, urls): - +# TODO: Change to github specific @celery.task def add_repo(user_id, group_name, repo_url): logger = logging.getLogger(add_org.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, user_id, 
group_name) + result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) print(repo_url, result) +# TODO: Change to github specific @celery.task def add_org(user_id, group_name, org_url): logger = logging.getLogger(add_org.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, user_id, group_name) + result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) print(org_url, result) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 4fdb3955ed..9176a93283 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -12,9 +12,10 @@ from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.application.db.util import execute_session_query from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc +from augur.tasks.util.worker_util import parse_json_from_subprocess_call def generate_deps_data(session, repo_id, path): - """Runs scc on repo and stores data in database + """Run dependency logic on repo and stores data in database :param repo_id: Repository ID :param path: Absolute path of the Repostiory """ @@ -46,22 +47,16 @@ def generate_deps_data(session, repo_id, path): session.logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") +""" +def deps_model(session, repo_id,repo_git,repo_path,repo_name): + # Data collection and storage method -def deps_model(session, repo_id,repo_git,repo_group_id): - """ Data collection and storage method - """ session.logger.info(f"This is the deps model repo: {repo_git}.") - #result = session.execute_sql(repo_path_sql) - result = re.search(r"https:\/\/(github\.com\/[A-Za-z0-9 \- _]+\/)([A-Za-z0-9 \- _ .]+)$", repo_git).groups() - - relative_repo_path = f"{repo_group_id}/{result[0]}{result[1]}" - config = AugurConfig(session.logger, session) - absolute_repo_path = config.get_section("Facade")['repo_directory'] + relative_repo_path - generate_deps_data(session,repo_id, absolute_repo_path) +""" def generate_scorecard(session,repo_id,path): """Runs scorecard on repo and stores data in database @@ -86,16 +81,8 @@ def generate_scorecard(session,repo_id,path): key_handler = GithubApiKeyHandler(session) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - p= subprocess.run(['./scorecard', command, '--format=json'], cwd= path_to_scorecard ,capture_output=True, text=True, timeout=None) - session.logger.info('subprocess completed successfully... ') - output = p.stdout - - try: - required_output = json.loads(output) - except json.decoder.JSONDecodeError as e: - session.logger.error(f"Could not parse required output! 
\n output: {output} \n Error: {e}") - return - + required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + session.logger.info('adding to database...') session.logger.debug(f"output: {required_output}") diff --git a/augur/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py index 0d9da19ba4..8159d62e31 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/kotlin_deps.py @@ -6,7 +6,7 @@ def get_files(path): files = list(p.glob('**/*.kt')) return files -def get_imports_for_file(path): +def get_deps_for_file(path): with open(path, 'r') as f: content = f.read() matches = re.findall('import\s+(.*?)(?:;|\n)', content, re.DOTALL) diff --git a/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py index 7b5fbdfce7..92380d8098 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py @@ -9,7 +9,8 @@ def get_files(path): files = list(p.glob('**/*.rs')) return files -def get_imports_for_file(path): +def get_deps_for_file(path): + #gets imports in specified file path. with open(path, 'r') as f: content = f.read() matches = re.findall(r'use\s+([\w:]+)(\s+as\s+([\w:]+))?(\s*\*\s*)?(;|\n)', content) @@ -17,5 +18,4 @@ def get_imports_for_file(path): for m in matches: import_path = m[0] imports.append(import_path) - return imports - + return imports \ No newline at end of file diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 0cdd333b25..5e7c1d846f 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -3,8 +3,10 @@ from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurCoreRepoCollectionTask +from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.util import execute_session_query +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from augur.application.config import AugurConfig @celery.task(base=AugurFacadeRepoCollectionTask) @@ -21,10 +23,17 @@ def process_dependency_metrics(repo_git): repo = execute_session_query(query,'one') - deps_model(session, repo.repo_id,repo_git,repo.repo_group_id) + + config = AugurConfig(session.logger, session) + + absolute_repo_path = get_absolute_repo_path(config.get_section("Facade")['repo_directory'],repo.repo_id,repo.repo_path,repo.repo_name) + + session.logger.debug(f"This is the deps model repo: {repo_git}.") + + generate_deps_data(session,repo.repo_id,absolute_repo_path) -@celery.task(base=AugurCoreRepoCollectionTask) +@celery.task(base=AugurSecondaryRepoCollectionTask) def process_ossf_dependency_metrics(repo_git): from augur.tasks.init.celery_app import engine diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index d407011b06..ee3dc047ff 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -23,15 +23,16 @@ from datetime import timedelta import sqlalchemy as s -from sqlalchemy import or_, 
and_, update +from sqlalchemy import or_, and_, update, insert -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import update_repo_log, trim_commit, store_working_author, trim_author +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import update_repo_log, trim_commits, store_working_author, trim_author from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor, get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor, get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits from augur.tasks.github.facade_github.tasks import * -from augur.tasks.util.collection_util import CollectionState, get_collection_status_repo_git_from_filter +from augur.tasks.util.collection_state import CollectionState +from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize @@ -46,6 +47,7 @@ from augur.tasks.git.dependency_tasks.tasks import process_dependency_metrics from augur.tasks.git.dependency_libyear_tasks.tasks import process_libyear_dependency_metrics +from augur.tasks.git.scc_value_tasks.tasks import process_scc_value_metrics from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api from augur.tasks.github.util.gh_graphql_entities import PullRequest @@ -124,16 +126,9 @@ def update_analysis_log(repos_id,status): # If there's a commit still there, the previous run was interrupted and # the commit data may be incomplete. It should be trimmed, just in case. - for commit in working_commits: - trim_commit(session, repo_id,commit['working_commit']) - - # Remove the working commit. - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND - working_commit = :commit""").bindparams(repo_id=repo_id,commit=commit['working_commit']) - session.execute_sql(remove_commit) - session.log_activity('Debug',f"Removed working commit: {commit['working_commit']}") - + commits_to_trim = [commit['working_commit'] for commit in working_commits] + + trim_commits(session,repo_id,commits_to_trim) # Start the main analysis update_analysis_log(repo_id,'Collecting data') @@ -193,8 +188,8 @@ def update_analysis_log(repos_id,status): - for commit in trimmed_commits: - trim_commit(session,repo_id,commit) + #for commit in trimmed_commits: + trim_commits(session,repo_id,trimmed_commits) update_analysis_log(repo_id,'Commit trimming complete') @@ -255,20 +250,20 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: missing_commits = parent_commits - existing_commits session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") + - queue = [] - if len(missing_commits) > 0: + if not len(missing_commits): #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) - - #encode the repo_id with the commit. 
- commits = list(missing_commits) - #Get all missing commits into one large list to split into task pools - queue.extend(commits) - else: return + + queue = list(missing_commits) logger.info(f"Got to analysis!") - + absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) + repo_loc = (f"{absoulte_path}/.git") + + pendingCommitRecordsToInsert = [] + for count, commitTuple in enumerate(queue): quarterQueue = int(len(queue) / 4) @@ -281,13 +276,27 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: #logger.info(f"Got to analysis!") - absoulte_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_id, repo.repo_path,repo.repo_name) - repo_loc = (f"{absoulte_path}/.git") + commitRecords = analyze_commit(session, repo_id, repo_loc, commitTuple) + #logger.debug(commitRecord) + if len(commitRecords): + pendingCommitRecordsToInsert.extend(commitRecords) + if len(pendingCommitRecordsToInsert) >= 1000: + facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) + pendingCommitRecordsToInsert = [] + + facade_bulk_insert_commits(session,pendingCommitRecordsToInsert) - analyze_commit(session, repo_id, repo_loc, commitTuple) + - logger.info("Analysis complete") + + # Remove the working commit. + remove_commit = s.sql.text("""DELETE FROM working_commits + WHERE repos_id = :repo_id AND working_commit IN :hashes + """).bindparams(repo_id=repo_id,hashes=tuple(queue)) + session.execute_sql(remove_commit) + + logger.info("Analysis complete") return @celery.task @@ -389,7 +398,7 @@ def clone_repos(): # with FacadeSession(logger) as session: # check_for_repo_updates(session, repo_git) -@celery.task +@celery.task(base=AugurFacadeRepoCollectionTask) def git_update_commit_count_weight(repo_git): from augur.tasks.init.celery_app import engine @@ -402,7 +411,7 @@ def git_update_commit_count_weight(repo_git): update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) -@celery.task +@celery.task(base=AugurFacadeRepoCollectionTask) def git_repo_updates_facade_task(repo_git): logger = logging.getLogger(git_repo_updates_facade_task.__name__) @@ -526,7 +535,8 @@ def facade_phase(repo_git): group( chain(*facade_core_collection), process_dependency_metrics.si(repo_git), - process_libyear_dependency_metrics.si(repo_git) + process_libyear_dependency_metrics.si(repo_git), + process_scc_value_metrics.si(repo_git) ) ) diff --git a/augur/tasks/git/scc_value_tasks/__init__.py b/augur/tasks/git/scc_value_tasks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py new file mode 100644 index 0000000000..5fd7afb7b8 --- /dev/null +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -0,0 +1,57 @@ +from datetime import datetime +import logging +import requests +import json +import os +import subprocess +import re +import traceback +from augur.application.db.models import * +from augur.application.db.session import DatabaseSession +from augur.application.config import AugurConfig +from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler +from augur.application.db.util import execute_session_query +from augur.tasks.util.worker_util import parse_json_from_subprocess_call + +def value_model(session,repo_git,repo_id, path): + """Runs scc on repo and stores data in database + :param repo_id: Repository ID + :param path: absolute file path of the Repostiory + """ + + session.logger.info('Generating value 
data for repo') + session.logger.info(f"Repo ID: {repo_id}, Path: {path}") + session.logger.info('Running scc...') + + path_to_scc = os.environ['HOME'] + '/scc' + + required_output = parse_json_from_subprocess_call(session.logger,['./scc', '-f','json','--by-file', path], cwd=path_to_scc) + + session.logger.info('adding scc data to database... ') + session.logger.debug(f"output: {required_output}") + + to_insert = [] + for record in required_output: + for file in record['Files']: + repo_labor = { + 'repo_id': repo_id, + 'rl_analysis_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'), + 'programming_language': file['Language'], + 'file_path': file['Location'], + 'file_name': file['Filename'], + 'total_lines': file['Lines'], + 'code_lines': file['Code'], + 'comment_lines': file['Comment'], + 'blank_lines': file['Blank'], + 'code_complexity': file['Complexity'], + 'repo_url': repo_git, + 'tool_source': 'value_model', + 'data_source': 'Git', + 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + } + + to_insert.append(repo_labor) + + session.insert_data(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name" ]) + + session.logger.info(f"Done generating scc data for repo {repo_id} from path {path}") diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py new file mode 100644 index 0000000000..a2e4d11fc8 --- /dev/null +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -0,0 +1,28 @@ +import logging +import traceback +from augur.application.db.session import DatabaseSession +from augur.tasks.git.scc_value_tasks.core import * +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurCoreRepoCollectionTask +from augur.application.db.util import execute_session_query +from augur.application.config import AugurConfig +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path + + +@celery.task(base=AugurFacadeRepoCollectionTask) +def process_scc_value_metrics(repo_git): + + from augur.tasks.init.celery_app import engine + + logger = logging.getLogger(process_scc_value_metrics.__name__) + + with DatabaseSession(logger,engine) as session: + logger.info(f"repo_git: {repo_git}") + + query = session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + config = AugurConfig(session.logger, session) + absolute_repo_path = get_absolute_repo_path(config.get_section("Facade")['repo_directory'],repo.repo_id,repo.repo_path,repo.repo_name) + + value_model(session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index 2126d1ee9a..285ec6c780 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -103,7 +103,7 @@ def discover_alias(email): else: return email - def store_commit(repos_id,commit,filename, + def generate_commit_record(repos_id,commit,filename, author_name,author_email,author_date,author_timestamp, committer_name,committer_email,committer_date,committer_timestamp, added,removed, whitespace): @@ -122,72 +122,31 @@ def store_commit(repos_id,commit,filename, #2021-10-11 11:57:46 -0500 placeholder_date = "1970-01-01 00:00:15 -0500" - #session.logger.info(f"Timestamp: {author_timestamp}") commit_record = { 
'repo_id' : repos_id, - 'commit' : str(commit), - 'filename' : filename, - 'author_name' : str(author_name), - 'author_email_raw' : author_email, - 'author_email' : discover_alias(author_email), - 'author_date' : author_date, - 'author_timestamp' : author_timestamp if len(author_timestamp.replace(" ", "")) != 0 else placeholder_date, - 'committer_name' : committer_name, - 'committer_email_raw' : committer_email, - 'committer_email' : discover_alias(committer_email), - 'committer_date' : committer_date if len(committer_date.replace(" ", "")) != 0 else placeholder_date, - 'committer_timestamp' : committer_timestamp if len(committer_timestamp.replace(" ","")) != 0 else placeholder_date, - 'added' : added, - 'removed' : removed, - 'whitespace' : whitespace, - 'committer_date' : committer_date if len(committer_date.replace(" ","")) != 0 else placeholder_date, + 'cmt_commit_hash' : str(commit), + 'cmt_filename' : filename, + 'cmt_author_name' : str(author_name), + 'cmt_author_raw_email' : author_email, + 'cmt_author_email' : discover_alias(author_email), + 'cmt_author_date' : author_date, + 'cmt_author_timestamp' : author_timestamp if len(author_timestamp.replace(" ", "")) != 0 else placeholder_date, + 'cmt_committer_name' : committer_name, + 'cmt_committer_raw_email' : committer_email, + 'cmt_committer_email' : discover_alias(committer_email), + 'cmt_committer_date' : committer_date if len(committer_date.replace(" ", "")) != 0 else placeholder_date, + 'cmt_committer_timestamp' : committer_timestamp if len(committer_timestamp.replace(" ","")) != 0 else placeholder_date, + 'cmt_added' : added, + 'cmt_removed' : removed, + 'cmt_whitespace' : whitespace, + 'cmt_date_attempted' : committer_date if len(committer_date.replace(" ","")) != 0 else placeholder_date, 'tool_source' : "Facade", 'tool_version' : "0.42", 'data_source' : "git" } - #TODO: replace with a postgres on conflict do nothing. 
- IM 10/11/22 - store = s.sql.text("""INSERT INTO commits (repo_id,cmt_commit_hash,cmt_filename, - cmt_author_name,cmt_author_raw_email,cmt_author_email,cmt_author_date,cmt_author_timestamp, - cmt_committer_name,cmt_committer_raw_email,cmt_committer_email,cmt_committer_date,cmt_committer_timestamp, - cmt_added,cmt_removed,cmt_whitespace, cmt_date_attempted, tool_source, tool_version, data_source) - VALUES (:repo_id,:commit,:filename,:author_name,:author_email_raw,:author_email,:author_date,:author_timestamp, - :committer_name,:committer_email_raw,:committer_email,:committer_date,:committer_timestamp, - :added,:removed,:whitespace,:committer_date,:tool_source,:tool_version,:data_source) - """).bindparams(**commit_record) - - try: - session.execute_sql(store) - except DataError as e: - session.logger.error(f"Ran into bad data when trying to insert commit with values: \n {commit_record} \n Error: {e}") - - #Check for improper utc timezone offset - #UTC timezone offset should be betwen -14:00 and +14:00 - - if "time zone displacement" in f"{e}": - commit_record['author_timestamp'] = placeholder_date - commit_record['committer_timestamp'] = placeholder_date - - store = s.sql.text("""INSERT INTO commits (repo_id,cmt_commit_hash,cmt_filename, - cmt_author_name,cmt_author_raw_email,cmt_author_email,cmt_author_date,cmt_author_timestamp, - cmt_committer_name,cmt_committer_raw_email,cmt_committer_email,cmt_committer_date,cmt_committer_timestamp, - cmt_added,cmt_removed,cmt_whitespace, cmt_date_attempted, tool_source, tool_version, data_source) - VALUES (:repo_id,:commit,:filename,:author_name,:author_email_raw,:author_email,:author_date,:author_timestamp, - :committer_name,:committer_email_raw,:committer_email,:committer_date,:committer_timestamp, - :added,:removed,:whitespace,:committer_date,:tool_source,:tool_version,:data_source) - """).bindparams(**commit_record) - - session.execute_sql(store) - else: - raise e - except Exception as e: - - session.logger.error(f"Ran into issue when trying to insert commit with values: \n {commit_record} \n Error: {e}") - raise e - - - #session.log_activity('Debug',f"Stored commit: {commit}") + return commit_record ### The real function starts here ### @@ -199,6 +158,8 @@ def store_commit(repos_id,commit,filename, removed = 0 whitespace = 0 + recordsToInsert = [] + # Go get the contributors (committers) for this repo here: # curl https://api.github.com/repos/chaoss/augur/contributors # Load the contributors @@ -297,10 +258,10 @@ def store_commit(repos_id,commit,filename, if not header: - store_commit(repo_id,commit,filename, + recordsToInsert.append(generate_commit_record(repo_id,commit,filename, author_name,author_email,author_date,author_timestamp, committer_name,committer_email,committer_date,committer_timestamp, - added,removed,whitespace) + added,removed,whitespace)) header = False @@ -356,19 +317,10 @@ def store_commit(repos_id,commit,filename, whitespaceCheck.append(line[1:].strip()) # Store the last stats from the git log - store_commit(repo_id,commit,filename, + recordsToInsert.append(generate_commit_record(repo_id,commit,filename, author_name,author_email,author_date,author_timestamp, committer_name,committer_email,committer_date,committer_timestamp, - added,removed,whitespace) - - # Remove the working commit. 
- try: - remove_commit = s.sql.text("""DELETE FROM working_commits - WHERE repos_id = :repo_id AND working_commit = :hash - """).bindparams(repo_id=repo_id,hash=commit) - session.execute_sql(remove_commit) - - #session.log_activity('Debug',f"Completed and removed working commit: {commit}") - except: - session.log_activity('Info', f"Working Commit: {commit}") - # If multithreading, clean up the local database + added,removed,whitespace)) + + + return recordsToInsert diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py index a88fd940a3..909c418094 100755 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py @@ -30,7 +30,7 @@ import sys, platform, imp, time, datetime, html.parser, subprocess, os, getopt, xlsxwriter, configparser, logging from multiprocessing import Process, Queue from .config import FacadeSession as FacadeSession -from .utilitymethods import trim_commit, store_working_author, trim_author +from .utilitymethods import trim_commits, store_working_author, trim_author from .analyzecommit import analyze_commit from .postanalysiscleanup import git_repo_cleanup from .repofetch import git_repo_initialize, check_for_repo_updates, force_repo_updates, force_repo_analysis, git_repo_updates diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index e74c33a82e..03206b0242 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -37,7 +37,7 @@ import xlsxwriter import configparser import sqlalchemy as s -from .utilitymethods import update_repo_log, trim_commit, store_working_author, trim_author +from .utilitymethods import update_repo_log, trim_commits, store_working_author, trim_author # if platform.python_implementation() == 'PyPy': # import pymysql # else: diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index c738976e41..35110239bf 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -38,7 +38,7 @@ import configparser import pathlib import sqlalchemy as s -from .utilitymethods import update_repo_log, trim_commit, store_working_author, trim_author, get_absolute_repo_path +from .utilitymethods import update_repo_log, trim_commits, store_working_author, trim_author, get_absolute_repo_path from augur.application.db.models.augur_data import * from augur.application.db.models.augur_operations import CollectionStatus from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index e73a202e97..aef4e59989 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -38,6 +38,7 @@ import xlsxwriter import configparser import sqlalchemy as s +from sqlalchemy.exc import IntegrityError, DataError from .config import get_database_args_from_env from augur.application.db.models import * from .config import FacadeSession as FacadeSession @@ -60,19 +61,28 @@ def 
update_repo_log(session, repos_id,status): session.logger.error(f"Ran into error in update_repo_log: {e}") pass -def trim_commit(session, repo_id,commit): +def trim_commits(session, repo_id,commits): -# Quickly remove a given commit + # Quickly remove a given commit - remove_commit = s.sql.text("""DELETE FROM commits - WHERE repo_id=:repo_id - AND cmt_commit_hash=:hash""").bindparams(repo_id=repo_id,hash=commit) - - - - session.execute_sql(remove_commit) + if len(commits): + remove_commit = s.sql.text("""DELETE FROM commits + WHERE repo_id=:repo_id + AND cmt_commit_hash IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) + + + session.execute_sql(remove_commit) + + # Remove the working commit. + remove_commit = s.sql.text("""DELETE FROM working_commits + WHERE repos_id = :repo_id AND + working_commit IN :hashes""").bindparams(repo_id=repo_id,hashes=tuple(commits)) + + session.execute_sql(remove_commit) - session.log_activity('Debug',f"Trimmed commit: {commit}") + for commit in commits: + session.log_activity('Debug',f"Trimmed commit: {commit}") + session.log_activity('Debug',f"Removed working commit: {commit}") def store_working_author(session, email): @@ -205,3 +215,43 @@ def update_facade_scheduling_fields(session, repo_git, weight, commit_count): session.execute(update_query) session.commit() + +def facade_bulk_insert_commits(session,records): + + try: + session.execute( + s.insert(Commit), + records, + ) + session.commit() + except Exception as e: + + if len(records) > 1: + session.logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") + + #split list into halves and retry insert until we isolate offending record + firsthalfRecords = records[:len(records)//2] + secondhalfRecords = records[len(records)//2:] + + facade_bulk_insert_commits(session,firsthalfRecords) + facade_bulk_insert_commits(session,secondhalfRecords) + elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": + commit_record = records[0] + #replace incomprehensible dates with epoch. 
+ #2021-10-11 11:57:46 -0500 + placeholder_date = "1970-01-01 00:00:15 -0500" + + #Check for improper utc timezone offset + #UTC timezone offset should be between -14:00 and +14:00 + + commit_record['cmt_author_timestamp'] = placeholder_date + commit_record['cmt_committer_timestamp'] = placeholder_date + + session.execute( + s.insert(Commit), + [commit_record], + ) + session.commit() + else: + raise e + diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 304574bc80..cf7d2d1e5a 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,20 +6,24 @@ from augur.tasks.github.util.util import parse_json_response import logging from datetime import datetime -from enum import Enum +from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" -def update_repo_with_dict(current_dict,new_dict,logger,db): - +def update_repo_with_dict(repo,new_dict,logger,db): + """ + Update a repository record in the database using a dictionary tagged with + the appropriate table fields + + Args: + repo: orm repo object to update + new_dict: dict of new values to add to the repo record + logger: logging object + db: db object + """ - to_insert = current_dict + to_insert = repo.__dict__ del to_insert['_sa_instance_state'] to_insert.update(new_dict) @@ -45,7 +49,6 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' owner, name = get_owner_repo(repo.repo_git) url = f"https://api.github.com/repos/{owner}/{name}" - current_repo_dict = repo.__dict__ attempts = 0 while attempts < 10: @@ -56,64 +59,71 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' attempts += 1 - #Mark as errored if not found - if response_from_gh.status_code == 404: - logger.error(f"Repo {repo.repo_git} responded 404 when pinged!") + #Update Url and retry if 301 + #301 moved permanently + if response_from_gh.status_code == 301: + + owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger) + try: + old_description = str(repo.description) + except Exception: + old_description = "" + + #Create new repo object to update existing repo_update_dict = { - 'repo_git': repo.repo_git, - 'repo_path': None, - 'repo_name': None, - 'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted", - 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + 'repo_git': f"https://github.com/{owner}/{name}", + 'repo_path': None, + 'repo_name': None, + 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(current_repo_dict, repo_update_dict, logger, augur_db) - - raise Exception(f"ERROR: Repo not found at requested host {repo.repo_git}") - elif attempts >= 10: - logger.warning(f"Could not check if repo moved because the api timed out 10 times. Url: {url}") - return - + update_repo_with_dict(repo, repo_update_dict, logger,augur_db) - #skip if not moved - #301 moved permanently - if response_from_gh.status_code != 301: - logger.info(f"Repo found at url: {url}") - return + raise Exception("ERROR: Repo has moved! 
Resetting Collection!") - owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger) - - - try: - old_description = str(repo.description) - except: - old_description = "" + #Mark as ignore if 404 + if response_from_gh.status_code == 404: + repo_update_dict = { + 'repo_git': repo.repo_git, + 'repo_path': None, + 'repo_name': None, + 'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted", + 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + } - #Create new repo object to update existing - repo_update_dict = { - 'repo_git': f"https://github.com/{owner}/{name}", - 'repo_path': None, - 'repo_name': None, - 'description': f"(Originally hosted at {url}) {old_description}" - } + update_repo_with_dict(repo, repo_update_dict, logger, augur_db) - update_repo_with_dict(current_repo_dict, repo_update_dict, logger,augur_db) + statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) - statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + collectionRecord = execute_session_query(statusQuery,'one') - collectionRecord = execute_session_query(statusQuery,'one') - if collection_hook == 'core': - collectionRecord.core_status = CollectionState.PENDING.value + collectionRecord.core_status = CollectionState.IGNORE.value collectionRecord.core_task_id = None collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - elif collection_hook == 'secondary': - collectionRecord.secondary_status = CollectionState.PENDING.value + + collectionRecord.secondary_status = CollectionState.IGNORE.value collectionRecord.secondary_task_id = None collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - augur_db.session.commit() + collectionRecord.facade_status = CollectionState.IGNORE.value + collectionRecord.facade_task_id = None + collectionRecord.facade_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - raise Exception("ERROR: Repo has moved! Marked repo as pending and stopped collection") + collectionRecord.ml_status = CollectionState.IGNORE.value + collectionRecord.ml_task_id = None + collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + + augur_db.session.commit() + raise Exception("ERROR: Repo has moved! Resetting Collection!") + + + if attempts >= 10: + logger.error(f"Could not check if repo moved because the api timed out 10 times. 
Url: {url}") + raise Exception(f"ERROR: Could not get api response for repo: {url}") + #skip if not 404 + logger.info(f"Repo found at url: {url}") + return + diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index 54996c42cc..640079d852 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -1,6 +1,7 @@ import time import logging import traceback +import sqlalchemy as s from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask @@ -180,6 +181,7 @@ def process_events(events, task_name, repo_id, logger, augur_db): issue_event_natural_keys = ["issue_id", "issue_event_src_id"] augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + update_issue_closed_cntrbs_from_events(augur_db.engine, repo_id) # TODO: Should we skip an event if there is no contributor to resolve it o def process_github_event_contributors(logger, event, tool_source, tool_version, data_source): @@ -194,3 +196,32 @@ def process_github_event_contributors(logger, event, tool_source, tool_version, return event, None return event, event_cntrb + + +def update_issue_closed_cntrbs_from_events(engine, repo_id): + + get_ranked_issues = s.text(f""" + WITH RankedIssues AS ( + SELECT repo_id, issue_id, cntrb_id, + ROW_NUMBER() OVER(PARTITION BY issue_id ORDER BY created_at DESC) AS rn + FROM issue_events + WHERE "action" = 'closed' + ) + + SELECT issue_id, cntrb_id from RankedIssues where rn=1 and repo_id={repo_id} and cntrb_id is not NULL + """) + + with engine.connect() as conn: + result = conn.execute(get_ranked_issues).fetchall() + + update_data = [{'issue_id': row[0], 'cntrb_id': row[1], 'repo_id': repo_id} for row in result] + with engine.connect() as connection: + update_stmt = s.text(""" + UPDATE issues + SET cntrb_id = :cntrb_id + WHERE issue_id = :issue_id + AND repo_id = :repo_id + """) + connection.execute(update_stmt, update_data) + + diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 04ccc16e12..26d1027538 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -3,6 +3,7 @@ from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api, retrieve_dict_from_endpoint from augur.tasks.github.util.github_task_session import GithubTaskSession, GithubTaskManifest from augur.tasks.github.util.util import get_owner_repo @@ -198,7 +199,7 @@ def link_commits_to_contributor(session,contributorQueue): # Update the contributors table from the data facade has gathered. -@celery.task +@celery.task(base=AugurFacadeRepoCollectionTask) def insert_facade_contributors(repo_id): from augur.tasks.init.celery_app import engine @@ -251,8 +252,8 @@ def insert_facade_contributors(repo_id): """).bindparams(repo_id=repo_id) #Execute statement with session. 
- result = manifest.augur_db.execute_sql(new_contrib_sql).fetchall() - new_contribs = [dict(zip(row.keys(), row)) for row in result] + result = manifest.augur_db.execute_sql(new_contrib_sql) + new_contribs = [dict(row) for row in result.mappings()] #print(new_contribs) @@ -302,8 +303,8 @@ def insert_facade_contributors(repo_id): #existing_cntrb_emails = json.loads(pd.read_sql(resolve_email_to_cntrb_id_sql, self.db, params={ # 'repo_id': repo_id}).to_json(orient="records")) - result = session.execute_sql(resolve_email_to_cntrb_id_sql).fetchall() - existing_cntrb_emails = [dict(zip(row.keys(), row)) for row in result] + result = session.execute_sql(resolve_email_to_cntrb_id_sql) + existing_cntrb_emails = [dict(row) for row in result.mappings()] print(existing_cntrb_emails) link_commits_to_contributor(session,list(existing_cntrb_emails)) diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 81fa3a341a..0ba793470e 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -5,6 +5,7 @@ from sqlalchemy.exc import IntegrityError +from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask @@ -29,16 +30,29 @@ def collect_issues(repo_git : str) -> int: augur_db = manifest.augur_db + logger.info(f'this is the manifest.key_auth value: {str(manifest.key_auth)}') + try: query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) repo_obj = execute_session_query(query, 'one') repo_id = repo_obj.repo_id + #try this + # the_key = manifest.key_auth + # try: + # randomon = GithubApiKeyHandler(augur_db.session) + # the_key = randomon.get_random_key() + # logger.info(f'The Random Key {the_key}') + # except Exception as e: + # logger.info(f'error: {e}') + # the_key = manifest.key_auth + # pass + owner, repo = get_owner_repo(repo_git) issue_data = retrieve_all_issue_data(repo_git, logger, manifest.key_auth) - + #issue_data = retrieve_all_issue_data(repo_git, logger, the_key) if issue_data: total_issues = len(issue_data) @@ -181,7 +195,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], "issue_id", issue_id) - logger.info(f"{task_name}: Inserting other issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + logger.info(f"{task_name}: Inserting other github issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") # inserting issue labels # we are using label_src_id and issue_id to determine if the label is already in the database. 
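Note on the result-handling changes above: insert_facade_contributors now iterates result.mappings() instead of rebuilding dicts with dict(zip(row.keys(), row)), and the pull request files model below gets the same treatment; on SQLAlchemy 1.4+ each RowMapping behaves like a read-only dict keyed by column name. A small sketch of the pattern, assuming an engine is available; the connection URL and query are illustrative placeholders, not values taken from this patch:

import sqlalchemy as s

# Illustrative connection string only.
engine = s.create_engine("postgresql+psycopg2://augur:password@localhost:5432/augur")

query = s.sql.text("SELECT cntrb_id, cntrb_email FROM augur_data.contributors LIMIT 5")

with engine.connect() as connection:
    result = connection.execute(query)
    # Each RowMapping converts cleanly to a plain dict keyed by column name.
    rows = [dict(row) for row in result.mappings()]

print(rows)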
@@ -207,7 +221,7 @@ def process_issue_contributors(issue, tool_source, tool_version, data_source): for assignee in issue["assignees"]: - issue_assignee_cntrb = extract_needed_contributor_data(issue["user"], tool_source, tool_version, data_source) + issue_assignee_cntrb = extract_needed_contributor_data(assignee, tool_source, tool_version, data_source) assignee["cntrb_id"] = issue_assignee_cntrb["cntrb_id"] contributors.append(issue_assignee_cntrb) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 6e23434bae..4dfd3a634b 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -187,7 +187,8 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) - + if message_return_data is None: + return pr_message_ref_dicts = [] issue_message_ref_dicts = [] diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index e7ebcd9457..81b4c4397a 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -20,8 +20,8 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) - result = augur_db.execute_sql(pr_number_sql).fetchall() - pr_numbers = [dict(zip(row.keys(), row)) for row in result] + result = augur_db.execute_sql(pr_number_sql)#.fetchall() + pr_numbers = [dict(row) for row in result.mappings()] query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) repo = execute_session_query(query, 'one') diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index a5ba6db7c4..8db394754c 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -74,9 +74,18 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: return all_data - -def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): +def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): + """ + Parse and insert all retrieved PR data. 
+ + Arguments: + pull_requests: List of paginated pr endpoint data + task_name: Name of the calling task and the repo + repo_id: augur id of the repository + logger: logging object + augur_db: sqlalchemy db object + """ tool_source = "Pr Task" tool_version = "2.0" data_source = "Github API" @@ -333,7 +342,7 @@ def collect_pull_request_reviews(repo_git: str) -> None: pr_count = len(prs) - all_raw_pr_reviews = [] + all_pr_reviews = {} for index, pr in enumerate(prs): pr_number = pr.pr_src_number @@ -343,9 +352,9 @@ def collect_pull_request_reviews(repo_git: str) -> None: pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" - pr_reviews = GithubPaginator(pr_review_url, manifest.key_auth, logger) - - for page_data, page in pr_reviews.iter_pages(): + pr_reviews = [] + pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) + for page_data, page in pr_reviews_generator.iter_pages(): if page_data is None: break @@ -353,30 +362,36 @@ def collect_pull_request_reviews(repo_git: str) -> None: if len(page_data) == 0: break - all_raw_pr_reviews.extend(page_data) + pr_reviews.extend(page_data) + + if pr_reviews: + all_pr_reviews[pull_request_id] = pr_reviews - if not all_raw_pr_reviews: + if not list(all_pr_reviews.keys()): logger.info(f"{owner}/{repo} No pr reviews for repo") return contributors = [] - for raw_pr_review in all_raw_pr_reviews: - contributor = process_pull_request_review_contributor(raw_pr_review, tool_source, tool_version, data_source) - if contributor: - contributors.append(contributor) + for pull_request_id in all_pr_reviews.keys(): + + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) + if contributor: + contributors.append(contributor) logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) pr_reviews = [] - for raw_pr_review in all_raw_pr_reviews: - - logger.info(f"Pr review type: {type(raw_pr_review)}") - logger.info(raw_pr_review) + for pull_request_id in all_pr_reviews.keys(): - if "cntrb_id" in raw_pr_review: - pr_reviews.append(extract_needed_pr_review_data(raw_pr_review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) + reviews = all_pr_reviews[pull_request_id] + for review in reviews: + + if "cntrb_id" in review: + pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") pr_review_natural_keys = ["pr_review_src_id",] @@ -395,7 +410,3 @@ def collect_pull_request_reviews(repo_git: str) -> None: - - - - diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index f3050fc1b3..5957d4cb57 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -84,7 +84,8 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): release_inf = get_release_inf(repo_id, release, tag_only) #Do an upsert - augur_db.insert_data(release_inf,Release,['release_id']) + string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] + augur_db.insert_data(release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") diff --git a/augur/tasks/github/repo_info/core.py 
b/augur/tasks/github/repo_info/core.py index 50fa88068e..50142f614e 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -150,10 +150,10 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): pr_merged: pullRequests(states: MERGED) { totalCount } - ref(qualifiedName: "master") { + defaultBranchRef { target { ... on Commit { - history(first: 0){ + history { totalCount } } @@ -248,7 +248,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): 'security_audit_file': None, 'status': None, 'keywords': None, - 'commit_count': data['ref']['target']['history']['totalCount'] if data['ref'] else None, + 'commit_count': data['defaultBranchRef']['target']['history']['totalCount'] if data['defaultBranchRef'] else None, 'issues_count': data['issue_count']['totalCount'] if data['issue_count'] else None, 'issues_closed': data['issues_closed']['totalCount'] if data['issues_closed'] else None, 'pull_request_count': data['pr_count']['totalCount'] if data['pr_count'] else None, @@ -256,7 +256,7 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): 'pull_requests_closed': data['pr_closed']['totalCount'] if data['pr_closed'] else None, 'pull_requests_merged': data['pr_merged']['totalCount'] if data['pr_merged'] else None, 'tool_source': 'Repo_info Model', - 'tool_version': '0.42', + 'tool_version': '0.50.0', 'data_source': "Github" } diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index 2406ecef00..20ce07f066 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -7,6 +7,7 @@ from augur.tasks.util.redis_list import RedisList from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig +from sqlalchemy import func class NoValidKeysError(Exception): @@ -31,7 +32,7 @@ def __init__(self, session: DatabaseSession): self.logger = session.logger self.config = AugurConfig(self.logger, session) - self.oauth_redis_key = "oauth_keys_list" + self.oauth_redis_key = "github_oauth_keys_list" self.redis_key_list = RedisList(self.oauth_redis_key) @@ -39,7 +40,7 @@ def __init__(self, session: DatabaseSession): self.keys = self.get_api_keys() - # self.logger.debug(f"Retrieved {len(self.keys)} github api keys for use") + self.logger.info(f"Retrieved {len(self.keys)} github api keys for use") def get_random_key(self): """Retrieves a random key from the list of keys @@ -71,9 +72,12 @@ def get_api_keys_from_database(self) -> List[str]: from augur.application.db.models import WorkerOauth select = WorkerOauth.access_token + # randomizing the order at db time + #select.order_by(func.random()) where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'github'] - return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] + #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] def get_api_keys(self) -> List[str]: @@ -130,6 +134,18 @@ def get_api_keys(self) -> List[str]: if not valid_keys: raise NoValidKeysError("No valid github api keys found in the config or worker oauth table") + + # shuffling the keys so not all processes get the same keys in the same order + valid_now = valid_keys + #try: + #self.logger.info(f'valid keys before shuffle: {valid_keys}') + #valid_keys = random.sample(valid_keys, 
len(valid_keys)) + #self.logger.info(f'valid keys AFTER shuffle: {valid_keys}') + #except Exception as e: + # self.logger.debug(f'{e}') + # valid_keys = valid_now + # pass + return valid_keys def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 548d25b0f9..31c14565df 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -154,6 +154,8 @@ class GithubApiResult(Enum): SECONDARY_RATE_LIMIT = 4 RATE_LIMIT_EXCEEDED = 5 ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected BAD_CREDENTIALS = 7 HTML = 8 EMPTY_STRING = 9 diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index 158d578a7c..926ac04216 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -3,6 +3,7 @@ from augur.tasks.util.random_key_auth import RandomKeyAuth from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.application.db.session import DatabaseSession +import random class GithubRandomKeyAuth(RandomKeyAuth): @@ -16,6 +17,7 @@ def __init__(self, session: DatabaseSession, logger): # gets the github api keys from the database via the GithubApiKeyHandler github_api_keys = GithubApiKeyHandler(session).keys + #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) if not github_api_keys: print("Failed to find github api keys. This is usually because your key has expired") diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index fbb23dd6e8..42989dcca3 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -54,10 +54,21 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic try: return response.json() except json.decoder.JSONDecodeError as e: - logger.warning(f"invalid return from GitHub. Response was: {response.text}. Exception: {e}") + logger.warning(f"invalid return. Response was: {response.text}. Exception: {e}") return json.loads(json.dumps(response.text)) def get_repo_weight_by_issue(logger,repo_git): + """ + Retrieve the sum of the number of issues and prs in a repository from a graphql query. 
+
+    Arguments:
+        logger: logger object
+        repo_git: repository url
+
+    Returns:
+        Sum of issues and prs for that repo
+    """
+
     from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql
     owner,name = get_owner_repo(repo_git)
diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py
new file mode 100644
index 0000000000..8058831ba3
--- /dev/null
+++ b/augur/tasks/gitlab/events_task.py
@@ -0,0 +1,209 @@
+"""
+Module to define the task methods to collect gitlab event data for augur
+"""
+import logging
+
+from augur.tasks.init.celery_app import celery_app as celery
+from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
+from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
+from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest
+from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data
+from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts
+from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent
+from augur.application.db.util import execute_session_query
+
+platform_id = 2
+
+@celery.task(base=AugurCoreRepoCollectionTask)
+def collect_gitlab_issue_events(repo_git) -> int:
+    """
+    Retrieve and parse gitlab events for the desired repo
+
+    Arguments:
+        repo_git: the repo url string
+    """
+
+    owner, repo = get_owner_repo(repo_git)
+
+    logger = logging.getLogger(collect_gitlab_issue_events.__name__)
+    with GitlabTaskManifest(logger) as manifest:
+
+        augur_db = manifest.augur_db
+
+        query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git)
+        repo_obj = execute_session_query(query, 'one')
+        repo_id = repo_obj.repo_id
+
+        events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth)
+
+        if events:
+            logger.info(f"Length of gitlab issue events: {len(events)}")
+            process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db)
+        else:
+            logger.info(f"{owner}/{repo} has no gitlab issue events")
+
+
+@celery.task(base=AugurCoreRepoCollectionTask)
+def collect_gitlab_merge_request_events(repo_git) -> int:
+    """
+    Retrieve and parse gitlab merge request events for the desired repo
+
+    Arguments:
+        repo_git: the repo url string
+    """
+
+
+    owner, repo = get_owner_repo(repo_git)
+
+    logger = logging.getLogger(collect_gitlab_merge_request_events.__name__)
+    with GitlabTaskManifest(logger) as manifest:
+
+        augur_db = manifest.augur_db
+
+        query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git)
+        repo_obj = execute_session_query(query, 'one')
+        repo_id = repo_obj.repo_id
+
+        events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth)
+
+        if events:
+            logger.info(f"Length of gitlab merge request events: {len(events)}")
+            process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db)
+        else:
+            logger.info(f"{owner}/{repo} has no gitlab merge request events")
+
+
+def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None:
+    """
+    Retrieve all gitlab event data of the given type for the desired repo
+
+    Arguments:
+        gtype: type of event data
+        repo_git: url of the relevant repo
+        logger: logging object
+        key_auth: key auth cache and rotator object
+    """
+
+    owner, repo = get_owner_repo(repo_git)
+
+    logger.info(f"Collecting gitlab {gtype} events for {owner}/{repo}")
+
+    url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}"
+
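For reference, the endpoint built above addresses the project by its URL-encoded "owner/repo" path and passes the event type as target_type. A stand-alone request against the same endpoint would look roughly like the sketch below (the token is a placeholder; the patch itself routes the call through GitlabApiHandler so that key rotation and pagination are handled for it).

import httpx
from urllib.parse import quote

def fetch_first_event_page(owner: str, repo: str, token: str, gtype: str = "issue") -> list:
    """Fetch one page of project events of the given target type from gitlab.com."""
    project = quote(f"{owner}/{repo}", safe="")  # "owner/repo" -> "owner%2Frepo"
    url = f"https://gitlab.com/api/v4/projects/{project}/events"
    resp = httpx.get(
        url,
        params={"target_type": gtype, "per_page": 100},
        headers={"Authorization": f"Bearer {token}"},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()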
events = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = events.get_num_pages(url) + for page_data, page in events.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab {gtype} Events Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: {gtype} Events Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab {gtype} Events Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issue_events(events, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + events: List of dictionaries of issue event data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab issue events task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_event_dicts = [] + + # create mapping from issue number to issue id of current issues + issue_url_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id + + for event in events: + + issue_number = event["target_iid"] + + try: + issue_id = issue_url_to_id_map[issue_number] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for an issue with number {issue_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + issue_event_dicts.append( + extract_gitlab_issue_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") + issue_event_natural_keys = ["issue_id", "issue_event_src_id"] + augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + + +def process_mr_events(events, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr events from the api response + + Arguments: + labels: List of dictionaries of label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + tool_source = "Gitlab mr events task" + tool_version = "2.0" + data_source = "Gitlab API" + + mr_event_dicts = [] + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + for event in events: + + mr_number = event["target_iid"] + + try: + issue_id = mr_number_to_id_map[mr_number] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for an mr with number {mr_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + mr_event_dicts.append( + extract_gitlab_mr_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + # TODO: Add unique key for this + logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") + mr_event_natural_keys = ["pull_request_id", 
"issue_event_src_id"] + augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + + diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/augur/tasks/gitlab/gitlab_api_handler.py new file mode 100644 index 0000000000..5303d606e9 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -0,0 +1,386 @@ +""" +Defines a GitlabApiHandler class to paginate and handle interaction with GitLab's +api through automatic use of relevant key auth and pagination tools. +""" +import httpx +import time +import logging + +from typing import List, Optional, Union, Generator, Tuple +from urllib.parse import urlencode, urlparse, parse_qs, urlunparse +from enum import Enum + +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.tasks.github.util.util import parse_json_response + +class GitlabApiResult(Enum): + """All the different results of querying the Gitlab API.""" + + SUCCESS = 0 + TIMEOUT = 1 + NO_MORE_ATTEMPTS = 2 + NOT_FOUND = 3 + SECONDARY_RATE_LIMIT = 4 + RATE_LIMIT_EXCEEDED = 5 + ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected + BAD_CREDENTIALS = 7 + +class GitlabApiHandler(): + """This class is a sequence that handles retrieving data from the Gitlab API. + + Attributes: + url (str): The url that we are collecting data + key_mangager (GitlabRandomKeyAuth): Custom httpx auth class + that randomizes the github api key a request gets. + This is how the requests are getting their api keys + logger (logging.Logger): Logger that handler printing information to files and stdout + """ + + def __init__(self, key_manager: GitlabRandomKeyAuth, logger: logging.Logger): + """Initialize the class GitlabPaginator. + + Args: + url: url that the data is being collected + key_manager: class that randomly selects a Gitlab API key for each request + logger: handles logging + from_datetime: collects data after this datatime (not yet implemented) + to_datetime: collects data before this datatime (not yet implemented) + """ + self.key_manager = key_manager + self.logger = logger + + def get_length(self, url): + """Get the length of the Gitlab API data. + + Returns: + The length of the Gitlab API data at the url. + + Examples: + This function is called when len() is called on the GitlabPaginator class for example. + + issues = GitlabPaginator(url, session.oauths, logger) + issue_len = len(issues) + """ + + num_pages = self.get_num_pages(url) + + self.logger.info(f"Num pages: {num_pages}") + + params = {"page": num_pages} + url = add_query_params(url, params) + + # get the amount of data on last page + data, _, result = self.retrieve_data(url) + + if result == GitlabApiResult.SUCCESS: + return (100 * (num_pages -1)) + len(data) + + self.logger.debug("Unable to retrieve data length from api") + return 0 + + def iter(self, url) -> Generator[Optional[dict], None, None]: + """Provide data from Gitlab API via a generator that yields one dict at a time. 
+ + Yields: + A piece of data from the github api as the specified url + """ + + url = self._set_paginaton_query_params(url) + + data_list, response, result = self.retrieve_data(url) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + yield None + return + + # yield the first page data + for data in data_list: + yield data + + while 'next' in response.links.keys(): + next_page = response.links['next']['url'] + + # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values + data_list, response, result = self.retrieve_data(next_page) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + return + + for data in data_list: + yield data + + def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, None]: + """Provide data from Gitlab API via a generator that yields a page of dicts at a time. + + Returns: + A page of data from the Gitlab API at the specified url + """ + + url = self._set_paginaton_query_params(url) + + # retrieves the data for the given url + data_list, response, result = self.retrieve_data(url) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + yield None, None + return + + # this retrieves the page for the given url + page_number = get_url_page_number(url) + + # yields the first page of data and its page number + yield data_list, page_number + + while 'next' in response.links.keys(): + + # gets the next page from the last responses header + next_page = response.links['next']['url'] + + # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values + data_list, response, result = self.retrieve_data(next_page) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug(f"Failed to retrieve the data for even though 10 attempts were given. Url: {next_page}") + return + + page_number = get_url_page_number(next_page) + + # if either the data or response is None then yield None and return + if data_list is None or response is None: + return + + # yield the data from the page and its number + yield data_list, page_number + + def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx.Response]]: + """Attempt to retrieve data at given url. + + Args: + url: The url to retrieve the data from + + Returns + The response object from hitting the url and the data on the page + """ + + timeout = 30 + timeout_count = 0 + num_attempts = 1 + while num_attempts <= 10: + + response = hit_api(self.key_manager, url, self.logger, timeout) + + num_attempts += 1 + + if response is None: + if timeout_count == 10: + self.logger.error(f"Request timed out 10 times for {url}") + return None, None, GitlabApiResult.TIMEOUT + + timeout = timeout * 1.1 + num_attempts += 1 + continue + + if response.status_code == 500: + self.logger.error(f"Gitlab returned {response.status_code} error when fetching {url}. 
Message: {response.json()}") + continue + + if response.status_code == 429: + + current_epoch = int(time.time()) + epoch_when_key_resets = int(response.headers["ratelimit-reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nGitlab API rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + continue + + if response.status_code == 404: + self.logger.info(f"ERROR: 404 not found for {url}") + return [], response, GitlabApiResult.NOT_FOUND + + if response.status_code == 204: + return [], response, GitlabApiResult.SUCCESS + + if response.status_code >= 200 and response.status_code <=299: + + page_data = parse_json_response(self.logger, response) + return page_data, response, GitlabApiResult.SUCCESS + + self.logger.warning(f"Unhandled gitlab response. Status code: {response.status_code}. Body: {response.json()}") + + + + self.logger.error("Unable to collect data in 10 attempts") + return None, None, GitlabApiResult.NO_MORE_ATTEMPTS + + def get_num_pages(self, url) -> Optional[int]: + """Get the number of pages of data that a url can paginate through. + + Returns: + The number of pages a url can access + """ + + url = self._set_paginaton_query_params(url) + + timeout: float = 5 + num_attempts = 0 + while num_attempts < 10: + r = self.hit_api(url=url, timeout=timeout, method="HEAD") + + if r: + break + + timeout = timeout * 1.2 + else: + raise RuntimeError("Unable to get the number of pages of data in 10 attempts") + + if 'last' not in r.links.keys(): + return 1 + + # get the last url from header + last_page_url = r.links['last']['url'] + + parsed_url = urlparse(last_page_url) + try: + num_pages = int(parse_qs(parsed_url.query)['page'][0]) + except (KeyError, ValueError): + return None + + return num_pages + + def hit_api(self, url, timeout, method): + """Attempt to retrieve data at given url. + + Args: + url: The url to retrieve the data from + timeout: time to wait until timeout + method: GET, POST, etc. + + Returns + The response object from hitting the url and the data on the page + """ + + return hit_api(self.key_manager, url, self.logger, timeout, method=method) + + def _set_paginaton_query_params(self, url): + + remove_fields = ["per_page", "page"] + url = clean_url(url, remove_fields) + + # we need to add query params directly to the url, instead of passing the param to the httpx.Client.request + # this is because github will only append specified params to the links in the headers if they are a part + # of the url, and not the params with the request + params = {"per_page": 100} + url = add_query_params(url, params) + + return url + +################################################################################ + +# Url Helper Method to remove query parameters from the url +def clean_url(url: str, keys: List[str]) -> str: + """Remove query params from url. 
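For a concrete sense of the transformation this helper (and add_query_params below) performs, the standard-library round trip looks like this; the URL is a made-up example.

from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

url = "https://gitlab.com/api/v4/projects/123/issues?page=7&per_page=20&state=opened"

# drop the pagination params, keep everything else
parts = urlparse(url)
query = parse_qs(parts.query, keep_blank_values=True)
for key in ("page", "per_page"):
    query.pop(key, None)
cleaned = urlunparse(parts._replace(query=urlencode(query, doseq=True)))
print(cleaned)  # https://gitlab.com/api/v4/projects/123/issues?state=opened

# add a fixed page size back, the way _set_paginaton_query_params does
query["per_page"] = 100
resized = urlunparse(parts._replace(query=urlencode(query, doseq=True)))
print(resized)  # https://gitlab.com/api/v4/projects/123/issues?state=opened&per_page=100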
+ + Args: + url: the url that is being modified + keys: the query params that are being removed + + Returns: + A url with the params in keys removed + """ + u = urlparse(url) + query = parse_qs(u.query, keep_blank_values=True) + + for key in keys: + query.pop(key, None) + + u = u._replace(query=urlencode(query, True)) + + return urlunparse(u) + + +def add_query_params(url: str, additional_params: dict) -> str: + """Add query params to a url. + + Args: + url: the url that is being modified + additional_params: key value pairs specifying the parameters to be added + + Returns: + The url with the key value pairs in additional_params added as query params + """ + url_components = urlparse(url) + original_params = parse_qs(url_components.query) + # Before Python 3.5 you could update original_params with + # additional_params, but here all the variables are immutable. + merged_params = {**original_params, **additional_params} + updated_query = urlencode(merged_params, doseq=True) + # _replace() is how you can create a new NamedTuple with a changed field + return url_components._replace(query=updated_query).geturl() + + +def get_url_page_number(url: str) -> int: + """Parse the page number from the url. + + Note: + If the url does not contain a page number the function returns 1 + + Args: + url: url to get the page number from + + Returns: + The page number that the url contains + """ + parsed_url = urlparse(url) + try: + # if page is not a url query param then this is page 1 + page_number = int(parse_qs(parsed_url.query)['page'][0]) + + except KeyError: + return 1 + + return page_number + +################################################################################ + +def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', ) -> Optional[httpx.Response]: + """Ping the api and get the data back for the page. + + Returns: + A httpx response that contains the data. None if a timeout occurs + """ + # self.logger.info(f"Hitting endpoint with {method} request: {url}...\n") + + with httpx.Client() as client: + + try: + response = client.request( + method=method, url=url, auth=key_manager, timeout=timeout, follow_redirects=True) + + except TimeoutError: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.TimeoutException: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.NetworkError: + logger.info(f"Network Error. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.ProtocolError: + logger.info(f"Protocol Error. Sleeping {round(timeout*1.5)} seconds and trying again...\n") + time.sleep(round(timeout*1.5)) + return None + + return response diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py new file mode 100644 index 0000000000..20bc1219ca --- /dev/null +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -0,0 +1,176 @@ +""" +Defines the handler logic needed to effectively fetch GitLab auth keys +from either the redis cache or the database. Follows the same patterns as +the github api key handler. 
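The fetch path this module describes is a read-through cache: serve keys from the Redis-backed list when it is populated, otherwise load them from the database, validate them, and write the survivors back. Schematically it behaves like the sketch below, where load_keys_from_db and is_valid stand in for the handler's real database query and credential check.

from typing import Callable, List

def get_keys(cache: list,
             load_keys_from_db: Callable[[], List[str]],
             is_valid: Callable[[str], bool]) -> List[str]:
    """Read-through cache: use cached keys if present, else load, validate and cache."""
    cached = list(cache)
    if cached:
        return cached

    candidates = load_keys_from_db()
    valid = [key for key in candidates if is_valid(key)]

    # mirror the RedisList usage in the handler: clear stale entries, then store the fresh set
    cache.clear()
    cache.extend(valid)

    if not valid:
        raise RuntimeError("no valid keys found")
    return valid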
+""" +import httpx +import time +import random + +from typing import Optional, List + +from augur.tasks.util.redis_list import RedisList +from augur.application.db.session import DatabaseSession +from augur.application.config import AugurConfig +from sqlalchemy import func + + +class NoValidKeysError(Exception): + """Defines an exception that is thrown when no gitlab keys are valid""" + + +class GitlabApiKeyHandler(): + """Handles Gitlab API key retrieval from the database and redis + + Attributes: + session (DatabaseSession): Database connection + logger (logging.Logger): Handles all logs + oauth_redis_key (str): The key where the gitlab api keys are cached in redis + redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache + config_key (str): The api key that is stored in the users config table + key: (List[str]): List of keys retrieve from database or cache + """ + + def __init__(self, session: DatabaseSession): + + self.session = session + self.logger = session.logger + self.config = AugurConfig(self.logger, session) + + self.oauth_redis_key = "gitlab_oauth_keys_list" + + self.redis_key_list = RedisList(self.oauth_redis_key) + + self.config_key = self.get_config_key() + + self.keys = self.get_api_keys() + + self.logger.info(f"Retrieved {len(self.keys)} gitlab api keys for use") + + def get_random_key(self): + """Retrieves a random key from the list of keys + + Returns: + A random gitlab api key + """ + + return random.choice(self.keys) + + def get_config_key(self) -> str: + """Retrieves the users gitlab api key from their config table + + Returns: + Github API key from config table + """ + return self.config.get_value("Keys", "gitlab_api_key") + + def get_api_keys_from_database(self) -> List[str]: + """Retieves all gitlab api keys from database + + Note: + It retrieves all the keys from the database except the one defined in the users config + + Returns: + Github api keys that are in the database + """ + from augur.application.db.models import WorkerOauth + + select = WorkerOauth.access_token + # randomizing the order at db time + #select.order_by(func.random()) + where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'gitlab'] + + return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] + #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + + + def get_api_keys(self) -> List[str]: + """Retrieves all valid Github API Keys + + Note: + It checks to see if the keys are in the redis cache first. + It removes bad keys before returning. + If keys were taken from the database, it caches all the valid keys that were found + + Returns: + Valid Github api keys + """ + + redis_keys = list(self.redis_key_list) + + if redis_keys: + return redis_keys + + attempts = 0 + while attempts < 3: + + try: + keys = self.get_api_keys_from_database() + break + except Exception as e: + self.logger.error(f"Ran into issue when fetching key from database:\n {e}\n") + self.logger.error("Sleeping for 5 seconds...") + time.sleep(5) + attempts += 1 + + if self.config_key is not None: + keys += [self.config_key] + + if len(keys) == 0: + return [] + + valid_keys = [] + with httpx.Client() as client: + + for key in keys: + + # removes key if it returns "Bad Credentials" + if self.is_bad_api_key(client, key) is False: + valid_keys.append(key) + else: + print(f"WARNING: The key '{key}' is not a valid key. 
Hint: If valid in past it may have expired") + + # just in case the mulitprocessing adds extra values to the list. + # we are clearing it before we push the values we got + self.redis_key_list.clear() + + # add all the keys to redis + self.redis_key_list.extend(valid_keys) + + if not valid_keys: + raise NoValidKeysError("No valid gitlab api keys found in the config or worker oauth table") + + + # shuffling the keys so not all processes get the same keys in the same order + #valid_now = valid_keys + #try: + #self.logger.info(f'valid keys before shuffle: {valid_keys}') + #valid_keys = random.sample(valid_keys, len(valid_keys)) + #self.logger.info(f'valid keys AFTER shuffle: {valid_keys}') + #except Exception as e: + # self.logger.debug(f'{e}') + # valid_keys = valid_now + # pass + + return valid_keys + + def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: + """Determines if a Gitlab API key is bad + + Args: + client: makes the http requests + oauth_key: gitlab api key that is being tested + + Returns: + True if key is bad. False if the key is good + """ + + url = "https://gitlab.com/api/v4/user" + + headers = {'Authorization': f'Bearer {oauth_key}'} + + response = client.request(method="GET", url=url, headers=headers, timeout=180) + if response.status_code == 401: + return True + + return False \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py new file mode 100644 index 0000000000..64ba31dd19 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -0,0 +1,26 @@ +"""Defines the GitlabRandomKeyAuth class""" + +from augur.tasks.util.random_key_auth import RandomKeyAuth +from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler +from augur.application.db.session import DatabaseSession + + +class GitlabRandomKeyAuth(RandomKeyAuth): + """Defines a gitlab specific RandomKeyAuth class so + gitlab collections can have a class randomly selects an api key for each request + """ + + def __init__(self, session: DatabaseSession, logger): + """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # gets the gitlab api keys from the database via the GitlabApiKeyHandler + gitlab_api_keys = GitlabApiKeyHandler(session).keys + + if not gitlab_api_keys: + print("Failed to find github api keys. This is usually because your key has expired") + + header_name = "Authorization" + key_format = "Bearer {0}" + + super().__init__(gitlab_api_keys, header_name, session.logger, key_format) \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py new file mode 100644 index 0000000000..58a6e64373 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -0,0 +1,55 @@ +""" +Defines a GitLab-specific session and manifest object for use in GitLab tasks +""" +from logging import Logger + +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.application.db.session import DatabaseSession + +class GitlabTaskManifest: + """ + Manifest object that represents the state and common elements of + the specified task. GitLab version for the GitLab tasks. 
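Every gitlab task in this patch consumes the manifest the same way; condensed, that shared pattern looks like the sketch below (example_task and the repo URL are hypothetical, the import path is the one this file provides).

import logging

from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest

def example_task(repo_git: str = "https://gitlab.com/example/project"):
    logger = logging.getLogger("example_task")
    with GitlabTaskManifest(logger) as manifest:   # __enter__ returns the manifest
        augur_db = manifest.augur_db               # shared DatabaseSession for the task
        key_auth = manifest.key_auth               # GitlabRandomKeyAuth used for API calls
        logger.info("working on %s (platform %s)", repo_git, manifest.platform_id)
        # augur_db and key_auth would be handed to the retrieval/processing helpers here
    # leaving the block runs __exit__, which closes the database session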
+ + Attributes: + augur_db: sqlalchemy db object + key_auth: GitLab specific key auth retrieval collection + logger: logging object + platform_id: GitLab specific platform id (github is 1) + """ + + def __init__(self, logger): + + from augur.tasks.init.celery_app import engine + + self.augur_db = DatabaseSession(logger, engine) + self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) + self.logger = logger + self.platform_id = 2 + + def __enter__(self): + + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + + self.augur_db.close() + +class GitlabTaskSession(DatabaseSession): + """ORM session used in gitlab tasks. + This class adds the platform_id and the gitlab key authentication class, + to the already existing DatabaseSession so there is a central location to access + api keys and a single platform_id reference + + Attributes: + oauths (GitlabRandomKeyAuth): Class that handles randomly assigning gitlab api keys to httpx requests + platform_id (int): The id that refers to the Gitlab platform + """ + + def __init__(self, logger: Logger, engine=None): + + super().__init__(logger, engine=engine) + + self.oauths = GitlabRandomKeyAuth(self, logger) + self.platform_id = 2 + diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py new file mode 100644 index 0000000000..cf6e5e5dab --- /dev/null +++ b/augur/tasks/gitlab/issues_task.py @@ -0,0 +1,320 @@ +""" +Defines the set of tasks used to retrieve GitLab issue data. +""" +import logging +import traceback + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issues(repo_git : str) -> int: + """ + Retrieve and parse gitlab issues for the desired repo + + Arguments: + repo_git: the repo url string + """ + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + try: + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + + issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) + + if issue_data: + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + + return issue_ids + else: + logger.info(f"{owner}/{repo} has no issues") + return [] + except Exception as e: + logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 + + + +def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: + """ + Retrieve only the needed data for issues from the api 
response + + Arguments: + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issues for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" + issues = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = issues.get_num_pages(url) + for page_data, page in issues.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab Issues Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab Issues Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: + """ + Retrieve only the needed data for issues from the api response + + Arguments: + issues: List of dictionaries of issue data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + # get repo_id or have it passed + tool_source = "Gitlab Issue Task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_dicts = [] + issue_ids = [] + issue_mapping_data = {} + for issue in issues: + + issue_ids.append(issue["iid"]) + + issue_dicts.append( + extract_needed_issue_data_from_gitlab_issue(issue, repo_id, tool_source, tool_version, data_source) + ) + + issue_labels = extract_needed_gitlab_issue_label_data(issue["labels"], repo_id, + tool_source, tool_version, data_source) + + issue_assignees = extract_needed_gitlab_issue_assignee_data(issue["assignees"], repo_id, + tool_source, tool_version, data_source) + + mapping_data_key = issue["id"] + issue_mapping_data[mapping_data_key] = { + "labels": issue_labels, + "assignees": issue_assignees, + } + + + if len(issue_dicts) == 0: + print("No gitlab issues found while processing") + return + + logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") + issue_natural_keys = ["repo_id", "gh_issue_id"] + issue_string_columns = ["issue_title", "issue_body"] + issue_return_columns = ["gh_issue_id", "issue_id"] + + issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + + issue_label_dicts = [] + issue_assignee_dicts = [] + for data in issue_return_data: + + gh_issue_id = data["gh_issue_id"] + issue_id = data["issue_id"] + + try: + other_issue_data = issue_mapping_data[gh_issue_id] + except KeyError as e: + logger.info(f"{task_name}: Cold not find other gitlab issue data. This should never happen. Error: {e}") + + + # add the issue id to the lables and assignees, then add them to a list of dicts that will be inserted soon + dict_key = "issue_id" + issue_label_dicts += add_key_value_pair_to_dicts(other_issue_data["labels"], dict_key, issue_id) + issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], dict_key, issue_id) + + + logger.info(f"{task_name}: Inserting other gitlab issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + + # inserting issue labels + # we are using label_src_id and issue_id to determine if the label is already in the database. 
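The natural keys passed to insert_data name the columns that make a row unique, which is how re-collected labels avoid duplicate rows. The effect on the in-memory dicts is equivalent to keying them on those columns, as in this small illustrative sketch (insert_data's real implementation is not part of this diff).

def dedupe_on_natural_keys(rows: list, natural_keys: list) -> list:
    """Keep only the last row seen for each combination of natural-key values."""
    unique = {}
    for row in rows:
        key = tuple(row[k] for k in natural_keys)
        unique[key] = row          # later rows win, mimicking an upsert
    return list(unique.values())

labels = [
    {"label_src_id": 11, "issue_id": 1, "label_text": "bug"},
    {"label_src_id": 11, "issue_id": 1, "label_text": "bug (renamed)"},
    {"label_src_id": 12, "issue_id": 1, "label_text": "docs"},
]
print(dedupe_on_natural_keys(labels, ["label_src_id", "issue_id"]))  # 2 rows remain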
+ issue_label_natural_keys = ['label_src_id', 'issue_id'] + issue_label_string_fields = ["label_text", "label_description"] + augur_db.insert_data(issue_label_dicts, IssueLabel, + issue_label_natural_keys, string_fields=issue_label_string_fields) + + # inserting issue assignees + # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. + # issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] + # augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + + return issue_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + issue_ids: Set of issue ids to collect coments for + repo_git: repo url + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + + if comments: + logger.info(f"Length of comments: {len(comments)}") + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue comments") + + +def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): + """ + Retrieve only the needed data for issue comments + + Arguments: + key_auth: key auth cache and rotator object + logger: loggin object + issue_ids: ids of issues to find comements for + repo_git: repo url + """ + + owner, repo = get_owner_repo(repo_git) + + all_comments = {} + issue_count = len(issue_ids) + index = 1 + + comments = GitlabApiHandler(key_auth, logger) + + for id in issue_ids: + + logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + + for page_data, page in comments.iter_pages(url): + + if page_data is None or len(page_data) == 0: + break + + if id in all_comments: + all_comments[id].extend(page_data) + else: + all_comments[id] = page_data + + index += 1 + + return all_comments + + +def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for issue messages from the api response + + Arguments: + data: List of dictionaries of issue event data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab issue comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + issue_number_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + issue_id = issue_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for issue number {id} in repo {repo_id}") + logger.info(f"{task_name}: 
Skipping") + continue + + for message in messages: + + issue_message_ref_data = extract_needed_gitlab_issue_message_ref_data(message, issue_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": issue_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + issue_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + issue_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") + issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] + augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py new file mode 100644 index 0000000000..ccf3c7e012 --- /dev/null +++ b/augur/tasks/gitlab/merge_request_task.py @@ -0,0 +1,560 @@ +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_merge_requests(repo_git: str) -> int: + """ + Retrieve and parse gitlab MRs for the desired repo + + Arguments: + repo_git: the repo url string + """ + + + logger = logging.getLogger(collect_gitlab_merge_requests.__name__) + + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + repo_id = augur_db.session.query(Repo).filter( + Repo.repo_git == repo_git).one().repo_id + + owner, repo = get_owner_repo(repo_git) + mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) + + if mr_data: + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + + return mr_ids + else: + logger.info(f"{owner}/{repo} has no merge requests") + return [] + + +def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: + """ + 
Retrieve only the needed data for MRs from the api response + + Arguments: + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting pull requests for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" + mrs = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = mrs.get_num_pages(url) + for page_data, page in mrs.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo} Mrs Page {page} contains no data...returning") + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") + + all_data += page_data + + return all_data + + +def process_merge_requests(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: collection of mr data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + + Returns: + List of parsed MR ids. + """ + + tool_source = "Mr Task" + tool_version = "2.0" + data_source = "Gitlab API" + + merge_requests = [] + mr_ids = [] + mr_mapping_data = {} + for mr in data: + + mr_ids.append(mr["iid"]) + + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) + + assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) + + labels = extract_needed_mr_label_data(mr["labels"], repo_id, tool_source, tool_version, data_source) + + mapping_data_key = mr["id"] + mr_mapping_data[mapping_data_key] = { + "assignees": assignees, + "labels": labels + } + + logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") + pr_natural_keys = ["repo_id", "pr_src_id"] + pr_string_fields = ["pr_src_title", "pr_body"] + pr_return_columns = ["pull_request_id", "pr_src_id"] + pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + + + mr_assignee_dicts = [] + mr_label_dicts = [] + for data in pr_return_data: + + mr_src_id = data["pr_src_id"] + pull_request_id = data["pull_request_id"] + + try: + other_mr_data = mr_mapping_data[mr_src_id] + except KeyError as e: + logger.info(f"Cold not find other pr data. This should never happen. 
Error: {e}") + + dict_key = "pull_request_id" + mr_assignee_dicts += add_key_value_pair_to_dicts(other_mr_data["assignees"], dict_key, pull_request_id) + mr_label_dicts += add_key_value_pair_to_dicts(other_mr_data["labels"], dict_key, pull_request_id) + + logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") + + # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data + # mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + # augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + + pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] + pr_label_string_fields = ["pr_src_description"] + augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + + return mr_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_comments(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: ids of MRs to paginate comments for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_comments.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") + + if comments: + logger.info(f"Length of merge request comments: {len(comments)}") + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request comments") + + +def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: List of dictionaries of mr message data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab mr comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + pull_request_id = mr_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for mr number {id} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + for message in messages: + + mr_message_ref_data = extract_needed_gitlab_mr_message_ref_data(message, pull_request_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": mr_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, 
data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + mr_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + mr_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") + mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] + augur_db.insert_data(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_metadata(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: list of mr ids to find metadata for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_metadata.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") + + if metadata_list: + logger.info(f"Length of merge request metadata: {len(metadata_list)}") + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request metadata") + +def process_mr_metadata(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: List of dictionaries of mr metadata + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Metadata Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_metadata = [] + for id, metadata in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_metadata.extend(extract_needed_mr_metadata(metadata, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") + pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] + augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_reviewers(mr_ids, repo_git) -> int: + """ + Retrieve and parse mr reviewers for the desired repo + + 
Arguments: + mr_ids: mrs to search for reviewers for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_reviewers.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") + + if reviewers: + logger.info(f"Length of merge request reviewers: {len(reviewers)}") + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") + +def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr Reviewer data from the api response + + Arguments: + data: List of dictionaries of mr Reviewer data + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Reviewer Task" + tool_version = "2.0" + data_source = "Gitlab API" + + logger.info(f"Running {task_name}...") + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_reviewers = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + reviewers = extract_needed_mr_reviewer_data(values, pull_request_id, tool_source, tool_version, data_source) + + all_reviewers += reviewers + + # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers + # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] + # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_commits(mr_ids, repo_git) -> int: + """ + Retrieve and parse mr commits for the desired repo + + Arguments: + mr_ids: ids of mrs to get commits for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_commits.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") + + if commits: + logger.info(f"Length of merge request commits: {len(commits)}") + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request commits") + + +def process_mr_commits(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr commits from the api response + + Arguments: + data: List of dictionaries of mr commit data + task_name: name of the task as well as the 
repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Commit Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_commits = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + for commit in values: + + all_commits.append(extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + + logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") + pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] + augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_files(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: the ids of mrs to get files for. + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_files.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") + + if files: + logger.info(f"Length of merge request files: {len(files)}") + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request files") + +def process_mr_files(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr files Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_files = [] + for id, gitlab_file_data in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_files.extend(extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") + pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] + augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) + + +def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): + """ + Retrieve specific mr data from the GitLab api. 
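Note how the callers above pre-fill owner and repo but keep the literal "{id}" placeholder by passing id="{id}" to str.format; this helper then substitutes each merge request iid per request. A tiny demonstration of that two-stage templating with placeholder values:

template = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes"

# stage 1: the task fills in the repository but leaves "{id}" in place
url = template.format(owner="example-group", repo="example-repo", id="{id}")

# stage 2: this helper substitutes one merge request iid at a time
for mr_id in (1, 2, 3):
    formatted_url = url.format(id=mr_id)
    print(formatted_url)
# .../merge_requests/1/notes, .../merge_requests/2/notes, .../merge_requests/3/notes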
+ +    Arguments: +        ids: mr ids to paginate info for +        url: endpoint to paginate or hit +        name: name of data to collect +        owner: owner of the repo +        repo: repo name +        key_auth: key auth cache and rotator object +        logger: logging object +        response_type: type of data to get from the api +    """ + +    all_data = {} +    mr_count = len(ids) +    index = 1 + +    api_handler = GitlabApiHandler(key_auth, logger) +    for id in ids: + +        print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {mr_count}") +        formatted_url = url.format(id=id) + +        if response_type == "dict": +            page_data, _, _ = api_handler.retrieve_data(formatted_url) +            if page_data: +                all_data[id] = page_data + +        elif response_type == "list": + +            for page_data, _ in api_handler.iter_pages(formatted_url): + +                if page_data is None or len(page_data) == 0: +                    break + +                if id in all_data: +                    all_data[id].extend(page_data) +                else: +                    all_data[id] = page_data +        else: +            raise Exception(f"Unexpected response type: {response_type}") + +        index += 1 + +    return all_data diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index ac6e18fc64..274305449a 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -20,16 +20,7 @@ from augur.application.db.engine import get_database_string from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from augur.application.db.models import CollectionStatus, Repo - -class CollectionState(Enum): -    SUCCESS = "Success" -    PENDING = "Pending" -    ERROR = "Error" -    COLLECTING = "Collecting" -    INITIALIZING = "Initializing" -    UPDATE = "Update" -    FAILED_CLONE = "Failed Clone" - +from augur.tasks.util.collection_state import CollectionState logger = logging.getLogger(__name__) @@ -50,9 +41,14 @@ class CollectionState(Enum): 'augur.tasks.github.pull_requests.commits_model.tasks', 'augur.tasks.github.traffic.tasks'] +gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', + 'augur.tasks.gitlab.issues_task', + 'augur.tasks.gitlab.events_task'] + git_tasks = ['augur.tasks.git.facade_tasks', 'augur.tasks.git.dependency_tasks.tasks', - 'augur.tasks.git.dependency_libyear_tasks.tasks'] + 'augur.tasks.git.dependency_libyear_tasks.tasks', + 'augur.tasks.git.scc_value_tasks.tasks'] data_analysis_tasks = ['augur.tasks.data_analysis.message_insights.tasks', 'augur.tasks.data_analysis.clustering_worker.tasks', @@ -65,7 +61,7 @@ class CollectionState(Enum): frontend_tasks = ['augur.tasks.frontend'] -tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + frontend_tasks +tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": tasks += data_analysis_tasks @@ -80,7 +76,7 @@ class CollectionState(Enum): #Classes for tasks that take a repo_git as an argument.
class AugurCoreRepoCollectionTask(celery.Task): - def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core'): + def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value): from augur.tasks.init.celery_app import engine logger = AugurLogger(logger_name).get_logger() @@ -99,7 +95,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h prevStatus = getattr(repoStatus, f"{collection_hook}_status") if prevStatus == CollectionState.COLLECTING.value or prevStatus == CollectionState.INITIALIZING.value: - setattr(repoStatus, f"{collection_hook}_status", CollectionState.ERROR.value) + setattr(repoStatus, f"{collection_hook}_status", after_fail) setattr(repoStatus, f"{collection_hook}_task_id", None) session.commit() @@ -124,6 +120,7 @@ def on_failure(self,exc,task_id,args,kwargs,einfo): repo_git = args[0] self.augur_handle_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml') + #task_cls='augur.tasks.init.celery_app:AugurCoreRepoCollectionTask' celery_app = Celery('tasks', broker=BROKER_URL, backend=BACKEND_URL, include=tasks) @@ -139,6 +136,7 @@ def on_failure(self,exc,task_id,args,kwargs,einfo): 'augur.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'}, 'augur.tasks.git.dependency_tasks.tasks.process_ossf_dependency_metrics': {'queue': 'secondary'}, 'augur.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'}, + 'augur.tasks.git.scc_value_tasks.tasks.process_scc_value_metrics' : {'queue': 'facade'}, 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'}, 'augur.tasks.frontend.*': {'queue': 'frontend'}, 'augur.tasks.data_analysis.contributor_breadth_worker.*': {'queue': 'secondary'}, @@ -203,7 +201,7 @@ def setup_periodic_tasks(sender, **kwargs): """ from celery.schedules import crontab from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights - from augur.tasks.start_tasks import non_repo_domain_tasks + from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.db.refresh_materialized_views import refresh_materialized_views from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model @@ -228,6 +226,9 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling update of collection weights on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) + logger.info(f"Setting 404 repos to be marked for retry on midnight each day") + sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) + logger.info(f"Scheduling contributor breadth every 30 days") thirty_days_in_seconds = 30*24*60*60 sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 1918efbf8a..a9ba7e1634 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -24,15 +24,18 @@ from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data +from augur.tasks.gitlab.merge_request_task import 
collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments +from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * # from augur.tasks.data_analysis import * from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession from logging import Logger -from enum import Enum from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo +from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor @@ -93,6 +96,27 @@ def primary_repo_collect_phase(repo_git): return repo_task_group +def primary_repo_collect_phase_gitlab(repo_git): + + logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) + + jobs = group( + chain(collect_gitlab_merge_requests.si(repo_git), group( + #collect_merge_request_comments.s(repo_git), + #collect_merge_request_reviewers.s(repo_git), + collect_merge_request_metadata.s(repo_git), + collect_merge_request_commits.s(repo_git), + collect_merge_request_files.s(repo_git), + collect_gitlab_merge_request_events.si(repo_git), + )), + chain(collect_gitlab_issues.si(repo_git), group( + #collect_gitlab_issue_comments.s(repo_git), + collect_gitlab_issue_events.si(repo_git), + )), + ) + + return jobs + #This phase creates the message for secondary collection tasks. #These are less important and have their own worker. @@ -102,8 +126,8 @@ def secondary_repo_collect_phase(repo_git): repo_task_group = group( process_pull_request_files.si(repo_git), process_pull_request_commits.si(repo_git), - process_ossf_dependency_metrics.si(repo_git), - chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)) + chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)), + process_ossf_dependency_metrics.si(repo_git) ) return repo_task_group @@ -143,65 +167,53 @@ def non_repo_domain_tasks(): tasks.apply_async() - - """ - The below functions define augur's collection hooks. - Each collection hook schedules tasks for a number of repos - """ -def start_primary_collection(session,max_repo, days_until_collect_again = 1): - - #Get list of enabled phases - enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) - - #Primary collection hook. +def build_primary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): + #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] + primary_gitlab_enabled_phases = [] #Primary jobs if prelim_phase.__name__ in enabled_phase_names: primary_enabled_phases.append(prelim_phase) - - + primary_enabled_phases.append(primary_repo_collect_phase) + primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) #task success is scheduled no matter what the config says. 
def core_task_success_util_gen(repo_git): return core_task_success_util.si(repo_git) - - primary_enabled_phases.append(core_task_success_util_gen) - - start_repos_by_user(session, max_repo, primary_enabled_phases) -def start_secondary_collection(session,max_repo, days_until_collect_again = 1): + primary_enabled_phases.append(core_task_success_util_gen) + primary_gitlab_enabled_phases.append(core_task_success_util_gen) - #Get list of enabled phases - enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) + primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) + primary_request.get_valid_repos(session) + return primary_request +def build_secondary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] if prelim_phase.__name__ in enabled_phase_names: secondary_enabled_phases.append(prelim_phase_secondary) - + secondary_enabled_phases.append(secondary_repo_collect_phase) def secondary_task_success_util_gen(repo_git): return secondary_task_success_util.si(repo_git) secondary_enabled_phases.append(secondary_task_success_util_gen) + request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) - conds = f"augur_operations.collection_status.core_status = '{str(CollectionState.SUCCESS.value)}'"#[CollectionStatus.core_status == str(CollectionState.SUCCESS.value)] - start_repos_by_user( - session, max_repo, - secondary_enabled_phases,hook="secondary", - additional_conditions=conds - ) + request.get_valid_repos(session) + return request -def start_facade_collection(session,max_repo,days_until_collect_again = 1): - #Deal with secondary collection +def build_facade_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): + #Deal with facade collection facade_enabled_phases = [] - + facade_enabled_phases.append(facade_phase) def facade_task_success_util_gen(repo_git): @@ -214,22 +226,12 @@ def facade_task_update_weight_util_gen(repo_git): facade_enabled_phases.append(facade_task_update_weight_util_gen) - #cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days) - #not_pending = CollectionStatus.facade_status != str(CollectionState.PENDING.value) - #not_failed_clone = CollectionStatus.facade_status != str(CollectionState.FAILED_CLONE.value) - #not_initializing = CollectionStatus.facade_status != str(CollectionState.INITIALIZING.value) + request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) - conds = f"augur_operations.collection_status.facade_status != '{str(CollectionState.PENDING.value)}' "#[not_pending,not_failed_clone,not_initializing] - conds += f"AND augur_operations.collection_status.facade_status != '{str(CollectionState.FAILED_CLONE.value)}' " - conds += f"AND augur_operations.collection_status.facade_status != '{str(CollectionState.INITIALIZING.value)}'" + request.get_valid_repos(session) + return request - start_repos_by_user( - session, max_repo, - facade_enabled_phases,hook="facade", - new_status=CollectionState.UPDATE.value,additional_conditions=conds - ) - -def start_ml_collection(session,max_repo, days_until_collect_again=7): +def build_ml_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) @@ -239,13 +241,9 @@ def ml_task_success_util_gen(repo_git): 
ml_enabled_phases.append(ml_task_success_util_gen) - conds = f"augur_operations.collection_status.secondary_status = '{str(CollectionState.SUCCESS.value)}'" - - start_repos_by_user( - session,max_repo, - ml_enabled_phases,hook="ml",additional_conditions=conds - ) - +    request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) +    request.get_valid_repos(session) +    return request @celery.task def augur_collection_monitor(): @@ -260,17 +258,27 @@ def augur_collection_monitor(): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) +        enabled_collection_hooks = [] + if primary_repo_collect_phase.__name__ in enabled_phase_names: -            start_primary_collection(session, max_repo=40) +            enabled_collection_hooks.append(build_primary_repo_collect_request(session,enabled_phase_names)) if secondary_repo_collect_phase.__name__ in enabled_phase_names: -            start_secondary_collection(session, max_repo=10) +            enabled_collection_hooks.append(build_secondary_repo_collect_request(session,enabled_phase_names)) +            #start_secondary_collection(session, max_repo=10) if facade_phase.__name__ in enabled_phase_names: -            start_facade_collection(session, max_repo=30) +            #start_facade_collection(session, max_repo=30) +            enabled_collection_hooks.append(build_facade_repo_collect_request(session,enabled_phase_names)) if machine_learning_phase.__name__ in enabled_phase_names: -            start_ml_collection(session,max_repo=5) +            enabled_collection_hooks.append(build_ml_repo_collect_request(session,enabled_phase_names)) +            #start_ml_collection(session,max_repo=5) + +        logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") +        main_routine = AugurTaskRoutine(session,enabled_collection_hooks) + +        main_routine.start_data_collection() # have a pipe of 180 @@ -320,9 +328,41 @@ def augur_collection_update_weights(): session.commit() #git_update_commit_count_weight(repo_git) +@celery.task +def retry_errored_repos(): +    """ +    Periodic task to reset repositories that have errored and try again. +    """ +    from augur.tasks.init.celery_app import engine +    logger = logging.getLogger(retry_errored_repos.__name__) + +    #TODO: Isaac needs to normalize the statuses to be abstract in the +    #collection_status table once augur dev is less unstable. +    with DatabaseSession(logger,engine) as session: +        query = s.sql.text(f"""UPDATE augur_operations.collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" +            f""" WHERE secondary_status = '{CollectionState.ERROR.value}' ;""" +            f"""UPDATE augur_operations.collection_status SET core_status = '{CollectionState.PENDING.value}'""" +            f""" WHERE core_status = '{CollectionState.ERROR.value}' ;""" +            f"""UPDATE augur_operations.collection_status SET facade_status = '{CollectionState.PENDING.value}'""" +            f""" WHERE facade_status = '{CollectionState.ERROR.value}' ;""" +            f"""UPDATE augur_operations.collection_status SET ml_status = '{CollectionState.PENDING.value}'""" +            f""" WHERE ml_status = '{CollectionState.ERROR.value}' ;""" +        ) + +        session.execute_sql(query) + + + #Retry this task for every issue so that repos that were added manually get the chance to be added to the collection_status table. @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) def create_collection_status_records(): +    """ +    Automatic task that runs and checks for repos that haven't been given a collection_status +    record corresponding to the state of their collection at the moment. + +    A special celery task that automatically retries itself and has no max retries.
+ """ + from augur.tasks.init.celery_app import engine logger = logging.getLogger(create_collection_status_records.__name__) @@ -338,4 +378,4 @@ def create_collection_status_records(): repo = session.execute_sql(query).first() #Check for new repos every seven minutes to be out of step with the clone_repos task - create_collection_status_records.si().apply_async(countdown=60*7) \ No newline at end of file + create_collection_status_records.si().apply_async(countdown=60*7) diff --git a/augur/tasks/util/collection_state.py b/augur/tasks/util/collection_state.py new file mode 100644 index 0000000000..b5b8f0d264 --- /dev/null +++ b/augur/tasks/util/collection_state.py @@ -0,0 +1,30 @@ + +from enum import Enum + +class CollectionState(Enum): + """ + Enum of possible states a repository's collection + can have whether it is core, secondary, facade, etc. + + Attributes: + + SUCCESS: State of success for the jobs in that collection hook + PENDING: Means the repo has not had collection run at all + ERROR: The collection hook has crashed + COLLECTING: The collection hook is running + INITIALIZING: Only for facade, indicates the repo is being cloned via git + UPDATE: Only for facade, indicates the repo has been cloned + FAILED_CLONE: Only for facade, indicates the clone has failed (usually 404) + STANDBY: Indicates the repo has been paused + IGNORE: Repo has encountered an error and we will not try again (usually 404) + """ + + SUCCESS = "Success" + PENDING = "Pending" + ERROR = "Error" + COLLECTING = "Collecting" + INITIALIZING = "Initializing" + UPDATE = "Update" + FAILED_CLONE = "Failed Clone" + STANDBY = "Standby" + IGNORE = "Ignore" diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 288f8132e1..89ae5f3d53 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -24,17 +24,176 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from augur.tasks.util.collection_state import CollectionState -# class syntax -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" - INITIALIZING = "Initializing" - UPDATE = "Update" - FAILED_CLONE = "Failed Clone" +def get_list_of_all_users(session): + #Get a list of all users. 
+ query = s.sql.text(""" + SELECT + user_id + FROM augur_operations.users + """) + + users = session.execute_sql(query).fetchall() + return users + + +def get_required_conditions_for_core_repos(allow_collected_before = False, days_until_collect_again = 1): + + if not allow_collected_before: + condition_concat_string = f""" + core_status='{str(CollectionState.PENDING.value)}' AND core_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.core_data_last_collected IS NULL + AND core_status!='{str(CollectionState.COLLECTING.value)}' + """ + else: + condition_concat_string = f""" + core_status='Success' AND core_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.core_data_last_collected IS NOT NULL + AND core_status!='{str(CollectionState.COLLECTING.value)}' + AND core_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + """ + + return condition_concat_string + +def get_required_conditions_for_secondary_repos(allow_collected_before = False, days_until_collect_again = 1): + + if not allow_collected_before: + condition_concat_string = f""" + secondary_status='{str(CollectionState.PENDING.value)}' AND secondary_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.core_status = '{str(CollectionState.SUCCESS.value)}' + AND augur_operations.collection_status.secondary_data_last_collected IS NULL + AND secondary_status!='{str(CollectionState.COLLECTING.value)}' + """ + else: + condition_concat_string = f""" + secondary_status='Success' AND secondary_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.secondary_data_last_collected IS NOT NULL + AND augur_operations.collection_status.core_status = '{str(CollectionState.SUCCESS.value)}' + AND secondary_status!='{str(CollectionState.COLLECTING.value)}' + AND secondary_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + """ + + return condition_concat_string + +def get_required_conditions_for_facade_repos(allow_collected_before = False, days_until_collect_again = 1): + + if not allow_collected_before: + condition_concat_string = f""" + facade_status='{str(CollectionState.UPDATE.value)}' AND facade_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.facade_status != '{str(CollectionState.PENDING.value)}' + AND augur_operations.collection_status.facade_status != '{str(CollectionState.FAILED_CLONE.value)}' + AND augur_operations.collection_status.facade_status != '{str(CollectionState.INITIALIZING.value)}' + AND augur_operations.collection_status.facade_data_last_collected IS NULL + AND facade_status!='{str(CollectionState.COLLECTING.value)}' + """ + else: + condition_concat_string = f""" + facade_status='Success' AND facade_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.facade_data_last_collected IS NOT NULL + AND augur_operations.collection_status.facade_status != '{str(CollectionState.PENDING.value)}' + AND augur_operations.collection_status.facade_status != '{str(CollectionState.FAILED_CLONE.value)}' + AND augur_operations.collection_status.facade_status != '{str(CollectionState.INITIALIZING.value)}' + AND facade_status!='{str(CollectionState.COLLECTING.value)}' + AND facade_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + """ + + return condition_concat_string + +def get_required_conditions_for_ml_repos(allow_collected_before = False, days_until_collect_again = 1): + + if not 
allow_collected_before: + condition_concat_string = f""" + ml_status='{str(CollectionState.PENDING.value)}' AND ml_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.secondary_status = '{str(CollectionState.SUCCESS.value)}' + AND augur_operations.collection_status.ml_data_last_collected IS NULL + AND ml_status!='{str(CollectionState.COLLECTING.value)}' + """ + else: + condition_concat_string = f""" + ml_status='Success' AND ml_status!='{str(CollectionState.ERROR.value)}' + AND augur_operations.collection_status.ml_data_last_collected IS NOT NULL + AND ml_status!='{str(CollectionState.COLLECTING.value)}' + AND ml_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + """ + + return condition_concat_string + + + +class CollectionRequest: + def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab_phases=None): + self.name = name + self.phases = phases + self.gitlab_phases = gitlab_phases + self.max_repo = max_repo + self.days_until_collect_again = days_until_collect_again + self.new_status = CollectionState.PENDING.value + self.repo_list = [] + + self.status_column = f"{name}_status" + + + if name == "facade": + self.new_status = CollectionState.UPDATE.value + + def get_active_repo_count(self,session): + return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) + + #Get repo urls based on passed in info. + def get_valid_repos(self,session): + #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook + #Get the count of repos that are currently running this collection hook + #status_column = f"{hook}_status" + active_repo_count = self.get_active_repo_count(session) + + #Will always disallow errored repos and repos that are already collecting + + #The maximum amount of repos to schedule is affected by the existing repos running tasks + limit = self.max_repo-active_repo_count + + #Extract the user id from the randomized list and split into four chunks + split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + + session.logger.info(f"User_list: {split_user_list}") + + #Iterate through each fourth of the users fetched + for quarter_list in split_user_list: + if limit <= 0: + return + + collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + + self.repo_list.extend(collection_list) + #Update limit with amount of repos started + limit -= len(collection_list) + + #Now start old repos if there is space to do so. 
+ if limit <= 0: + return + + + user_list = get_list_of_all_users(session) + random.shuffle(user_list) + + #Extract the user id from the randomized list and split into four chunks + split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) + + for quarter_list in split_user_list: + + #Break out if limit has been reached + if limit <= 0: + return + + #only start repos older than the specified amount of days + #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored + #Order by the relevant weight for the collection hook + collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + + self.repo_list.extend(collection_list) + limit -= len(collection_list) + def get_enabled_phase_names_from_config(logger, session): @@ -372,6 +531,7 @@ class AugurTaskRoutine: """ class to keep track of various groups of collection tasks for a group of repos. Simple version to just schedule a number of repos not worrying about repo weight. + The repo weight matters when constructing the CollectionRequest through get_valid_repos Used when scheduling repo clones/updates. @@ -382,28 +542,23 @@ class to keep track of various groups of collection tasks for a group of repos. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. core session: Database session to use """ - def __init__(self,session,repos: List[str]=[],collection_phases: List=[],collection_hook: str="core"): + def __init__(self,session,collection_hooks): self.logger = session.logger - #self.session = TaskSession(self.logger) - self.collection_phases = collection_phases - #self.disabled_collection_tasks = disabled_collection_tasks - self.repos = repos - self.session = session - self.collection_hook = collection_hook - #Also have attribute to determine what to set repos' status as when they are run - self.start_state = CollectionState.COLLECTING.value + self.collection_hooks = collection_hooks + self.session = session - def update_status_and_id(self,repo_git, task_id): + def update_status_and_id(self,repo_git, task_id, name): repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() #Set status in database to collecting repoStatus = repo.collection_status[0] # - setattr(repoStatus,f"{self.collection_hook}_task_id",task_id) - setattr(repoStatus,f"{self.collection_hook}_status",self.start_state) + setattr(repoStatus,f"{name}_task_id",task_id) + setattr(repoStatus,f"{name}_status", CollectionState.COLLECTING.value) self.session.commit() + def start_data_collection(self): """Start all task items and return. @@ -415,48 +570,81 @@ def start_data_collection(self): #Send messages starts each repo and yields its running info #to concurrently update the correct field in the database. - for repo_git, task_id in self.send_messages(): - self.update_status_and_id(repo_git,task_id) + for repo_git, task_id, hook_name in self.send_messages(): + self.update_status_and_id(repo_git,task_id,hook_name) def send_messages(self): augur_collection_list = [] - for repo_git in self.repos: - - #repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() - #repo_id = repo.repo_id - - augur_collection_sequence = [] - for job in self.collection_phases: - #Add the phase to the sequence in order as a celery task. 
- #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting repo {self.collection_hook} status to collecting for repo: {repo_git}") + for col_hook in self.collection_hooks: - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id - -def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): - - logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") - if len(repo_git_identifiers) == 0: - return 0 - - logger.info(f"Collection starting for {hook}: {tuple(repo_git_identifiers)}") - - routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) - - routine.start_data_collection() - - return len(repo_git_identifiers) + self.logger.info(f"Starting collection on {len(col_hook.repo_list)} {col_hook.name} repos") + + for repo_git in col_hook.repo_list: + + repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + if "github" in repo.repo_git: + augur_collection_sequence = [] + for job in col_hook.phases: + #Add the phase to the sequence in order as a celery task. + #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting github repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + else: + if col_hook.gitlab_phases is not None: + + augur_collection_sequence = [] + for job in col_hook.gitlab_phases: + #Add the phase to the sequence in order as a celery task. 
+ #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting gitlab repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + +#def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): +# +# logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") +# if len(repo_git_identifiers) == 0: +# return 0 +# +# logger.info(f"Collection starting for {hook}: {tuple(repo_git_identifiers)}") +# +# routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) +# +# routine.start_data_collection() +# +# return len(repo_git_identifiers) + +def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): + + condition_string = "1" + + if hook == "core": + condition_string = get_required_conditions_for_core_repos(allow_collected_before=allow_old_repos,days_until_collect_again= days_to_wait_until_next_collection) + elif hook == "secondary": + condition_string = get_required_conditions_for_secondary_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) + elif hook == "facade": + condition_string = get_required_conditions_for_facade_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) + elif hook == "ml": + condition_string = get_required_conditions_for_ml_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) -def start_repos_from_given_group_of_users(session,limit,users,condition_string,phases,hook="core",repos_type="new"): #Query a set of valid repositories sorted by weight, also making sure that the repos are new #Order by the relevant weight for the collection hook repo_query = s.sql.text(f""" @@ -477,29 +665,15 @@ def start_repos_from_given_group_of_users(session,limit,users,condition_string,p session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") #start repos for new primary collection hook - collection_size = start_block_of_repos( - session.logger, session, - valid_repo_git_list, - phases, repos_type=repos_type, hook=hook - ) - - return collection_size - -""" - Generalized function for starting a phase of tasks for a given collection hook with options to add restrictive conditions -""" -def start_repos_by_user(session, max_repo,phase_list, days_until_collect_again = 1, hook="core",new_status=CollectionState.PENDING.value,additional_conditions=None): - - #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook - #Get the count of repos that are currently running this collection hook - status_column = f"{hook}_status" - active_repo_count = len(session.query(CollectionStatus).filter(getattr(CollectionStatus,status_column ) == CollectionState.COLLECTING.value).all()) + #collection_size = start_block_of_repos( + # session.logger, session, + # valid_repo_git_list, + # phases, repos_type=repos_type, hook=hook + #) - #Will always disallow errored repos and 
repos that are already collecting - - #The maximum amount of repos to schedule is affected by the existing repos running tasks - limit = max_repo-active_repo_count + return valid_repo_git_list +def split_random_users_list(session,status_col, status_new): #Split all users that have new repos into four lists and randomize order query = s.sql.text(f""" SELECT @@ -508,7 +682,7 @@ def start_repos_by_user(session, max_repo,phase_list, days_until_collect_again = JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE {status_column}='{str(new_status)}' + WHERE {status_col}='{str(status_new)}' GROUP BY user_id """) @@ -518,55 +692,5 @@ def start_repos_by_user(session, max_repo,phase_list, days_until_collect_again = #Extract the user id from the randomized list and split into four chunks split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - session.logger.info(f"User_list: {split_user_list}") - - #Iterate through each fourth of the users fetched - for quarter_list in split_user_list: - if limit <= 0: - return - - condition_concat_string = f""" - {status_column}='{str(new_status)}' AND {status_column}!='{str(CollectionState.ERROR.value)}' - AND {additional_conditions if additional_conditions else 'TRUE'} AND augur_operations.collection_status.{hook}_data_last_collected IS NULL - AND {status_column}!='{str(CollectionState.COLLECTING.value)}' - """ - - collection_size = start_repos_from_given_group_of_users(session,limit,tuple(quarter_list),condition_concat_string,phase_list,hook=hook) - #Update limit with amount of repos started - limit -= collection_size - - #Now start old repos if there is space to do so. - if limit <= 0: - return - - #Get a list of all users. 
- query = s.sql.text(""" - SELECT - user_id - FROM augur_operations.users - """) - - user_list = session.execute_sql(query).fetchall() - random.shuffle(user_list) - - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - - for quarter_list in split_user_list: - - #Break out if limit has been reached - if limit <= 0: - return - - condition_concat_string = f""" - {status_column}='Success' AND {status_column}!='{str(CollectionState.ERROR.value)}' - AND {additional_conditions if additional_conditions else 'TRUE'} AND augur_operations.collection_status.{hook}_data_last_collected IS NOT NULL - AND {status_column}!='{str(CollectionState.COLLECTING.value)}' AND {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' - """ - - #only start repos older than the specified amount of days - #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored - #Order by the relevant weight for the collection hook - collection_size = start_repos_from_given_group_of_users(session,limit,tuple(quarter_list),condition_concat_string,phase_list,hook=hook,repos_type="old") + return split_user_list - limit -= collection_size \ No newline at end of file diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index 345067ec18..7f7bd65557 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -33,7 +33,7 @@ def auth_flow(self, request: Request) -> Generator[Request, Response, None]: if self.list_of_keys: key_value = choice(self.list_of_keys) - + self.logger.debug(f'Key value used: {key_value}') # formats the key string into a format GitHub will accept if self.key_format: @@ -43,6 +43,7 @@ def auth_flow(self, request: Request) -> Generator[Request, Response, None]: # set the headers of the request with the new key request.headers[self.header_name] = key_string + #self.logger.info(f"List of Keys: {self.list_of_keys}") else: self.logger.error(f"There are no valid keys to make a request with: {self.list_of_keys}") diff --git a/augur/tasks/util/redis_list.py b/augur/tasks/util/redis_list.py index 0a3eaa79fa..0137273c1e 100644 --- a/augur/tasks/util/redis_list.py +++ b/augur/tasks/util/redis_list.py @@ -168,8 +168,10 @@ def pop(self, index: int = None): """ if index is None: - - redis.rpop(self.redis_list_key) + # This will get a random index from the list and remove it, + # decreasing the likelihood of everyone using the same key all the time + #redis.rpop(self.redis_list_key) + redis.spop(self.redis_list_key) else: # calls __delitem__ diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index b12646cd31..84c177724b 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -9,6 +9,8 @@ from typing import Optional, List, Any, Tuple from datetime import datetime, timedelta +import json +import subprocess def create_grouped_task_load(*args,processes=8,dataList=[],task=None): @@ -122,6 +124,24 @@ def calculate_date_weight_from_timestamps(added,last_collection,domain_start_day #Else increase its weight return -1 * factor +def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None): + logger.info(f"running subprocess {subprocess_arr[0]}") + if cwd: + p = subprocess.run(subprocess_arr,cwd=cwd,capture_output=True, text=True, timeout=None) + else: + p = subprocess.run(subprocess_arr,capture_output=True, text=True, timeout=None) + + 
logger.info('subprocess completed... ') + +    output = p.stdout + +    try: +        required_output = json.loads(output) +    except json.decoder.JSONDecodeError as e: +        logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}") +        raise e + +    return required_output # def create_server(app, worker=None):
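The parse_json_from_subprocess_call helper above shells out to an arbitrary command and decodes its stdout as JSON, logging and re-raising on decode failure. A minimal usage sketch, assuming the scc CLI (which the new augur.tasks.git.scc_value_tasks module appears to build on) is installed and accepts --format json; the checkout path below is only a placeholder:

import logging

from augur.tasks.util.worker_util import parse_json_from_subprocess_call

logger = logging.getLogger(__name__)

# Run a JSON-emitting command inside a repo checkout and get back the decoded
# Python objects; a JSONDecodeError is logged and re-raised by the helper.
summary = parse_json_from_subprocess_call(logger, ["scc", "--format", "json"], cwd="/path/to/repo/checkout")
logger.info(f"scc returned {len(summary)} summary records")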
[The remainder of this diff touches three Jinja2 templates, but the HTML markup in these hunks was lost when the document was extracted, leaving mostly bare "-"/"+" markers and scattered text fragments ("Settings", "Profile", "{{ current_user.id }}", "Delete Account", "Update Password"). What is still recoverable:

diff --git a/augur/templates/login.j2 b/augur/templates/login.j2 (index c71d02d50f..faaab620ea): a small change in the "Settings - Augur View" markup around line 108 (hunk @@ -108,7 +108,7 @@).

diff --git a/augur/templates/repos-table.j2 b/augur/templates/repos-table.j2 (index a0593c73e1..fce58eca05): hunk @@ -86,6 +86,10 @@ adds an {% elif pages %} branch next to the existing "Your search did not match any repositories" and "No Repos Tracked" messages; it renders a "Pagination Error" notice reading "Something went wrong displaying the requested section of results." and "Please go back and try again."

diff --git a/augur/templates/settings.j2 b/augur/templates/settings.j2 (index cefa4ac587..c75b6522ad): the settings page is substantially rewritten (hunk @@ -27,282 +27,385 @@); its markup is not recoverable from the extracted text.]
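Stepping back from the template cleanup: the heart of this diff is the scheduling refactor in augur/tasks/start_tasks.py and augur/tasks/util/collection_util.py, where the per-hook start_*_collection functions are replaced by CollectionRequest objects driven by a single AugurTaskRoutine. A condensed, illustrative sketch of that flow, using only names introduced in this diff (the wrapper function name is hypothetical; weights, user chunking, and error handling are omitted):

import logging

from augur.application.db.session import DatabaseSession
from augur.tasks.util.collection_util import CollectionRequest, AugurTaskRoutine

def schedule_core_collection(engine, core_phases, gitlab_phases):
    # Hypothetical wrapper mirroring augur_collection_monitor's new structure.
    logger = logging.getLogger("collection_sketch")
    with DatabaseSession(logger, engine) as session:
        # One request per collection hook; get_valid_repos() fills repo_list,
        # honoring max_repo and the days_until_collect_again back-off.
        core = CollectionRequest("core", core_phases, max_repo=40,
                                 days_until_collect_again=7,
                                 gitlab_phases=gitlab_phases)
        core.get_valid_repos(session)

        # The routine chains each hook's phases per repo, sends them to celery,
        # and flips each repo's status to Collecting as task ids come back.
        AugurTaskRoutine(session, [core]).start_data_collection()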