diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py
index 548c1eeff..439b359ab 100644
--- a/augur/application/cli/backend.py
+++ b/augur/application/cli/backend.py
@@ -170,21 +170,21 @@ def determine_worker_processes(ratio,maximum):
     sleep_time += 6
 
-    #60% of estimate, Maximum value of 45
-    core_num_processes = determine_worker_processes(.6, 45)
+    #60% of estimate, Maximum value of 80
+    core_num_processes = determine_worker_processes(.6, 80)
     logger.info(f"Starting core worker processes with concurrency={core_num_processes}")
     core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h"
     process_list.append(subprocess.Popen(core_worker.split(" ")))
     sleep_time += 6
 
-    #20% of estimate, Maximum value of 25
-    secondary_num_processes = determine_worker_processes(.2, 25)
+    #20% of estimate, Maximum value of 26
+    secondary_num_processes = determine_worker_processes(.2, 26)
     logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
     secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
     process_list.append(subprocess.Popen(secondary_worker.split(" ")))
     sleep_time += 6
 
-    #15% of estimate, Maximum value of 20
-    facade_num_processes = determine_worker_processes(.2, 20)
+    #20% of estimate, Maximum value of 40
+    facade_num_processes = determine_worker_processes(.2, 40)
     logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}")
     facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade"
 
diff --git a/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py b/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py
new file mode 100644
index 000000000..28b58756c
--- /dev/null
+++ b/augur/application/schema/alembic/versions/22_mat_view_cntrbid.py
@@ -0,0 +1,188 @@
+"""Fix Keys and materialized view
+
+Revision ID: 22
+Revises: 21
+Create Date: 2023-08-23 18:17:22.651191
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import text
+
+# revision identifiers, used by Alembic.
+revision = '22'
+down_revision = '21'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+
+    add_fix_keys_22()
+
+def downgrade():
+
+    upgrade=False
+
+    add_fix_keys_22(upgrade)
+
+def add_fix_keys_22(upgrade=True):
+
+    if upgrade:
+
+        conn = op.get_bind()
+        conn.execute(text("""
+        alter TABLE
+            augur_data.commits DROP CONSTRAINT if exists fk_commits_contributors_3,
+            DROP CONSTRAINT if exists fk_commits_contributors_4;
+        alter TABLE augur_data.contributors
+            DROP CONSTRAINT if exists "GH-UNIQUE-C",
+            DROP CONSTRAINT if exists
+            "GL-cntrb-LOGIN-UNIQUE";"""))
+
+        conn = op.get_bind()
+        conn.execute(text("""
+        drop materialized view if exists augur_data.explorer_contributor_actions;"""))
+
+        conn = op.get_bind()
+        conn.execute(text("""
+        create materialized view augur_data.explorer_contributor_actions as
+        SELECT
+            A.ID AS cntrb_id,
+            A.created_at,
+            A.repo_id,
+            A.ACTION,
+            repo.repo_name,
+            A.LOGIN,
+            DENSE_RANK() OVER(PARTITION BY A.ID, A.repo_id ORDER BY A.created_at) AS RANK
+        FROM (
+            SELECT
+                commits.cmt_ght_author_id AS ID,
+                commits.cmt_author_timestamp AS created_at,
+                commits.repo_id,
+                'commit' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                ( augur_data.commits LEFT JOIN augur_data.contributors ON ( ( ( contributors.cntrb_id ) :: TEXT = ( commits.cmt_ght_author_id ) :: TEXT ) ) )
+            GROUP BY
+                commits.cmt_commit_hash,
+                commits.cmt_ght_author_id,
+                commits.repo_id,
+                commits.cmt_author_timestamp,
+                'commit' :: TEXT,
+                contributors.cntrb_login
+            UNION ALL
+            SELECT
+                issues.reporter_id AS ID,
+                issues.created_at,
+                issues.repo_id,
+                'issue_opened' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                ( augur_data.issues LEFT JOIN augur_data.contributors ON ( ( contributors.cntrb_id = issues.reporter_id ) ) )
+            WHERE
+                ( issues.pull_request IS NULL )
+            UNION ALL
+            SELECT
+                pull_request_events.cntrb_id AS ID,
+                pull_request_events.created_at,
+                pull_requests.repo_id,
+                'pull_request_closed' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.pull_requests,
+                ( augur_data.pull_request_events LEFT JOIN augur_data.contributors ON ( ( contributors.cntrb_id = pull_request_events.cntrb_id ) ) )
+            WHERE
+                pull_requests.pull_request_id = pull_request_events.pull_request_id
+                AND pull_requests.pr_merged_at IS NULL
+                AND ( ( pull_request_events.ACTION ) :: TEXT = 'closed' :: TEXT )
+            UNION ALL
+            SELECT
+                pull_request_events.cntrb_id AS ID,
+                pull_request_events.created_at,
+                pull_requests.repo_id,
+                'pull_request_merged' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.pull_requests,
+                ( augur_data.pull_request_events LEFT JOIN augur_data.contributors ON ( ( contributors.cntrb_id = pull_request_events.cntrb_id ) ) )
+            WHERE
+                pull_requests.pull_request_id = pull_request_events.pull_request_id
+                AND ( ( pull_request_events.ACTION ) :: TEXT = 'merged' :: TEXT )
+            UNION ALL
+            SELECT
+                issue_events.cntrb_id AS ID,
+                issue_events.created_at,
+                issues.repo_id,
+                'issue_closed' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.issues,
+                augur_data.issue_events
+                LEFT JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id
+            WHERE
+                issues.issue_id = issue_events.issue_id
+                AND issues.pull_request IS NULL
+                AND ( ( issue_events.ACTION ) :: TEXT = 'closed' :: TEXT )
+            UNION ALL
+            SELECT
+                pull_request_reviews.cntrb_id AS ID,
+                pull_request_reviews.pr_review_submitted_at AS created_at,
+                pull_requests.repo_id,
+                ( 'pull_request_review_' :: TEXT || ( pull_request_reviews.pr_review_state ) :: TEXT ) AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.pull_requests,
+                augur_data.pull_request_reviews
+                LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_reviews.cntrb_id
+            WHERE
+                pull_requests.pull_request_id = pull_request_reviews.pull_request_id
+            UNION ALL
+            SELECT
+                pull_requests.pr_augur_contributor_id AS ID,
+                pull_requests.pr_created_at AS created_at,
+                pull_requests.repo_id,
+                'pull_request_open' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.pull_requests
+                LEFT JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id
+            UNION ALL
+            SELECT
+                message.cntrb_id AS ID,
+                message.msg_timestamp AS created_at,
+                pull_requests.repo_id,
+                'pull_request_comment' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.pull_requests,
+                augur_data.pull_request_message_ref,
+                augur_data.message
+                LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id
+            WHERE
+                pull_request_message_ref.pull_request_id = pull_requests.pull_request_id
+                AND pull_request_message_ref.msg_id = message.msg_id
+            UNION ALL
+            SELECT
+                issues.reporter_id AS ID,
+                message.msg_timestamp AS created_at,
+                issues.repo_id,
+                'issue_comment' :: TEXT AS ACTION,
+                contributors.cntrb_login AS LOGIN
+            FROM
+                augur_data.issues,
+                augur_data.issue_message_ref,
+                augur_data.message
+                LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id
+            WHERE
+                issue_message_ref.msg_id = message.msg_id
+                AND issues.issue_id = issue_message_ref.issue_id
+                AND issues.closed_at != message.msg_timestamp
+        ) A,
+        augur_data.repo
+        WHERE
+            A.repo_id = repo.repo_id
+        ORDER BY
+            A.created_at DESC"""))
+
diff --git a/augur/application/schema/alembic/versions/23_add_index_ghlogin.py b/augur/application/schema/alembic/versions/23_add_index_ghlogin.py
new file mode 100644
index 000000000..48a96eb3b
--- /dev/null
+++ b/augur/application/schema/alembic/versions/23_add_index_ghlogin.py
@@ -0,0 +1,44 @@
+"""add index
+
+Revision ID: 23
+Revises: 22
+Create Date: 2023-08-23 18:17:22.651191
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import text
+
+# revision identifiers, used by Alembic.
+revision = '23'
+down_revision = '22'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+
+    gh_loginindex()
+
+def downgrade():
+
+    upgrade=False
+
+    gh_loginindex(upgrade)
+
+def gh_loginindex(upgrade=True):
+
+    if upgrade:
+
+        conn = op.get_bind()
+        conn.execute(text("""
+            CREATE INDEX if not exists "gh_login" ON "augur_data"."contributors" USING btree (
+                "gh_login" ASC NULLS FIRST);"""))
+
+    else:
+
+
+        conn = op.get_bind()
+        conn.execute(text("""
+            DROP INDEX if exists "augur_data"."gh_login";"""))
diff --git a/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py b/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py
new file mode 100644
index 000000000..719aeabe7
--- /dev/null
+++ b/augur/application/schema/alembic/versions/24_alter_repo_labor_unique.py
@@ -0,0 +1,45 @@
+"""Alter repo labor unique
+
+Revision ID: 24
+Revises: 23
+Create Date: 2023-08-25 18:17:22.651191
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy.sql import text
+
+# revision identifiers, used by Alembic.
+revision = '24'
+down_revision = '23'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+
+    conn = op.get_bind()
+
+    # Recreate the unique constraint without INITIALLY DEFERRED.
+    conn.execute(text("""
+        ALTER TABLE "augur_data"."repo_labor"
+        DROP CONSTRAINT IF EXISTS "rl-unique",
+        ADD CONSTRAINT "rl-unique" UNIQUE ("repo_id", "rl_analysis_date", "file_path", "file_name");
+    """))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    conn = op.get_bind()
+
+    # Restore the unique constraint as DEFERRABLE INITIALLY DEFERRED.
+    conn.execute(text("""
+        ALTER TABLE "augur_data"."repo_labor"
+        DROP CONSTRAINT IF EXISTS "rl-unique",
+        ADD CONSTRAINT "rl-unique" UNIQUE ("repo_id", "rl_analysis_date", "file_path", "file_name") DEFERRABLE INITIALLY DEFERRED;
+    """))
+
+    # ### end Alembic commands ###
diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py
index eaeac96af..9176a9328 100644
--- a/augur/tasks/git/dependency_tasks/core.py
+++ b/augur/tasks/git/dependency_tasks/core.py
@@ -12,9 +12,10 @@ from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler
 from augur.application.db.util import execute_session_query
 from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc
+from augur.tasks.util.worker_util import parse_json_from_subprocess_call
 
 def generate_deps_data(session, repo_id, path):
-    """Runs deps modules on repo and stores data in database
+    """Runs dependency logic on the repo and stores the data in the database
     :param repo_id: Repository ID
-    :param path: Absolute path of the Repostiory
+    :param path: Absolute path of the repository
     """
 
@@ -80,16 +81,8 @@ def generate_scorecard(session,repo_id,path):
     key_handler = GithubApiKeyHandler(session)
     os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key()
 
-    p= subprocess.run(['./scorecard', command, '--format=json'], cwd= path_to_scorecard ,capture_output=True, text=True, timeout=None)
-    session.logger.info('subprocess completed successfully... ')
-    output = p.stdout
-
-    try:
-        required_output = json.loads(output)
-    except json.decoder.JSONDecodeError as e:
-        session.logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}")
-        return
-
+    required_output = parse_json_from_subprocess_call(session.logger, ['./scorecard', command, '--format=json'], cwd=path_to_scorecard)
+
     session.logger.info('adding to database...')
     session.logger.debug(f"output: {required_output}")
 
diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py
index d407011b0..3f08fde97 100644
--- a/augur/tasks/git/facade_tasks.py
+++ b/augur/tasks/git/facade_tasks.py
@@ -46,6 +46,7 @@ from augur.tasks.git.dependency_tasks.tasks import process_dependency_metrics
 
 from augur.tasks.git.dependency_libyear_tasks.tasks import process_libyear_dependency_metrics
+from augur.tasks.git.scc_value_tasks.tasks import process_scc_value_metrics
 
 from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api
 from augur.tasks.github.util.gh_graphql_entities import PullRequest
 
@@ -526,7 +527,8 @@ def facade_phase(repo_git):
 
         group(
             chain(*facade_core_collection),
             process_dependency_metrics.si(repo_git),
-            process_libyear_dependency_metrics.si(repo_git)
+            process_libyear_dependency_metrics.si(repo_git),
+            process_scc_value_metrics.si(repo_git)
         )
     )
diff --git a/augur/tasks/git/scc_value_tasks/__init__.py b/augur/tasks/git/scc_value_tasks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py
new file mode 100644
index 000000000..5fd7afb7b
--- /dev/null
+++ b/augur/tasks/git/scc_value_tasks/core.py
@@ -0,0 +1,57 @@
+from datetime import datetime
+import logging
+import requests
+import json
+import os
+import subprocess
+import re
+import traceback
+from augur.application.db.models import *
+from augur.application.db.session import DatabaseSession
+from augur.application.config import AugurConfig
+from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler
+from augur.application.db.util import execute_session_query
+from augur.tasks.util.worker_util import parse_json_from_subprocess_call
+
+def value_model(session, repo_git, repo_id, path):
+    """Runs scc on the repo and stores the data in the database
+    :param repo_id: Repository ID
+    :param path: Absolute file path of the repository
+    """
+
+    session.logger.info('Generating value data for repo')
+    session.logger.info(f"Repo ID: {repo_id}, Path: {path}")
+    session.logger.info('Running scc...')
+
+    path_to_scc = os.environ['HOME'] + '/scc'
+
+    required_output = parse_json_from_subprocess_call(session.logger, ['./scc', '-f', 'json', '--by-file', path], cwd=path_to_scc)
+
+    session.logger.info('adding scc data to database...')
+    session.logger.debug(f"output: {required_output}")
+
+    to_insert = []
+    for record in required_output:
+        for file in record['Files']:
+            repo_labor = {
+                'repo_id': repo_id,
+                'rl_analysis_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ'),
+                'programming_language': file['Language'],
+                'file_path': file['Location'],
+                'file_name': file['Filename'],
+                'total_lines': file['Lines'],
+                'code_lines': file['Code'],
+                'comment_lines': file['Comment'],
+                'blank_lines': file['Blank'],
+                'code_complexity': file['Complexity'],
+                'repo_url': repo_git,
+                'tool_source': 'value_model',
+                'data_source': 'Git',
+                'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
+            }
+
+            to_insert.append(repo_labor)
+
+    session.insert_data(to_insert, RepoLabor, ["repo_id", "rl_analysis_date", "file_path", "file_name"])
+
+    session.logger.info(f"Done generating scc data for repo {repo_id} from path {path}")
diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py
new file mode 100644
index 000000000..a2e4d11fc
--- /dev/null
+++ b/augur/tasks/git/scc_value_tasks/tasks.py
@@ -0,0 +1,28 @@
+import logging
+import traceback
+from augur.application.db.session import DatabaseSession
+from augur.tasks.git.scc_value_tasks.core import *
+from augur.tasks.init.celery_app import celery_app as celery
+from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurCoreRepoCollectionTask
+from augur.application.db.util import execute_session_query
+from augur.application.config import AugurConfig
+from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path
+
+
+@celery.task(base=AugurFacadeRepoCollectionTask)
+def process_scc_value_metrics(repo_git):
+
+    from augur.tasks.init.celery_app import engine
+
+    logger = logging.getLogger(process_scc_value_metrics.__name__)
+
+    with DatabaseSession(logger, engine) as session:
+        logger.info(f"repo_git: {repo_git}")
+
+        query = session.query(Repo).filter(Repo.repo_git == repo_git)
+        repo = execute_session_query(query, 'one')
+
+        config = AugurConfig(session.logger, session)
+        absolute_repo_path = get_absolute_repo_path(config.get_section("Facade")['repo_directory'], repo.repo_id, repo.repo_path, repo.repo_name)
+
+        value_model(session, repo_git, repo.repo_id, absolute_repo_path)
\ No newline at end of file
diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py
index ac6e18fc6..706541d1c 100644
--- a/augur/tasks/init/celery_app.py
+++ b/augur/tasks/init/celery_app.py
@@ -52,7 +52,8 @@ class CollectionState(Enum):
 
 git_tasks = ['augur.tasks.git.facade_tasks',
             'augur.tasks.git.dependency_tasks.tasks',
-            'augur.tasks.git.dependency_libyear_tasks.tasks']
+            'augur.tasks.git.dependency_libyear_tasks.tasks',
+            'augur.tasks.git.scc_value_tasks.tasks']
 
 data_analysis_tasks = ['augur.tasks.data_analysis.message_insights.tasks',
                     'augur.tasks.data_analysis.clustering_worker.tasks',
@@ -139,6 +140,7 @@ def on_failure(self,exc,task_id,args,kwargs,einfo):
     'augur.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'},
     'augur.tasks.git.dependency_tasks.tasks.process_ossf_dependency_metrics': {'queue': 'secondary'},
     'augur.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'},
+    'augur.tasks.git.scc_value_tasks.tasks.process_scc_value_metrics': {'queue': 'facade'},
     'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'},
     'augur.tasks.frontend.*': {'queue': 'frontend'},
     'augur.tasks.data_analysis.contributor_breadth_worker.*': {'queue': 'secondary'},
diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py
index b12646cd3..6380ed22b 100644
--- a/augur/tasks/util/worker_util.py
+++ b/augur/tasks/util/worker_util.py
@@ -9,6 +9,8 @@
 
 from typing import Optional, List, Any, Tuple
 from datetime import datetime, timedelta
+import json
+import subprocess
 
 
 def create_grouped_task_load(*args,processes=8,dataList=[],task=None):
@@ -122,6 +124,25 @@ def calculate_date_weight_from_timestamps(added,last_collection,domain_start_day
     #Else increase its weight
     return -1 * factor
 
+def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None):
+    """Runs a subprocess, captures its stdout, and parses it as JSON."""
+    logger.info(f"running subprocess {subprocess_arr[0]}")
+    if cwd:
+        p = subprocess.run(subprocess_arr, cwd=cwd, capture_output=True, text=True, timeout=None)
+    else:
+        p = subprocess.run(subprocess_arr, capture_output=True, text=True, timeout=None)
+
+    logger.info('subprocess completed...')
+
+    output = p.stdout
+
+    try:
+        required_output = json.loads(output)
+    except json.decoder.JSONDecodeError as e:
+        logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}")
+        raise e
+
+    return required_output
 
 
 # def create_server(app, worker=None):
diff --git a/scripts/control/.gitignore b/scripts/control/.gitignore
new file mode 100644
index 000000000..56c55cec0
--- /dev/null
+++ b/scripts/control/.gitignore
@@ -0,0 +1 @@
+rebuild-matviews.sh
diff --git a/scripts/install/workers.sh b/scripts/install/workers.sh
index 6a7c95f48..4f6dd7a62 100755
--- a/scripts/install/workers.sh
+++ b/scripts/install/workers.sh
@@ -80,3 +80,17 @@ else
     echo "scorecard build done"
     cd $CURRENT_DIR
 fi
+
+#Do the same thing for scc for the value worker
+if [ -d "$HOME/scc" ]; then
+    echo " scc already exists, removing the old copy so it can be updated ..."
+    rm -rf $HOME/scc
+fi
+
+echo "Cloning Sloc Cloc and Code (scc) to generate value data ..."
+CURRENT_DIR=$PWD
+git clone https://github.com/boyter/scc $HOME/scc
+cd $HOME/scc
+go build
+echo "scc build done"
+cd $CURRENT_DIR