diff --git a/.gitignore b/.gitignore index 7ebccb6a1..2feaa9803 100644 --- a/.gitignore +++ b/.gitignore @@ -192,4 +192,8 @@ pgdata/ postgres-data/ # Generated files from github -.history/ \ No newline at end of file +.history/sendgrid.env +sendgrid.env +*sendgrid*.env +./sendgrid.env +sendgrid.env diff --git a/augur/api/routes/application.py b/augur/api/routes/application.py index d758b020f..3d2b22b8e 100644 --- a/augur/api/routes/application.py +++ b/augur/api/routes/application.py @@ -12,7 +12,7 @@ import pandas as pd from flask import request, Response, jsonify, session from flask_login import login_user, logout_user, current_user, login_required -from werkzeug.security import generate_password_hash, check_password_hash +from werkzeug.security import check_password_hash from sqlalchemy.sql import text from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.exc import NoResultFound diff --git a/augur/api/routes/user.py b/augur/api/routes/user.py index cb2635e1d..6e8ae680e 100644 --- a/augur/api/routes/user.py +++ b/augur/api/routes/user.py @@ -12,7 +12,7 @@ import pandas as pd from flask import request, Response, jsonify, session from flask_login import login_user, logout_user, current_user, login_required -from werkzeug.security import generate_password_hash, check_password_hash +from werkzeug.security import check_password_hash from sqlalchemy.sql import text from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.exc import NoResultFound @@ -212,7 +212,7 @@ def update_user(): return jsonify({"status": "Email Updated"}) if new_password is not None: - current_user.login_hashword = generate_password_hash(new_password) + current_user.login_hashword = User.compute_hashsed_password(new_password) session.commit() session = Session() return jsonify({"status": "Password Updated"}) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 084ee4bce..287b07943 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -1,8 +1,11 @@ from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash +import re from flask_login import current_user, login_required -from augur.application.db.models import Repo +from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo +from augur.tasks.frontend import add_org_repo_list, parse_org_and_repo_name, parse_org_name from .utils import * -from ..server import app +from ..server import app, engine +from augur.application.db.session import DatabaseSession @app.route('/cache/file/') @app.route('/cache/file/') @@ -11,6 +14,36 @@ def cache(file=None): return redirect(url_for('static', filename="cache")) return redirect(url_for('static', filename="cache/" + toCacheFilename(file, False))) + +def add_existing_repo_to_group(session, user_id, group_name, repo_id): + + logger.info("Adding existing repo to group") + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False + +def add_existing_org_to_group(session, user_id, group_name, rg_id): + + logger.info("Adding existing org to group") + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False + + repos = session.query(Repo).filter(Repo.repo_group_id == rg_id).all() + logger.info("Length of repos in org: " + str(len(repos))) + for repo in repos: + result = UserRepo.insert(session, repo.repo_id, group_id) + if not 
result: + logger.info("Failed to add repo to group") + + + @app.route('/account/repos/add', methods = ['POST']) @login_required def av_add_user_repo(): @@ -33,46 +66,51 @@ def av_add_user_repo(): if group == "None": group = current_user.login_name + "_default" + invalid_urls = [] + + with DatabaseSession(logger, engine) as session: + for url in urls: + + # matches https://github.com/{org}/ or htts://github.com/{org} + if (org_name := Repo.parse_github_org_url(url)): + rg_obj = RepoGroup.get_by_name(session, org_name) + if rg_obj: + # add the orgs repos to the group + add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + + # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + elif Repo.parse_github_repo_url(url)[0]: + org_name, repo_name = Repo.parse_github_repo_url(url) + repo_git = f"https://github.com/{org_name}/{repo_name}" + repo_obj = Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + + # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} + elif (match := parse_org_and_repo_name(url)): + org, repo = match.groups() + repo_git = f"https://github.com/{org}/{repo}" + repo_obj = Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + + # matches /{org}/ or /{org} or {org}/ or {org} + elif (match := parse_org_name(url)): + org_name = match.group(1) + rg_obj = RepoGroup.get_by_name(session, org_name) + logger.info(rg_obj) + if rg_obj: + # add the orgs repos to the group + add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) + + else: + invalid_urls.append(url) - added_orgs = 0 - added_repos = 0 - for url in urls: - - # matches https://github.com/{org}/ or htts://github.com/{org} - if Repo.parse_github_org_url(url): - added = current_user.add_org(group, url) - if added: - added_orgs += 1 - - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} - elif Repo.parse_github_repo_url(url)[0]: - print("Adding repo") - added = current_user.add_repo(group, url) - if added: - print("Repo added") - added_repos += 1 - - # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} - elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', url)): - org, repo = match.groups() - repo_url = f"https://github.com/{org}/{repo}/" - added = current_user.add_repo(group, repo_url) - if added: - added_repos += 1 - - # matches /{org}/ or /{org} or {org}/ or {org} - elif (match := re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', url)): - org = match.group(1) - org_url = f"https://github.com/{org}/" - added = current_user.add_org(group, org_url) - if added: - added_orgs += 1 - - - if not added_orgs and not added_repos: - flash(f"Unable to add any repos or orgs") - else: - flash(f"Successfully added {added_repos} repos and {added_orgs} orgs") + if urls: + urls = [url.lower() for url in urls] + add_org_repo_list.si(current_user.user_id, group, urls).apply_async() + + flash("Adding repos and orgs in the background") return redirect(url_for("user_settings") + "?section=tracker") diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 7cf1292ca..631d9d755 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -75,6 +75,7 @@ def start(disable_collection, development, port): if not port: port = config.get_value("Server", "port") + 
worker_vmem_cap = config.get_value("Celery", 'worker_process_vmem_cap') gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app" server = subprocess.Popen(gunicorn_command.split(" ")) @@ -83,30 +84,18 @@ def start(disable_collection, development, port): logger.info('Gunicorn webserver started...') logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') - scheduling_worker_process = None - core_worker_process = None - secondary_worker_process = None - celery_beat_process = None - facade_worker_process = None - if not disable_collection: - - if os.path.exists("celerybeat-schedule.db"): + processes = start_celery_worker_processes(float(worker_vmem_cap), disable_collection) + time.sleep(5) + if os.path.exists("celerybeat-schedule.db"): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" - core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=10 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" - facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=15 -n facade:{uuid.uuid4().hex}@%h -Q facade" - - scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) - core_worker_process = subprocess.Popen(core_worker.split(" ")) - secondary_worker_process = subprocess.Popen(secondary_worker.split(" ")) - facade_worker_process = subprocess.Popen(facade_worker.split(" ")) + celery_beat_process = None + celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) - time.sleep(5) + if not disable_collection: - with DatabaseSession(logger) as session: clean_collection_status(session) @@ -120,10 +109,6 @@ def start(disable_collection, development, port): augur_collection_monitor.si().apply_async() - - celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) - else: logger.info("Collection disabled") @@ -135,21 +120,10 @@ def start(disable_collection, development, port): logger.info("Shutting down server") server.terminate() - if core_worker_process: - logger.info("Shutting down celery process: core") - core_worker_process.terminate() - - if scheduling_worker_process: - logger.info("Shutting down celery process: scheduling") - scheduling_worker_process.terminate() - - if secondary_worker_process: - logger.info("Shutting down celery process: secondary") - secondary_worker_process.terminate() - - if facade_worker_process: - logger.info("Shutting down celery process: facade") - facade_worker_process.terminate() + logger.info("Shutting down all celery worker processes") + for p in processes: + if p: + p.terminate() if celery_beat_process: logger.info("Shutting down celery beat process") @@ -162,6 +136,54 @@ def start(disable_collection, development, port): except RedisConnectionError: pass +def start_celery_worker_processes(vmem_cap_ratio, disable_collection=False): + + #Calculate process scaling based on how much memory is available on the system in bytes. 
+ #Each celery process takes ~500MB or 500 * 1024^2 bytes + + process_list = [] + + #Cap memory usage to 30% of total virtual memory + available_memory_in_bytes = psutil.virtual_memory().total * vmem_cap_ratio + available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) + max_process_estimate = available_memory_in_megabytes // 500 + + #Get a subset of the maximum procesess available using a ratio, not exceeding a maximum value + def determine_worker_processes(ratio,maximum): + return max(min(round(max_process_estimate * ratio),maximum),1) + + frontend_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" + max_process_estimate -= 1 + process_list.append(subprocess.Popen(frontend_worker.split(" "))) + + if not disable_collection: + + #2 processes are always reserved as a baseline. + scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + max_process_estimate -= 2 + process_list.append(subprocess.Popen(scheduling_worker.split(" "))) + + #60% of estimate, Maximum value of 45 + core_num_processes = determine_worker_processes(.6, 45) + logger.info(f"Starting core worker processes with concurrency={core_num_processes}") + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" + process_list.append(subprocess.Popen(core_worker.split(" "))) + + #20% of estimate, Maximum value of 25 + secondary_num_processes = determine_worker_processes(.2, 25) + logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + process_list.append(subprocess.Popen(secondary_worker.split(" "))) + + #15% of estimate, Maximum value of 20 + facade_num_processes = determine_worker_processes(.2, 20) + logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") + facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" + + process_list.append(subprocess.Popen(facade_worker.split(" "))) + + return process_list + @cli.command('stop') def stop(): @@ -378,7 +400,6 @@ def raise_open_file_limit(num_files): return - # def initialize_components(augur_app, disable_housekeeper): # master = None # manager = None diff --git a/augur/application/cli/user.py b/augur/application/cli/user.py index e2846d5f3..9d0b822be 100644 --- a/augur/application/cli/user.py +++ b/augur/application/cli/user.py @@ -8,7 +8,6 @@ import os import click import logging -from werkzeug.security import generate_password_hash from augur.application.db.models import User from augur.application.db.engine import DatabaseEngine from sqlalchemy.orm import sessionmaker @@ -48,7 +47,7 @@ def add_user(username, email, firstname, lastname, admin, phone_number, password user = session.query(User).filter(User.login_name == username).first() if not user: - password = generate_password_hash(password) + password = User.compute_hashsed_password(password) new_user = User(login_name=username, login_hashword=password, email=email, text_phone=phone_number, first_name=firstname, last_name=lastname, admin=admin, tool_source="User CLI", tool_version=None, 
data_source="CLI") session.add(new_user) session.commit() @@ -59,4 +58,21 @@ def add_user(username, email, firstname, lastname, admin, phone_number, password session.close() engine.dispose() - return 0 \ No newline at end of file + return 0 + +@cli.command('password_reset', short_help="Reset a user's password") +@click.argument("username") +@click.password_option(help="New password") +def reset_password(username, password): + session = Session() + + user = session.query(User).filter(User.login_name == username).first() + + if not user: + return click.echo("invalid username") + + password = User.compute_hashsed_password(password) + user.login_hashword = password + session.commit() + + return click.echo("Password updated") \ No newline at end of file diff --git a/augur/application/config.py b/augur/application/config.py index 134dd3daf..c9aff085b 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -37,6 +37,7 @@ def get_development_flag(): "github": "", "gitlab": "" }, + #TODO: a lot of these are deprecated. "Facade": { "check_updates": 1, "create_xlsx_summary_files": 1, @@ -66,7 +67,8 @@ def get_development_flag(): "log_level": "INFO", }, "Celery": { - "concurrency": 12 + "worker_process_vmem_cap": 0.25, + "refresh_materialized_views_interval_in_days": 7 }, "Redis": { "cache_group": 0, diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 70caa0230..676a71dee 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -25,6 +25,7 @@ import logging import re from typing import List, Any, Dict +import json from augur.application.db.models.base import Base @@ -1691,6 +1692,21 @@ class RepoBadging(Base): repo = relationship("Repo") + @staticmethod + def insert(session, repo_id: int, data: dict) -> dict: + + insert_statement = text("""INSERT INTO repo_badging (repo_id,tool_source,tool_version,data_source,data) + VALUES (:repo_id,:t_source,:t_version,:d_source,:data) + """).bindparams( + repo_id=repo_id, + t_source="collect_linux_badge_info", + t_version="0.50.3", + d_source="OSSF CII", + data=json.dumps(data,indent=4) + ) + + session.execute_sql(insert_statement) + class RepoClusterMessage(Base): __tablename__ = "repo_cluster_messages" diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index e65363ffb..0b40921a9 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -317,6 +317,17 @@ def get_user(session, username: str): return user except NoResultFound: return None + + @staticmethod + def get_by_id(session, user_id: int): + + if not isinstance(user_id, int): + return None + try: + user = session.query(User).filter(User.user_id == user_id).one() + return user + except NoResultFound: + return None @staticmethod def create_user(username: str, password: str, email: str, first_name:str, last_name:str, admin=False): @@ -335,7 +346,7 @@ def create_user(username: str, password: str, email: str, first_name:str, last_n return False, {"status": "A User already exists with that email"} try: - user = User(login_name = username, login_hashword = generate_password_hash(password), email = email, first_name = first_name, last_name = last_name, tool_source="User API", tool_version=None, data_source="API", admin=admin) + user = User(login_name = username, login_hashword = User.compute_hashsed_password(password), email = email, first_name = first_name, last_name = 
last_name, tool_source="User API", tool_version=None, data_source="API", admin=admin) session.add(user) session.commit() @@ -373,7 +384,7 @@ def update_password(self, session, old_password, new_password): if not check_password_hash(self.login_hashword, old_password): return False, {"status": "Password did not match users password"} - self.login_hashword = generate_password_hash(new_password) + self.login_hashword = User.compute_hashsed_password(new_password) session.commit() return True, {"status": "Password updated"} @@ -429,9 +440,12 @@ def remove_group(self, group_name): def add_repo(self, group_name, repo_url): from augur.tasks.github.util.github_task_session import GithubTaskSession - - with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, self.user_id, group_name) + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + try: + with GithubTaskSession(logger) as session: + result = UserRepo.add(session, repo_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} return result @@ -445,9 +459,13 @@ def remove_repo(self, group_name, repo_id): def add_org(self, group_name, org_url): from augur.tasks.github.util.github_task_session import GithubTaskSession + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError - with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + try: + with GithubTaskSession(logger) as session: + result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} return result @@ -578,6 +596,10 @@ def get_favorite_groups(self, session): return None, {"status": "Error when trying to get favorite groups"} return groups, {"status": "Success"} + + @staticmethod + def compute_hashsed_password(password): + return generate_password_hash(password, method='pbkdf2:sha512', salt_length=32) @@ -864,7 +886,7 @@ def add_org_repos(session, url: List[str], user_id: int, group_name: int): # if it doesn't exist create one if not repo_group: - repo_group = RepoGroup(rg_name=owner, rg_description="", rg_website="", rg_recache=0, rg_type="Unknown", + repo_group = RepoGroup(rg_name=owner.lower(), rg_description="", rg_website="", rg_recache=0, rg_type="Unknown", tool_source="Loaded by user", tool_version="1.0", data_source="Git") session.add(repo_group) session.commit() diff --git a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py b/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py index 2b94987e8..84f8f088b 100644 --- a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py +++ b/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py @@ -55,7 +55,7 @@ def total_facade_reset(): shutil.rmtree(path) #Create path - path.touch() + path.mkdir() #Move credentials in shutil.move("/tmp/.git-credentials",f"{facade_base_dir}.git-credentials") diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py new file mode 100644 index 000000000..24d7fd08c --- /dev/null +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -0,0 +1,66 @@ +"""Add extra celery options to the config if they do not exist + +Revision ID: 19 +Revises: 18 
+Create Date: 2023-05-15 12:03:57.171011 + +""" +from alembic import op +import sqlalchemy as sa +from augur.application.db.session import DatabaseSession +from augur.application.config import * +from sqlalchemy.sql import text +import logging + +# revision identifiers, used by Alembic. +revision = '19' +down_revision = '18' +branch_labels = None +depends_on = None + +logger = logging.getLogger(__name__) + +def upgrade(): + + with DatabaseSession(logger) as session: + config = AugurConfig(logger,session) + config_dict = config.load_config() + + #Update the missing fields of the celery section in the config + section = config_dict.get("Celery") + + #Just copy the default if section doesn't exist. + if section: + if 'worker_process_vmem_cap' not in section.keys(): + section['worker_process_vmem_cap'] = 0.25 + + if 'refresh_materialized_views_interval_in_days' not in section.keys(): + section['refresh_materialized_views_interval_in_days'] = 7 + else: + section = config.default_config["Celery"] + + config.add_section_from_json("Celery", section) + + #delete old setting + session.execute_sql(text(f""" + DELETE FROM augur_operations.config + WHERE section_name='Celery' AND setting_name='concurrency'; + """)) + + + +def downgrade(): + + conn = op.get_bind() + + conn.execute(text(f""" + DELETE FROM augur_operations.config + WHERE section_name='Celery' AND (setting_name='worker_process_vmem_cap' OR setting_name='refresh_materialized_views_interval_in_days'); + """)) + + try: + conn.execute(text(f""" + INSERT INTO augur_operations.config (section_name,setting_name,value,type) VALUES ('Celery','concurrency',12,'int'); + """)) + except: + pass \ No newline at end of file diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py new file mode 100644 index 000000000..b8eb8b203 --- /dev/null +++ b/augur/tasks/frontend.py @@ -0,0 +1,89 @@ +import logging +import re + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.github.util.github_task_session import GithubTaskSession +from augur.application.db.models import UserRepo, Repo, User + +def parse_org_name(string): + + match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/?$', string) + return match + +def parse_org_and_repo_name(string): + + match = re.match(r'^\/?([a-zA-Z0-9_-]+)\/([a-zA-Z0-9_-]+)\/?$', string) + return match + + +@celery.task +def add_org_repo_list(user_id, group_name, urls): + + logger = logging.getLogger(add_org_repo_list.__name__) + + with GithubTaskSession(logger) as session: + + user = User.get_by_id(session, user_id) + + invalid_urls = [] + valid_orgs = [] + valid_repos = [] + for url in urls: + + # matches https://github.com/{org}/ or htts://github.com/{org} + if Repo.parse_github_org_url(url): + added = user.add_org(group_name, url)[0] + if added: + valid_orgs.append(url) + + # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + elif Repo.parse_github_repo_url(url)[0]: + added = user.add_repo(group_name, url)[0] + if added: + valid_repos.append(url) + + # matches /{org}/{repo}/ or /{org}/{repo} or {org}/{repo}/ or {org}/{repo} + elif (match := parse_org_and_repo_name(url)): + org, repo = match.groups() + repo_url = f"https://github.com/{org}/{repo}/" + added = user.add_repo(group_name, repo_url)[0] + if added: + valid_repos.append(url) + + # matches /{org}/ or /{org} or {org}/ or {org} + elif (match := parse_org_name(url)): + org = match.group(1) + org_url = f"https://github.com/{org}/" + added = user.add_org(group_name, org_url)[0] + if added: + valid_orgs.append(url) + else: + 
invalid_urls.append(url) + + return valid_orgs, valid_repos, invalid_urls + + + + + + +@celery.task +def add_repo(user_id, group_name, repo_url): + + logger = logging.getLogger(add_org.__name__) + + with GithubTaskSession(logger) as session: + result = UserRepo.add(session, repo_url, user_id, group_name) + + print(repo_url, result) + + +@celery.task +def add_org(user_id, group_name, org_url): + + logger = logging.getLogger(add_org.__name__) + + with GithubTaskSession(logger) as session: + result = UserRepo.add_org_repos(session, org_url, user_id, group_name) + + print(org_url, result) diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 898de37cb..0cdd333b2 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -25,10 +25,10 @@ def process_dependency_metrics(repo_git): @celery.task(base=AugurCoreRepoCollectionTask) -def process_ossf_scorecard_metrics(repo_git): +def process_ossf_dependency_metrics(repo_git): from augur.tasks.init.celery_app import engine - logger = logging.getLogger(process_ossf_scorecard_metrics.__name__) + logger = logging.getLogger(process_ossf_dependency_metrics.__name__) with DatabaseSession(logger, engine) as session: logger.info(f"repo_git: {repo_git}") diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 52b3f2813..d407011b0 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -353,25 +353,26 @@ def clone_repos(): try: git_repo_initialize(session, repo_git) session.commit() + + # get the commit count + commit_count = get_repo_commit_count(session, repo_git) + facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) + + update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) + + # set repo to update + setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) + session.commit() except GitCloneError: # continue to next repo, since we can't calculate # commit_count or weight without the repo cloned - setattr(repoStatus,"facade_status", CollectionState.FAILED_CLONE.value) session.commit() - continue - - #logger.info("GOT HERE ISAAC") - - # get the commit count - commit_count = get_repo_commit_count(session, repo_git) - facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) - - update_facade_scheduling_fields(session, repo_git, facade_weight, commit_count) - - # set repo to update - setattr(repoStatus,"facade_status", CollectionState.UPDATE.value) - session.commit() + except Exception as e: + logger.error(f"Ran into unexpected issue when cloning repositories \n Error: {e}") + # set repo to error + setattr(repoStatus,"facade_status", CollectionState.ERROR.value) + session.commit() clone_repos.si().apply_async(countdown=60*5) diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 287f7368f..50fa88068 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -12,7 +12,8 @@ from augur.tasks.github.util.gh_graphql_entities import hit_api_graphql, request_graphql_dict from augur.application.db.models import * from augur.tasks.github.util.github_task_session import * - +from augur.application.db.models.augur_data import RepoBadging +from urllib.parse import quote def query_committers_count(key_auth, logger, owner, repo): @@ -292,3 +293,35 @@ def repo_info_model(augur_db, key_auth, repo_orm_obj, logger): logger.info(f"Inserted info for 
{owner}/{repo}\n") +def badges_model(logger,repo_git,repo_id,db): + """ Data collection and storage method + Query the CII API and store the result in the DB for the badges model + + This is a github task because it only covers github repos, this is not + part of the regular repo info model because it uses a differant api + github. + """ + cii_endpoint = "https://bestpractices.coreinfrastructure.org/projects.json?pq=" + + + #https://github.com/chaoss/grimoirelab-hatstall + logger.info(f"Collecting badge data for {repo_git}") + git_url_extension = quote(repo_git[0:-4]) + + url = cii_endpoint + git_url_extension + logger.debug(f"Hitting CII endpoint: {url}") + + #Hit cii api with no api key. + response = hit_api(None, url, logger) + + try: + response_data = response.json() + except: + response_data = json.loads(json.dumps(response.text)) + + #Insert any data that was returned + if len(response_data) > 0: + RepoBadging.insert(db, repo_id, response_data) + else: + logger.info(f"Could not find CII data for {repo_git}") + + diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index fe31e5800..d35c5dbdf 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -6,6 +6,8 @@ from augur.application.db.util import execute_session_query import traceback + +#Task to get regular misc github info @celery.task(base=AugurCoreRepoCollectionTask) def collect_repo_info(repo_git: str): @@ -17,3 +19,17 @@ def collect_repo_info(repo_git: str): repo = execute_session_query(query, 'one') repo_info_model(augur_db, manifest.key_auth, repo, logger) + + +#Task to get CII api data for linux badge info using github data. +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_linux_badge_info(repo_git: str): + + logger = logging.getLogger(collect_linux_badge_info.__name__) + + with GithubTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') + + badges_model(logger, repo_git, repo.repo_id, augur_db) diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index 86055d7a7..2406ecef0 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -8,6 +8,11 @@ from augur.application.db.session import DatabaseSession from augur.application.config import AugurConfig + +class NoValidKeysError(Exception): + pass + + class GithubApiKeyHandler(): """Handles Github API key retrieval from the database and redis @@ -122,6 +127,9 @@ def get_api_keys(self) -> List[str]: # add all the keys to redis self.redis_key_list.extend(valid_keys) + if not valid_keys: + raise NoValidKeysError("No valid github api keys found in the config or worker oauth table") + return valid_keys def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 1c252d8ce..548d25b0f 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -384,6 +384,10 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. 
timeout = timeout * 1.1 num_attempts += 1 continue + + # if api returns a status of 204 No Content then return empty list + if response.status_code == 204: + return [], response, GithubApiResult.SUCCESS page_data = parse_json_response(self.logger, response) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 60701aab6..48c5db32e 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -3,6 +3,7 @@ import logging from typing import List, Dict import os +import datetime from enum import Enum import traceback import celery @@ -61,10 +62,12 @@ class CollectionState(Enum): materialized_view_tasks = ['augur.tasks.db.refresh_materialized_views'] +frontend_tasks = ['augur.tasks.frontend'] + +tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + frontend_tasks + if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": - tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + data_analysis_tasks -else: - tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + tasks += data_analysis_tasks redis_db_number, redis_conn_string = get_redis_conn_values() @@ -129,9 +132,10 @@ def on_failure(self,exc,task_id,args, kwargs, einfo): 'augur.tasks.github.pull_requests.files_model.tasks.*': {'queue': 'secondary'}, 'augur.tasks.github.pull_requests.tasks.collect_pull_request_reviews': {'queue': 'secondary'}, 'augur.tasks.github.pull_requests.tasks.collect_pull_request_review_comments': {'queue': 'secondary'}, - 'augur.tasks.git.dependency_tasks.tasks.process_ossf_scorecard_metrics': {'queue': 'secondary'}, + 'augur.tasks.git.dependency_tasks.tasks.process_ossf_dependency_metrics': {'queue': 'secondary'}, 'augur.tasks.git.dependency_tasks.tasks.process_dependency_metrics': {'queue': 'facade'}, - 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'} + 'augur.tasks.git.dependency_libyear_tasks.tasks.process_libyear_dependency_metrics': {'queue': 'facade'}, + 'augur.tasks.frontend.*': {'queue': 'frontend'} } #Setting to be able to see more detailed states of running tasks @@ -210,8 +214,9 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes") sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s()) + mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days')) logger.info(f"Scheduling refresh materialized view every night at 1am CDT") - sender.add_periodic_task(crontab(hour=1, minute=0), refresh_materialized_views.s()) + sender.add_periodic_task(datetime.timedelta(days=mat_views_interval), refresh_materialized_views.s()) logger.info(f"Scheduling update of collection weights on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 32cb4e886..797a2903a 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -19,10 +19,10 @@ from augur.tasks.data_analysis import * from augur.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary from augur.tasks.github.releases.tasks import collect_releases -from augur.tasks.github.repo_info.tasks import collect_repo_info +from augur.tasks.github.repo_info.tasks import collect_repo_info, collect_linux_badge_info from augur.tasks.github.pull_requests.files_model.tasks import 
process_pull_request_files from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits -from augur.tasks.git.dependency_tasks.tasks import process_ossf_scorecard_metrics +from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * @@ -63,35 +63,30 @@ def prelim_phase_secondary(repo_git): #This is the phase that defines the message for core augur collection +#A chain is needed for each repo. def primary_repo_collect_phase(repo_git): logger = logging.getLogger(primary_repo_collect_phase.__name__) - #Here the term issues also includes prs. This list is a bunch of chains that run in parallel to process issue data. - issue_dependent_tasks = [] - #repo_info should run in a group - repo_info_tasks = [] - - np_clustered_array = [] - - #A chain is needed for each repo. - repo_info_task = collect_repo_info.si(repo_git)#collection_task_wrapper(self) + #Define primary group of jobs for the primary collect phase: issues and pull requests. primary_repo_jobs = group( collect_issues.si(repo_git), collect_pull_requests.si(repo_git) ) + #Define secondary group that can't run until after primary jobs have finished. secondary_repo_jobs = group( collect_events.si(repo_git),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, collect_github_messages.si(repo_git), #*create_grouped_task_load(dataList=first_pass,task=collect_github_messages).tasks, collect_github_repo_clones_data.si(repo_git), ) + #Other tasks that don't need other tasks to run before they do just put in final group. repo_task_group = group( - repo_info_task, + collect_repo_info.si(repo_git), chain(primary_repo_jobs | issue_pr_task_update_weight_util.s(repo_git=repo_git),secondary_repo_jobs,process_contributors.si()), #facade_phase(logger,repo_git), - + collect_linux_badge_info.si(repo_git), collect_releases.si(repo_git), grab_comitters.si(repo_git) ) @@ -107,7 +102,7 @@ def secondary_repo_collect_phase(repo_git): repo_task_group = group( process_pull_request_files.si(repo_git), process_pull_request_commits.si(repo_git), - process_ossf_scorecard_metrics.si(repo_git), + process_ossf_dependency_metrics.si(repo_git), chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)) ) diff --git a/augur/util/repo_load_controller.py b/augur/util/repo_load_controller.py index 2f8bad615..3436167ae 100644 --- a/augur/util/repo_load_controller.py +++ b/augur/util/repo_load_controller.py @@ -131,19 +131,19 @@ def paginate_repos(self, source, page=0, page_size=25, sort="repo_id", direction order_by = sort if sort else "repo_id" order_direction = direction if direction else "ASC" - query = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, + query, query_args, result = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, page=page, page_size=page_size, **kwargs) - if not query[0]: - return None, {"status": query[1]["status"]} + if not query: + return None, {"status": result["status"]} - if query[1]["status"] == "No data": + if result["status"] == "No data": return [], {"status": "No data"} - get_page_of_repos_sql = s.sql.text(query[0]) + get_page_of_repos_sql = s.sql.text(query) with DatabaseEngine(connection_pool_size=1) as engine: - results = pd.read_sql(get_page_of_repos_sql, engine) + results 
= pd.read_sql(get_page_of_repos_sql, engine, params=query_args) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) @@ -170,24 +170,27 @@ def get_repo_count(self, source, **kwargs): print("Func: get_repo_count. Error: Invalid source") return None, {"status": "Invalid source"} - query = self.generate_repo_query(source, count=True, **kwargs) - if not query[0]: - return None, query[1] + query, query_args, result = self.generate_repo_query(source, count=True, **kwargs) + if not query: + return None, result - if query[1]["status"] == "No data": + if result["status"] == "No data": return 0, {"status": "No data"} # surround query with count query so we just get the count of the rows - final_query = f"SELECT count(*) FROM ({query[0]}) a;" + final_query = f"SELECT count(*) FROM ({query}) a;" get_page_of_repos_sql = s.sql.text(final_query) - result = self.session.fetchall_data_from_sql_text(get_page_of_repos_sql) + result = self.session.execute(get_page_of_repos_sql, query_args).fetchall() return result[0]["count"], {"status": "success"} def generate_repo_query(self, source, count, **kwargs): # TODO: need more flexible way of calculating count for variable column queries + + query_args = {} + if count: # only query for repos ids so the query is faster for getting the count select = """ DISTINCT(augur_data.repo.repo_id), @@ -195,7 +198,7 @@ def generate_repo_query(self, source, count, **kwargs): (regexp_match(augur_data.repo.repo_git, 'github\.com\/([A-Za-z0-9 \- _]+)\/[A-Za-z0-9 \- _ .]+$'))[1] as repo_owner""" else: - select = f""" DISTINCT(augur_data.repo.repo_id), + select = """ DISTINCT(augur_data.repo.repo_id), augur_data.repo.description, augur_data.repo.repo_git AS url, COALESCE(a.commits_all_time, 0) as commits_all_time, @@ -226,7 +229,9 @@ def generate_repo_query(self, source, count, **kwargs): query += "\t\t JOIN augur_operations.user_repos ON augur_data.repo.repo_id = augur_operations.user_repos.repo_id\n" query += "\t\t JOIN augur_operations.user_groups ON augur_operations.user_repos.group_id = augur_operations.user_groups.group_id\n" - query += f"\t\t WHERE augur_operations.user_groups.user_id = {user.user_id}\n" + query += "\t\t WHERE augur_operations.user_groups.user_id = :user_id\n" + + query_args["user_id"] = user.user_id elif source == "group": @@ -246,7 +251,9 @@ def generate_repo_query(self, source, count, **kwargs): return None, {"status": "Group does not exists"} query += "\t\t JOIN augur_operations.user_repos ON augur_data.repo.repo_id = augur_operations.user_repos.repo_id\n" - query += f"\t\t WHERE augur_operations.user_repos.group_id = {group_id}\n" + query += "\t\t WHERE augur_operations.user_repos.group_id = :group_id \n" + + query_args["group_id"] = group_id # implement sorting by query_key search = kwargs.get("search") @@ -264,21 +271,41 @@ def generate_repo_query(self, source, count, **kwargs): # It is only included because it is required by the SQL syntax if isinstance(qkey, list) and len(qkey) > 0: - query += f"\tWHERE {qkey.pop(0)} ilike '%{search}%'\n" - for key in qkey: - query += f"OR {key} ilike '%{search}%'\n" + query += f"\tWHERE :qkey_where ilike :search\n" + query_args["qkey_where"] = qkey.pop(0) + + for i, key in enumerate(qkey): + param_name = f"qkey_or_{i}" + query += f"OR :{param_name} ilike :search\n" + query_args[param_name] = key else: - query += f"\tWHERE {qkey} ilike '%{search}%'\n" + query += f"\tWHERE :qkey ilike :search\n" + query_args["qkey"] = qkey + + query_args["search"] = f'%{search}%' + if not count: order_by = 
kwargs.get("order_by") or "repo_id" - direction = kwargs.get("direction") or "ASC" page = kwargs.get("page") or 0 page_size = kwargs.get("page_size") or 25 + direction = kwargs.get("direction") or "ASC" + + if direction not in ["ASC", "DESC"]: + return None, None, {"status": "Invalid direction"} + + if order_by not in ["repo_id", "repo_name", "repo_owner", "commits_all_time", "issues_all_time"]: + return None, None, {"status": "Invalid order by"} + + offset = page*page_size query += f"\tORDER BY {order_by} {direction}\n" - query += f"\tLIMIT {page_size}\n" - query += f"\tOFFSET {page*page_size};\n" + query += "\tLIMIT :page_size\n" + query += "\tOFFSET :offset;\n" + + query_args["page_size"] = page_size + query_args["offset"] = offset + query_args["order_by"] = order_by - return query, {"status": "success"} + return query, query_args, {"status": "success"} diff --git a/conftest.py b/conftest.py index 4e9f4f8f8..218ba3195 100644 --- a/conftest.py +++ b/conftest.py @@ -29,8 +29,54 @@ def create_full_routes(routes): full_routes.append(route) return full_routes -@pytest.fixture -def database(): + +def create_connection(dbname='postgres'): + """ + Creates a connection to the postgres server specified in the database string and connects to the dbname specified. + Returns the connection and cursor objects. + """ + + db_string = get_database_string() + user, password, host, port, _ = parse_database_string(db_string) + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname=dbname + ) + conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + return conn, conn.cursor() + + +def create_database(conn, cursor, db_name, template=None): + """ + Creates a database with the name db_name. + If template is specified, the database will be created with the template specified. + """ + + if template: + cursor.execute(sql.SQL("CREATE DATABASE {} WITH TEMPLATE {};").format(sql.Identifier(db_name), sql.Identifier(template))) + else: + cursor.execute(sql.SQL("CREATE DATABASE {};").format(sql.Identifier(db_name))) + conn.commit() + +def drop_database(cursor, db_name): + """ + Drops the database with the name db_name. + """ + + # ensure connections are removed + cursor.execute(sql.SQL("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='{}';".format(db_name))) + # drop temporary database + cursor.execute(sql.SQL("DROP DATABASE {};").format(sql.Identifier(db_name))) + + +def generate_db_from_template(template_name): + """ + Generator function that creates a new database from the template specified. + Yields the engine object for the database created. 
+ """ db_string = get_database_string() @@ -55,14 +101,7 @@ def database(): # remove database_name and add test_db_name test_db_string = db_string[:db_string.rfind("/")+1] + test_db_name - # create the temporary database - cursor.execute(sql.SQL("CREATE DATABASE {};").format(sql.Identifier(test_db_name))) - - # Commit changes - conn.commit() - - # Install schema - execute_sql_file("tests/entire_db.sql", test_db_name, user, password, host, port) + create_database(conn, cursor, test_db_name, template_name) # create engine to connect to db engine = create_database_engine(test_db_string, poolclass=StaticPool) @@ -72,17 +111,116 @@ def database(): # dispose engine engine.dispose() + drop_database(cursor, test_db_name) + + # Close the cursor and the connection + cursor.close() + conn.close() + + +def generate_template_db(sql_file_path): + """ + Generator function that creates a new database and install the sql file specified + Yields the name of the database created. + """ + + db_string = get_database_string() + + user, password, host, port, _ = parse_database_string(db_string) + + # Connect to the default 'postgres' database + conn = psycopg2.connect( + host=host, + port=port, + user=user, + password=password, + dbname='postgres' + ) + + # Set the isolation level to AUTOCOMMIT because CREATE DATABASE + # cannot be executed in a transaction block + conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) + cursor = conn.cursor() + + + test_db_name = "test_db_template_" + uuid.uuid4().hex + create_database(conn, cursor, test_db_name) + + # Install schema + execute_sql_file(sql_file_path, test_db_name, user, password, host, port) + + # ensure connections are removed cursor.execute(sql.SQL("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname='{}';".format(test_db_name))) # drop temporary database - cursor.execute(sql.SQL("DROP DATABASE {};").format(sql.Identifier(test_db_name))) + + yield test_db_name + + drop_database(cursor, test_db_name) # Close the cursor and the connection cursor.close() conn.close() + +@pytest.fixture(scope='session') +def empty_db_template(): + """ + This fixture creates a template database with the entire schema installed. + Returns the name of the database created. + """ + + yield from generate_template_db("tests/entire_db.sql") + + +@pytest.fixture(scope='session') +def empty_db(empty_db_template): + """ + This fixture creates a database from the empty_db_template + """ + + yield from generate_db_from_template(empty_db_template) + + +# TODO: Add populated db template and populated db fixtures so this fixture is more useful +@pytest.fixture(scope='session') +def read_only_db(empty_db): + """ + This fixtture creates a read-only database from the populated_db_template. + Yields a read-only engine object for the populated_db. 
+ """ + + database_name = empty_db.url.database + test_username = "testuser" + test_password = "testpass" + schemas = ["public", "augur_data", "augur_operations"] + + # create read-only user + empty_db.execute(s.text(f"CREATE USER testuser WITH PASSWORD '{test_password}';")) + empty_db.execute(s.text(f"GRANT CONNECT ON DATABASE {database_name} TO {test_username};")) + for schema in schemas: + empty_db.execute(s.text(f"GRANT USAGE ON SCHEMA {schema} TO {test_username};")) + empty_db.execute(s.text(f"GRANT SELECT ON ALL TABLES IN SCHEMA {schema} TO {test_username};")) + + # create engine for read-only user + db_string = get_database_string() + _, _, host, port, _ = parse_database_string(db_string) + read_only_engine = s.create_engine(f'postgresql+psycopg2://{test_username}:{test_password}@{host}:{port}/{database_name}') + + yield read_only_engine + + read_only_engine.dispose() + + # remove read-only user + empty_db.execute(s.text(f'REVOKE CONNECT ON DATABASE {database_name} FROM {test_username};')) + for schema in schemas: + empty_db.execute(s.text(f'REVOKE USAGE ON SCHEMA {schema} FROM {test_username};')) + empty_db.execute(s.text(f'REVOKE SELECT ON ALL TABLES IN SCHEMA {schema} FROM {test_username};')) + empty_db.execute(s.text(f'DROP USER {test_username};')) + + @pytest.fixture def test_db_engine(): diff --git a/docs/new-install.md b/docs/new-install.md index 9f46f496e..1adc1c6d9 100644 --- a/docs/new-install.md +++ b/docs/new-install.md @@ -71,11 +71,22 @@ CREATE USER augur WITH ENCRYPTED PASSWORD 'password'; GRANT ALL PRIVILEGES ON DATABASE augur TO augur; ``` -Once you are successfully logged out, return to your user by exiting `psql`, then typing `exit` to exit the postgres user, and `exit` a SECOND time to exit the root user. +**If you're using PostgreSQL 15 or later**, default database permissions will prevent Augur's installer from configuring the database. Add one last line after the above to fix this: +```sql +GRANT ALL ON SCHEMA public TO augur; +``` + +After that, return to your user by exiting `psql` ``` postgres=# \quit ``` +Here we want to start an SSL connection to the `augur` database on port 5432: +```shell +psql -h localhost -U postgres -p 5432 +``` + +Now type `exit` to log off the postgres user, and `exit` a SECOND time to log off the root user. ```shell exit exit @@ -98,6 +109,11 @@ sudo rabbitmqctl set_permissions -p augur_vhost augur ".*" ".*" ".*" NOTE: it is important to have a static hostname when using rabbitmq as it uses hostname to communicate with nodes. +RabbitMQ's server can then be started from systemd: +```shell +sudo systemctl start rabbitmq-server +``` + If your setup of rabbitmq is successful your broker url should look like this: **broker_url = `amqp://augur:password123@localhost:5672/augur_vhost`** @@ -139,7 +155,7 @@ Where AugurB is the vhost. The management API at port 15672 will only exist if y ## Proxying Augur through Nginx Assumes nginx is installed. -Then you create a file for the server you want Augur to run under in the location of your `sites-enabled` directory for nginx (In this example, Augur is running on port 5038: (the long timeouts on the settings page is for when a user adds a large number of repos or orgs in a single session to prevent timeouts from nginx) +Then you create a file for the server you want Augur to run under in the location of your `sites-enabled` directory for nginx. 
In this example, Augur is running on port 5038: (the long timeouts on the settings page is for when a user adds a large number of repos or orgs in a single session to prevent timeouts from nginx) ``` server { @@ -234,9 +250,17 @@ Create a Python Virtual Environment `python3 -m venv ~/virtual-env-directory` Activate your Python Virtual Environment `source ~/virtual-env-directory/bin/activate` -From the root of the Augur Directory, type `make install` +From the root of the Augur Directory, type `make install`. You will be prompted to provide: -You will be prompted to provide your GitHub username and password, your GitLab username and password, and the postgresql database where you want to have the Augur Schema built. You will also be prompted to provide a directory where repositories will be clone into. +- "User" is the PSQL database user, which is `augur` if you followed instructions exactly +- "Password" is the above user's password +- "Host" is the domain used with nginx, e.g. `ai.chaoss.io` +- "Port" is 5432 unless you reconfigured something +- "Database" is the name of the Augur database, which is `augur` if you followed instructions exactly +- The GitHub token created earlier +- Then the username associated with it +- Then the same for GitLab +- and finally a directory to clone repositories to ## Post Installation of Augur @@ -324,6 +348,8 @@ To access command line options, use `augur --help`. To load repos from GitHub or Start a Flower Dashboard, which you can use to monitor progress, and report any failed processes as issues on the Augur GitHub site. The error rate for tasks is currently 0.04%, and most errors involve unhandled platform API timeouts. We continue to identify and add fixes to handle these errors through additional retries. Starting Flower: `(nohup celery -A augur.tasks.init.celery_app.celery_app flower --port=8400 --max-tasks=1000000 &)` NOTE: You can use any open port on your server, and access the dashboard in a browser with http://servername-or-ip:8400 in the example above (assuming you have access to that port, and its open on your network.) +If you're using a virtual machine within Windows and you get an error about missing AVX instructions, you should kill Hyper-V. Even if it doesn't *appear* to be active, it might still be affecting your VM. Follow [these instructions](https://stackoverflow.com/a/68214280) to disable Hyper-V, and afterward AVX should pass to the VM. + ## Starting your Augur Instance Start Augur: `(nohup augur backend start &)` diff --git a/docs/new-install.rst b/docs/new-install.rst index 86f6f0eeb..6b5a0ca9c 100644 --- a/docs/new-install.rst +++ b/docs/new-install.rst @@ -100,14 +100,29 @@ Then, from within the resulting postgresql shell: CREATE USER augur WITH ENCRYPTED PASSWORD 'password'; GRANT ALL PRIVILEGES ON DATABASE augur TO augur; -Once you are successfully logged out, return to your user by exiting -``psql``, then typing ``exit`` to exit the postgres user, and ``exit`` a -SECOND time to exit the root user. +**If you're using PostgreSQL 15 or later**, default database permissions will +prevent Augur's installer from configuring the database. Add one last line +after the above to fix this: + +.. code:: sql + + GRANT ALL ON SCHEMA public TO augur; + +After that, return to your user by exiting ``psql`` :: postgres=# \quit +Here we want to start an SSL connection to the ``augur`` database on port 5432: + +.. 
code:: shell + + psql -h localhost -U postgres -p 5432 + +Now type ``exit`` to log off the postgres user, and ``exit`` a SECOND time to +log off the root user. + .. code:: shell exit @@ -136,6 +151,12 @@ instance. You can accomplish this by running the below commands: NOTE: it is important to have a static hostname when using rabbitmq as it uses hostname to communicate with nodes. +RabbitMQ's server can then be started from systemd: + +.. code:: shell + + sudo systemctl start rabbitmq-server + If your setup of rabbitmq is successful your broker url should look like this: @@ -296,12 +317,20 @@ Create a Python Virtual Environment Activate your Python Virtual Environment ``source ~/virtual-env-directory/bin/activate`` -From the root of the Augur Directory, type ``make install`` - -You will be prompted to provide your GitHub username and password, your -GitLab username and password, and the postgresql database where you want -to have the Augur Schema built. You will also be prompted to provide a -directory where repositories will be clone into. +From the root of the Augur Directory, type ``make install``. You will be +prompted to provide: + +- "User" is the PSQL database user, which is ``augur`` if you followed + instructions exactly +- "Password" is the above user's password +- "Host" is the domain used with nginx, e.g. ``ai.chaoss.io`` +- "Port" is 5432 unless you reconfigured something +- "Database" is the name of the Augur database, which is ``augur`` if you + followed instructions exactly +- The GitHub token created earlier +- Then the username associated with it +- Then the same for GitLab +- and finally a directory to clone repositories to Post Installation of Augur -------------------------- @@ -439,6 +468,12 @@ NOTE: You can use any open port on your server, and access the dashboard in a browser with http://servername-or-ip:8400 in the example above (assuming you have access to that port, and its open on your network.) +If you're using a virtual machine within Windows and you get an error about +missing AVX instructions, you should kill Hyper-V. Even if it doesn't *appear* +to be active, it might still be affecting your VM. Follow +`these instructions `_ to disable +Hyper-V, and afterward AVX should pass to the VM. + Starting your Augur Instance ---------------------------- diff --git a/docs/source/development-guide/create-a-metric/metrics-steps.rst b/docs/source/development-guide/create-a-metric/metrics-steps.rst index c9225f072..fe0138871 100644 --- a/docs/source/development-guide/create-a-metric/metrics-steps.rst +++ b/docs/source/development-guide/create-a-metric/metrics-steps.rst @@ -79,4 +79,4 @@ If we look at the Augur Schema, we can see that effort and cost are contained in .. note:: - Augur uses https://github.com/boyter/scc to calculate information contained in the ``labor_value`` table, which is populated by the ``value_worker``. + Augur uses https://github.com/boyter/scc to calculate information contained in the ``labor_value`` table, which is populated by the ``value_worker`` tasks. diff --git a/docs/source/development-guide/logging.rst b/docs/source/development-guide/logging.rst index a37d1abf1..cdff38c10 100644 --- a/docs/source/development-guide/logging.rst +++ b/docs/source/development-guide/logging.rst @@ -3,8 +3,8 @@ Logging Augur's log output can be configured with some basic verbosity and log levels. If you are contributing to Augur, we recommend you set the ``debug`` flag in the ``Logging`` section of your config file to ``1``. 
This will
-turn the verbosity up, capture **all** logs of every level, and it will allow the workers to print their output to the screen
-if they are being run manually in a separate terminal (as is often the case when one is developing a worker).
+turn the verbosity up, capture **all** logs of every level, and it will allow the data collection tasks to print their output to the screen
+if they are being run manually in a separate terminal.
The verbosity and minimum log level can be controlled with the ``verbose`` (boolean flag) and ``log_level`` (one of ``DEBUG``, ``INFO``, ``WARNING``, ``ERROR``, or ``CRITICAL``) options respectively. There is also
diff --git a/docs/source/development-guide/workers/clustering_worker.rst b/docs/source/development-guide/workers/clustering_worker.rst
index 8ea363564..a89bff350 100644
--- a/docs/source/development-guide/workers/clustering_worker.rst
+++ b/docs/source/development-guide/workers/clustering_worker.rst
@@ -1,8 +1,8 @@
-Clustering Worker
+Clustering Task
==========================
-The worker analyzes the comments in issues and pull requests, and clusters the repositories based on contents of those messages.
-The worker also performs topic modeling using Latent Dirichlet allocation
+The task analyzes the comments in issues and pull requests, and clusters the repositories based on the contents of those messages.
+The task also performs topic modeling using Latent Dirichlet allocation.
Clustering of text documents
Clustering is a type of unsupervised machine learning technique that involves grouping together similar data points. In case of textual data, it involves grouping together semantically similar documents. The document is a collection of sentences. In our case, document represents the collection of comments across issues and pull requests across a particular repository. Since, clustering algorithm works with numerical features, we need to first convert documents into vector representation.
-Worker Implementation
---------------------
-The worker performs two tasks — clustering of the repositories represented as documents (collection of all messages from issues and pull requests within the repository) and topic modeling. If the pre-trained model doesn’t exist in the worker folder, the data from all the repository in the connected database are used to train the model. After the training, the following model files are dumped in the worker folder
+Implementation
+--------------
+The task performs two jobs: clustering of the repositories represented as documents (the collection of all messages from issues and pull requests within a repository) and topic modeling. If the pre-trained model doesn’t exist in the clustering task’s folder, the data from all the repositories in the connected database are used to train the model. After training, the following model files are dumped in the clustering task’s folder:
- vocabulary : the set of features obtained from TF-IDF vectorization on text data (required in prediction phase)
- kmeans_repo_messages : trained kmeans clustering model on tfidf features
@@ -25,28 +25,14 @@ In addition, the training phase populates the ‘topic words’ database table w
**Prediction**
-If the trained model exists in the worker directory, the prediction is made on the documents corresponding to the repositories in the repo groups specified in the configuration. The worker populates the following tables
+If the trained model exists in the task directory, the prediction is made on the documents corresponding to the repositories in the repo groups specified in the configuration. The task populates the following tables:
repo_topic : stores probability distribution over the topics for a particular repository
repo_cluster_messages : stores clustering label assigned to a repository
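+
+If you want to spot-check what the prediction phase wrote, you can query these two tables directly. This is only a minimal sketch: the column names used below (``cluster_content``, ``topic_id``, ``topic_prob``) are assumptions for illustration, so confirm them against the schema on your own instance first.
+
+.. code-block:: sql
+
+   -- cluster label assigned to each repository (column names are illustrative)
+   SELECT repo_id, cluster_content
+   FROM augur_data.repo_cluster_messages;
+
+   -- per-repository probability distribution over topics (column names are illustrative)
+   SELECT repo_id, topic_id, topic_prob
+   FROM augur_data.repo_topic;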
-Worker Configuration
+Task Configuration
--------------------
-Like standard worker configuration, we need to define delay, given, model and repo_group_id in housekeeper configuration block.
-
-{
-
- "delay": 10000,
-
- "given":["git_url"],
-
- "model" : "clustering",
-
- "repo_group_id" : 60003
-
-}
-
-Further, in workers configuration block, we need to define port, switch and number of workers.
+For this task's configuration, in the workers configuration block, we need to define the port, switch, and number of workers.
.. code-block:: json
@@ -60,7 +46,7 @@ Further, in workers configuration block, we need to define port, switch and numb
"num_clusters" : 4
}
-Additional Worker Parameters in `augur.config.json`:
+Additional Worker Parameters:
------------------------------------------------------
In addition to standard worker parameters, clustering worker requires some worker-specific parameters which are described below:
diff --git a/docs/source/getting-started/collecting-data.rst b/docs/source/getting-started/collecting-data.rst
index ea7778427..399699973 100644
--- a/docs/source/getting-started/collecting-data.rst
+++ b/docs/source/getting-started/collecting-data.rst
@@ -1,7 +1,7 @@
Collecting data
===============
-Now that you’ve installed Augur’s application server, it’s time to configure your data collection workers. If you just want to run Augur using the one repository in the default database, and default worker settings, all you need to do is start the redis server in one terminal, make sure rabbitmq is running, and the augur application in the other terminal. (Don't forget that the AUGUR_DB environment variable needs to be set in the terminal, or set permanently)
+Now that you’ve installed Augur’s application server, it’s time to configure data collection if needed. If you just want to run Augur using the default repositories in the default database, and the default celery collection settings, all you need to do is start the redis server in one terminal, make sure rabbitmq is running, and start the augur application in another terminal. (Don't forget that the AUGUR_DB environment variable needs to be set in the terminal, or set permanently.)
.. code-block:: bash
@@ -10,13 +10,6 @@ Now that you’ve installed Augur’s application server, it’s time to configu
# Starts the redis server
redis-server
-.. code-block:: bash
-
- # Terminal Window 2
-
- # Start celery worker so it can accept tasks
- celery -A augur.tasks.init.celery_app.celery_app worker --loglevel=info
-
.. code-block:: bash
@@ -29,50 +22,42 @@ Now that you’ve installed Augur’s application server, it’s time to configu
augur backend stop
augur backend kill
-Now, here's a ton of brain-splitting detail about workers, and their configuration. There are 2 pieces to data collection with Augur: the housekeeper, and the data collection workers. The housekeeper creates long-running "jobs" that specify what kind of data to collect for what set of repositories. The data collection workers can then accept these jobs, after which they will use the information provided in the job to find the repositories in question and collect the requested data.
-
-Since the default housekeeper setup will work for most use cases, we'll first cover how to configure the workers and then briefly touch on the housekeeper configuration options, after which we'll cover how to add repos and repo groups to the database.
-
-Configuring the Workers
------------------------
-
-There are a few workers that ship ready to collect out of the box:
+Now, here's a ton of brain-splitting detail about celery collection. There are two pieces to data collection with Augur: the celery worker processes, and the job messages passed to them through rabbitmq. The jobs to collect are determined by a monitor process, which is started by the same CLI command that starts the rest of Augur. The monitor process generates the job messages sent to rabbitmq from the collection_status table, which records the jobs that have yet to be run. The celery collection workers then accept these jobs and use the information provided in each job to find the repositories in question and collect the requested data.
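+
+If you want to see what the monitor process has queued or already finished, you can look at the ``collection_status`` table directly. This is only a minimal sketch; the status column names used below (``core_status``, ``facade_status``) are assumptions for illustration, so check the table definition on your instance:
+
+.. code-block:: sql
+
+   -- count repositories by collection state (column names are illustrative)
+   SELECT core_status, facade_status, COUNT(*)
+   FROM augur_operations.collection_status
+   GROUP BY core_status, facade_status;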
-- ``facade_worker`` (collects raw commit and contributor data by parsing Git logs)
-- ``github_worker`` (collects issue data from the GitHub API)
-- ``contributor_worker`` (collects contributor data from the GitHub API)
-- ``pull_request_worker`` (collects pull request data from the GitHub API)
-- ``repo_info_worker`` (collects repository statistics from the GitHub API)
-- ``release_worker`` (collects release data from the GitHub API)
-- ``linux_badge_worker`` (collects `CII badging `_ data from the CII API)
-- ``insight_worker`` (queries Augur's metrics API to find interesting anomalies in the collected data)
+Since the default setup will work for most use cases, we'll first cover how to configure some specific data collection jobs and then briefly touch on the celery configuration options, after which we'll cover how to add repos and repo groups to the database.
-All worker configuration options are found in the ``Workers`` block of the ``augur.config.json`` file (which was generated for you at the end of the previous section). This file is located at ``$HOME/.augur/augur.config.json``. Each worker has its subsection with the same title as the worker's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your workers are properly configured.
+Configuring Collection
+----------------------
-Standard configuration options
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There are many collection jobs that ship ready to collect out of the box:
-Each worker has 3 configuration options that are standard across all workers. The worker-specific options are detailed in the sections following this one.
+- ``augur.tasks.git.facade_tasks`` (collects raw commit and contributor data by parsing Git logs)
+- ``augur.tasks.github`` (parent module of all github-specific collection jobs)
+- ``augur.tasks.github.contributors.tasks`` (collects contributor data from the GitHub API)
+- ``augur.tasks.github.pull_requests.tasks`` (collects pull request data from the GitHub API)
+- ``augur.tasks.github.repo_info.tasks`` (collects repository statistics from the GitHub API)
+- ``augur.tasks.github.releases.tasks`` (collects release data from the GitHub API)
+- ``augur.tasks.data_analysis.insight_worker.tasks`` (queries Augur's metrics API to find interesting anomalies in the collected data)
-The standard options are:
-
-- ``switch``, a boolean flag indicating if the worker should automatically be started with Augur. Defaults to ``0`` (false).
-- ``workers``, the number of instances of this worker that Augur should spawn if ``switch`` is set to ``1``. Defaults to ``1`` for all workers except the ``value_worker`` and ``insight_worker``.
-- ``port``, which is the base TCP port the worker will use the communicate with Augur's broker. The default is different for each worker, but the lowest is ``50100`` and each worker increments the default starting port by 100. If the ``workers`` parameter is > 1, then workers will bind to ``port`` + ``i`` for the ``i``'th worker spawned
-
-Keeping ``workers`` at 1 should be fine for small collection sets, but if you have a lot of repositories to collect data for, you can raise it. We also suggest double-checking that the default worker ports are free on your machine.
+All worker configuration options are found in the ``config`` table that is generated when Augur is installed. The ``config`` table is located in the ``augur_operations`` schema of your PostgreSQL database. Each configurable set of data collection jobs has its own subsection with the same or similar title as the task's name. We recommend leaving the defaults and only changing them when explicitly necessary, as the default parameters will work for most use cases. Read on for more on how to make sure your collection jobs are properly configured.
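+
+Because the configuration now lives in the database, you can inspect or change a value with plain SQL. The statements below are a minimal sketch; the column names (``section_name``, ``setting_name``, ``value``) are assumptions for illustration, so confirm them against the ``config`` table on your instance before relying on them:
+
+.. code-block:: sql
+
+   -- list the settings for one section (column names are illustrative)
+   SELECT section_name, setting_name, value
+   FROM augur_operations.config
+   WHERE section_name = 'Facade';
+
+   -- change a single setting, e.g. the clone directory used by the facade tasks
+   UPDATE augur_operations.config
+   SET value = '/home/augur/repos/'
+   WHERE section_name = 'Facade' AND setting_name = 'repo_directory';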
Worker-specific configuration options
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Next up are the configuration options specific to each worker (but some workers require no additional configuration beyond the defaults). The most pertinent of these options is the ``facade_worker's`` ``repo_directory``, so make sure to pay attention to that one.
+Next up are the configuration options specific to some collection tasks (but some tasks require no additional configuration beyond the defaults). The most pertinent of these options is the ``Facade`` section's ``repo_directory``, so make sure to pay attention to that one.
-``facade_worker``
::::::::::::::::::
+``Facade``
::::::::::::::::::
-- ``repo_directory``, which is the local directory where the ``facade_worker`` will clone the repositories it needs to analyze. You should have been prompted for this during installation, but if you need to change it, make sure that it's an absolute path (environment variables like ``$HOME`` are not supported) and that the directory already exists. Defaults to ``repos/``, but it's highly recommended you change this.
+- ``repo_directory``, which is the local directory where the facade tasks will clone the repositories they need to analyze. You should have been prompted for this during installation, but if you need to change it, make sure that it's an absolute path (environment variables like ``$HOME`` are not supported) and that the directory already exists. Defaults to ``repos/``, but it's highly recommended you change this.
+- ``limited_run``, toggle between 0 and 1 to determine whether to run all facade tasks or not. All tasks run when this is set to 0.
+- ``pull_repos``, toggle whether to pull updates from repos after cloning them. If turned off, updates to repos will not be collected.
+- ``run_analysis``, toggle whether to process commit data at all. If turned off, it will only clone repos and run tertiary tasks such as resolving contributors from any existing commits or collecting dependency relationships. Mainly used for testing.
+- ``run_facade_contributors``, toggle whether to run contributor resolution tasks. This will process and parse through commit data to link emails to contributors as well as their aliases.
+- ``force_invalidate_caches``, set every repo to reset the status of commit email affiliation, which is the organization that an email is associated with.
+- ``rebuild_caches``, toggle whether to enable parsing through commit data to determine affiliation and rebuild the web cache.
-``insight_worker``
+``Insight_Task``
::::::::::::::::::
We recommend leaving the defaults in place for the insight worker unless you are interested in other metrics, or anomalies for a different time period.
@@ -83,63 +68,42 @@ We recommend leaving the defaults in place for the insight worker unless you are
- ``contamination``, which is the "sensitivity" parameter for detecting anomalies. Acts as an estimated percentage of the training_days that are expected to be anomalous. The default is ``0.041`` for the default training days of 365: 4.1% of 365 days means that about 15 data points of the 365 days are expected to be anomalous.
-- ``metrics``, which specifies which metrics the ``insight_worker`` should run the anomaly detection algorithm on. This is structured like so::
- {
- 'endpoint_name_1': 'field_1_of_endpoint',
- 'endpoint_name_1': 'field_2_of_endpoint',
- 'endpoint_name_2': 'field_1_of_endpoint',
- ...
- }
-
- # defaults to the following
-
- {
- "issues-new": "issues",
- "code-changes": "commit_count",
- "code-changes-lines": "added",
- "reviews": "pull_requests",
- "contributors-new": "new_contributors"
- }
-
-``value_worker``
-::::::::::::::::::
+- ``switch``, toggles whether to run insight tasks at all.
+- ``workers``, number of worker processes to use for insight tasks.
-- ``scc_bin``, the command that the ``value_worker`` should use to invoke ``scc``. If installed with ``go get github.com/boyter/scc``, then the default of ``scc`` should probably work, but double check for your particular Go installation.
+``Task_Routine``
+::::::::::::::::::
+
+This section is for toggling sets of jobs on or off.
-Housekeeper
------------
+- ``prelim_phase``, toggles whether to run preliminary tasks that check to see whether repos are valid or not.
+- ``primary_repo_collect_phase``, toggle the standard collection jobs, mainly pull requests and issues +- ``secondary_repo_collect_phase``, toggle the secondary collection jobs, mainly jobs that take a while +- ``facade_phase``, toggle all facade jobs +- ``machine_learning_phase``, toggle all ml related jobs -**We strongly recommend leaving the default housekeeper blocks generated by the installation process, but if you would like to know more, or fine-tune them to your needs, read on.** +Celery Configuration +-------------------- -The housekeeper is responsible for generating the tasks that will tell the workers what data to collect, and how. Housekeeper configuration options are found in the ``Housekeeper`` block of the config file. The ``Housekeeper`` block has a single key, ``jobs``, which is an array of tasks the housekeeper should create. Each task has the following structure:: +**We strongly recommend leaving the default celery blocks generated by the installation process, but if you would like to know more, or fine-tune them to your needs, read on.** - { - "delay": , - "given": [ - "" - ], - "model": "", - "repo_group_id": , - ... //other task-specific parameters - } +The celery monitor is responsible for generating the tasks that will tell the other worker processes what data to collect, and how. The ``Celery`` block has 2 keys; one for memory cap and one for materialized views interval. +- ``worker_process_vmem_cap``, float between zero and one that determines the maximum percentage of total memory to use for worker processes -- The ``delay`` parameter is the amount of time the housekeeper should wait before scheduling a new update task. -- The ``given`` parameter is used in conjunction with the ``model`` parameter to determine which workers can accept a data collection task. Each worker can collect data if it is "given" data in a certain format, for example, a ``github_url`` (in the case of the ``github_worker`` and ``pull_request_worker``) or perhaps just any valid ``git_url`` (as in the case of the ``facade_worker``). -- The ``model`` parameter is the other parameter used to determine which workers can accept a given task. It represents the part of the conceptual data model that the worker can fulfill; for example, the ``facade_worker`` fills out the ``commits`` model since it primarly gathers data about commits, and the ``github_worker`` fills out both the ``issues`` and ``contributors`` model. -- The ``repo_group_id`` parameter specifies which group of repos the housekeeper should collect data for; use the default of ``0`` to specify ALL repo groups in the database. +- ``refresh_materialized_views_interval_in_days``, number of days to wait between refreshes of materialized views. Adding repos for collection ----------------------------- -If you're using the Docker container, you can use the `provided UI <../docker/usage.html>`_ to load your repositories. Otherwise, you'll need to use the `Augur CLI `_ to load your repositories. Please reference the respective sections of the documentation for detailed instructions on how to accomplish both of these steps. +If you're using the Docker container, you can use the `provided UI <../docker/usage.html>`_ to load your repositories. Otherwise, you'll need to use the `Augur CLI `_ or the augur frontend to load your repositories. Please reference the respective sections of the documentation for detailed instructions on how to accomplish both of these steps. Running collections -------------------- Congratulations! 
At this point you (hopefully) have a fully functioning and configured Augur instance.
-After you've loaded your repos, you're ready for your first collection run. We recommend running only the default workers first to gather the initial data. If you're collecting data for a lot of repositories, or repositories with a lot of data, we recommend increasing the number of ``github_workers`` and ``pull_request_workers``.
+After you've loaded your repos, you're ready for your first collection run. We recommend running only the default jobs first to gather the initial data.
You can now run Augur and start the data collection by issuing the ``augur backend start`` command in the root ``augur`` directory. All your logs (including worker logs and error files) will be saved to a ``logs/`` subdirectory in that same folder, but this can be customized - more on that and other logging utilities `in the development guide <../development-guide/logging.html>`_.
diff --git a/docs/source/quick-start.rst b/docs/source/quick-start.rst
index 88e7ac5d4..6ae6836cf 100644
--- a/docs/source/quick-start.rst
+++ b/docs/source/quick-start.rst
@@ -1,9 +1,6 @@
Quickstart
===============
-Augur Setup
------------
-
Ubuntu 22.x
===========
diff --git a/docs/source/schema/overview.rst b/docs/source/schema/overview.rst
index 718e25a87..32d8900f5 100644
--- a/docs/source/schema/overview.rst
+++ b/docs/source/schema/overview.rst
@@ -36,42 +36,43 @@ Augur Data
-------------------------------------------------------
The ``augur_data`` schema contains *most* of the information analyzed
-and constructed by Augur. The origin’s of the data inside of augur are:
+and constructed by Augur. The origins of the data inside of Augur are
+the data collection tasks, which populate this schema:
-1. ``workers/augur_github_worker``: Pulls data from the GitHub API.
-Presently this is focused on issues, including issue_comments,
-issue_events, issue_labels and contributors. Note that all messages are
-stored in Augur in the ``messages`` table. This is to facilitate easy
-analysis of the tone and characteristics of text communication in a
-project from one place.
+1. ``augur.tasks.github.*``: Tasks that pull data from the GitHub API.
+Primarily, pull requests and issues are collected before more complicated
+data. Note that all messages are stored in Augur in the ``messages`` table.
+This is to facilitate easy analysis of the tone and characteristics of text
+communication in a project from one place.
-2. ``workers/facade_worker``: Based on
+2. ``augur.tasks.git.facade_tasks``: Based on
http://www.github.com/brianwarner/facade, but substantially modified in
the fork located at http://github.com/sgoggins/facade. The modifications
include modularization of code, connections to Postgresql data instead
-of MySQL and other changes noted in the commit logs.
+of MySQL and other changes noted in the commit logs. Further modifications
+have been made to work with Augur and to integrate it seamlessly into
+data collection.
-3. ``workers/insight_worker``: Generates summarizations from raw data
+3. ``augur.tasks.data_analysis.insight_worker.tasks``: Generates summarizations from raw data
gathered from commits, issues, and other info.
-4. ``workers/linux_badge_worker``: Pulls data from the Linux Foundation’s
-badging program.
-
-5. ``workers/value_worker``: Populates the table
-``repo_labor`` using the “SCC” tool provided the
-https://github.com/boyter/scc project. “SCC” required Go to be installed on your system.
Visit `this resource `__ for instructions on Go installation. - -6. ``workers/pull_request_worker``: Collects Pull Request related data such as commits, contributors,assignees, etc. from the Github API and stores it in the Augur database. +4. ``augur.tasks.github.pull_requests.tasks``: Collects Pull Request related data such as commits, contributors,assignees, etc. from the Github API and stores it in the Augur database. Augur Operations ------------------------------------------------------- The ``augur_operations`` tables are where most of the operations tables -are going to exist. There are a few, like ``settings`` that remain in +exist. There are a few, like ``settings`` that remain in ``augur_data`` for now, but will be moved. They keep records related to analytical history and data provenance for data in the schema. They also store information including API keys. +Some key tables in this schema include: + +- ``config``, which contains the config options for the application. Key options include the facade repo_directory as well as primary api key. + +- ``collection_status``, contains the status of each aspect of data collection for each repo added to Augur. For example, it shows the status of the facade jobs for every repository. + SPDX ------------------------------------------------------- diff --git a/docs/source/schema/regularly_used_data.rst b/docs/source/schema/regularly_used_data.rst index 7e6f504b5..826b0c820 100644 --- a/docs/source/schema/regularly_used_data.rst +++ b/docs/source/schema/regularly_used_data.rst @@ -1,14 +1,14 @@ List of Regularly Used Data Tables In Augur =========================================== -**This is a list of data tables in augur that are regularly used and the various workers attached to them.** +**This is a list of data tables in augur that are regularly used and the various tasks attached to them.** Commits ------- This is where a record for every file in every commit in every repository in an Augur instance is kept. - * Worker: Facade worker collects, and also stores platform user information in the commits table. + * Task: Facade tasks collect, and also stores platform user information in the commits table. .. image:: images/commits.png :width: 200 @@ -30,7 +30,7 @@ Contributor_repo Storage of a snowball sample of all the repositories anyone in your schema has accessed on GitHub. So, for example, if you wanted to know all the repositories that people on your project contributed to, this would be the table. - * Contributor_breadth_worker populates this table + * contributor_breadth_model populates this table * Population of this table happens last, and can take a long time. .. image:: images/contributor_repo.png @@ -41,13 +41,13 @@ Contributors These are all the contributors to a project/repo. In Augur, all types of contributions create a contributor record. This includes issue comments, pull request comments, label addition, etc. This is different than how GitHub counts contributors; they only include committers. - * Workers Adding Contributors: + * Tasks Adding Contributors: - * Github Issue Worker - * Pull Request Worker - * GitLab Issue Worker - * GitLab Merge Request Worker - * Facade Worker + * Github Issue Tasks + * Pull Request Tasks + * GitLab Issue Tasks + * GitLab Merge Request Tasks + * Facade Tasks .. image:: images/contributors.png :width: 200 @@ -57,9 +57,9 @@ Contributors_aliases These are all the alternate emails that the same contributor might use. These records arise almost entirely from the commit log. 
For example, if I have two different emails on two different computers that I use when I make a commit, then an alias is created for whatever the 2nd to nth email Augur runs across. If a user’s email cannot be resolved, it is placed in the unresolved_commit_emails table. Coverage is greater than 98% since Augur 1.2.4. - * Worker: + * Tasks: - * Facade Worker + * Facade Tasks .. image:: images/contributors_aliases.png :width: 200 @@ -67,7 +67,7 @@ Contributors_aliases Discourse_insights ------------------ -There are nine specific discourse act types identified by the computational linguistic algorithm that underlies the discourse insights worker. This worker analyzes each comment on each issue or pull request sequentially so that context is applied when determining the discourse act type. These types are: +There are nine specific discourse act types identified by the computational linguistic algorithm that underlies the discourse insights task. This task analyzes each comment on each issue or pull request sequentially so that context is applied when determining the discourse act type. These types are: * negative-reaction * answer @@ -79,18 +79,18 @@ There are nine specific discourse act types identified by the computational ling * announcement * appreciation - * Worker: + * Tasks: - * Discourse Insights Worker + * Discourse Insights Task .. image:: images/discourse_insights.png :width: 200 issue_assignees || issue_events || issue_labels ---------------------------------------------- - * Worker: + * Task: - * Github or Gitlab Issues Worker + * Github or Gitlab Issues Task .. image:: images/issue_assignees.png :width: 200 @@ -100,9 +100,9 @@ issue_message_ref A link between the issue and each message stored in the message table. - * Worker: + * Task: - * Github or Gitlab Issues Worker + * Github or Gitlab Issues Task .. image:: images/issue_message_ref.png :width: 200 @@ -112,9 +112,9 @@ issues Is all the data related to a GitHub Issue. - * Worker: + * Task: - * Github or Gitlab Issues Worker + * Github or Gitlab Issues Task .. image:: images/issues.png :width: 200 @@ -132,9 +132,9 @@ Message_analysis Two factors evaluated for every pull request on issues message: What is the sentiment of the message (positive or negative), and what is the novelty of the message in the context of other messages in that repository. - * Worker: + * Task: - * Message Insights Worker + * Message Insights Task .. image:: images/message_analysis.png :width: 200 @@ -144,9 +144,9 @@ Message_analysis_summary A summary level representation of the granular data in message_analysis. - * Worker: + * Task: - * Message Insights Worker + * Message Insights Task .. image:: images/message_analysis_summary.png :width: 200 @@ -156,21 +156,15 @@ Platform Reference data with two rows: one for GitHub, one for GitLab. - * Worker: - * Platform_worker - - .. image:: images/platform.png - :width: 200 - Pull_request_analysis --------------------- - A representation of the probability of a pull request being merged into a repository, based on analysis of the properties of previously merged pull requests in a repository. (Machine learning worker) + A representation of the probability of a pull request being merged into a repository, based on analysis of the properties of previously merged pull requests in a repository. (Machine learning tasks) - * Worker: + * Task: - * Pull request analysis worker + * Pull request analysis task .. 
image:: images/pull_request_analysis.png :width: 200 @@ -228,9 +222,9 @@ Releases Github declared software releases or release tags. For example: https://github.com/chaoss/augur/releases - * Worker: + * Task: - * Release Worker. + * Release Task. .. image:: images/releases.png :width: 200 @@ -248,21 +242,15 @@ Repo_badging A list of CNCF badging information for a project. Reads this api endpoint: https://bestpractices.coreinfrastructure.org/projects.json - * Worker: - - * linux_badge_worker - - .. image:: images/repo_badging.png - :width: 200 Repo_cluster_messages --------------------- Identifying which messages and repositories are clustered together. Identifies project similarity based on communication patterns. - * Worker: + * Task: - * Clustering Worker + * Clustering task .. image:: images/repo_cluster_messages.png :width: 200 @@ -272,9 +260,9 @@ Repo_dependencies Enumerates every dependency, including dependencies that are not package managed. - * Worker: + * Task: - * deps_worker + * process_dependency_metrics .. image:: images/repo_dependencies.png :width: 200 @@ -282,15 +270,15 @@ Repo_dependencies Repo_deps_libyear ----------------- - (enumerates every package managed dependency) Looks up the latest release of any library that is imported into a project. Then it compares that release date, the release version of the library version in your project (and its release date), and calculates how old your version is, compared to the latest version. The resulting statistic is “libyear”. This worker runs at least once a month, so over time, you will see if your libraries are being kept up to date, or not. + (enumerates every package managed dependency) Looks up the latest release of any library that is imported into a project. Then it compares that release date, the release version of the library version in your project (and its release date), and calculates how old your version is, compared to the latest version. The resulting statistic is “libyear”. This task runs with the facade tasks, so over time, you will see if your libraries are being kept up to date, or not. * Scenarios: * If a library is updated, but you didn’t change your version, the libyear statistic gets larger * If you updated a library and it didn’t get older, the libyear statistic gets smaller. - * Worker: + * Task: - * deps_libyear_worker + * process_libyear_dependency_metrics .. image:: images/repo_deps_libyear.png :width: 200 @@ -300,9 +288,9 @@ Repo_deps_scorecard Runs the OSSF Scorecard over every repository ( https://github.com/ossf/scorecard ) : There are 16 factors that are explained at that repository location. - * Worker: + * Task: - * deps_worker + * process_ossf_scorecard_metrics .. image:: images/repo_deps_scorecard.png :width: 200 @@ -318,11 +306,11 @@ Repo_groups Repo_info --------- - This worker gathers metadata from the platform API that includes things like “number of stars”, “number of forks”, etc. AND it also gives us : Number of issues, number of pull requests, etc. .. THAT information we use to determine if we have collected all of the PRs and Issues associated with a repository. + This task gathers metadata from the platform API that includes things like “number of stars”, “number of forks”, etc. AND it also gives us : Number of issues, number of pull requests, etc. .. THAT information we use to determine if we have collected all of the PRs and Issues associated with a repository. - * Worker: + * Task: - * repo info worker + * repo info task .. 
image:: images/repo_info.png :width: 200 @@ -330,9 +318,9 @@ Repo_info Repo_insights ----------- - * Worker: + * Task: - * Insight worker + * Insight task .. image:: images/repo_insights.png :width: 200 @@ -340,22 +328,13 @@ Repo_insights Repo_insights_records ---------- - * Worker: + * Task: - * Insight worker + * Insight task .. image:: images/repo_insights_records.png :width: 200 -Repo_labor --------- - - * Worker: - - * Value worker - - .. image:: images/repo_labor.png - :width: 200 Repo_meta --------- @@ -386,9 +365,9 @@ Repo_topic Identifies probable topics of conversation in discussion threads around issues and pull requests. - * Worker: + * Task: - * Clustering Worker + * Clustering task .. image:: images/repo_topic.png :width: 200 @@ -398,9 +377,9 @@ Topic_words Unigrams, bigrams, and trigrams associated with topics in the repo_topic table. - * Worker: + * Task: - * Clustering Worker + * Clustering task .. image:: images/topic_words.png :width: 200 @@ -410,9 +389,9 @@ Unresolved_commit_emails Emails from commits that were not initially able to be resolved using automated mechanisms. - * Worker: + * Task: - * Facade Worker. + * Facade Tasks. .. image:: images/unresolved_commit_emails.png :width: 200 diff --git a/docs/source/schema/working_tables.rst b/docs/source/schema/working_tables.rst index d26bca88c..ffd3b2c08 100644 --- a/docs/source/schema/working_tables.rst +++ b/docs/source/schema/working_tables.rst @@ -1,20 +1,20 @@ List of Working Data Tables In Augur =================================== -**This Is A List of Working Tables In Augur and The Workers Attached to Them.** +**This Is A List of Working Tables In Augur and The Tasks Attached to Them.** They are in lowercase to represent exactly how they look like on the actual table. - * analysis_log - this table is a record of the analysis steps the facade worker has taken on an augur instance. A listing of all the analysis steps taken for every repository is recorded as they are completed. + * analysis_log - this table is a record of the analysis steps the facade tasks have taken on an augur instance. A listing of all the analysis steps taken for every repository is recorded as they are completed. - * Worker Associated With It? + * Tasks Associated With It? - * Facade Worker + * Facade Tasks .. image:: images/analysis_log.png :width: 200 - * commit_parents - this table keeps a record of parent commits that are squashed during Facade Worker execution. + * commit_parents - this table keeps a record of parent commits that are squashed during Facade collection. .. 
image:: images/commit_parents.png :width: 200
diff --git a/sendgridtest.py b/sendgridtest.py
new file mode 100644
index 000000000..0aea1e25c
--- /dev/null
+++ b/sendgridtest.py
@@ -0,0 +1,20 @@
+# using SendGrid's Python Library
+# https://github.com/sendgrid/sendgrid-python
+import os
+from sendgrid import SendGridAPIClient
+from sendgrid.helpers.mail import Mail
+
+message = Mail(
+    from_email='metrix@goggins.com',
+    to_emails='gogginss@missouri.edu',
+    subject='Sending with Twilio SendGrid is Fun',
+    html_content='and easy to do anywhere, even with Python')
+try:
+    sg = SendGridAPIClient(os.environ.get('SENDGRID_API_KEY'))
+    response = sg.send(message)
+    print(response.status_code)
+    print(response.body)
+    print(response.headers)
+except Exception as e:
+    # Python 3 exceptions have no .message attribute; print the exception itself
+    print(e)
diff --git a/setup.py b/setup.py
index bd7ef6d09..7e344e553 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,7 @@
    ],
    install_requires=[
        "wheel",
+        "sendgrid",
        "alembic==1.8.1", # 1.8.1
        "coloredlogs==15.0", # 15.0.1
        "Beaker==1.11.0", # 1.11.0
diff --git a/tests/test_applicaton/test_repo_load_controller/helper.py b/tests/test_applicaton/test_repo_load_controller/helper.py
index 11ac16640..29aa0dc9c 100644
--- a/tests/test_applicaton/test_repo_load_controller/helper.py
+++ b/tests/test_applicaton/test_repo_load_controller/helper.py
@@ -4,10 +4,9 @@
from augur.util.repo_load_controller import ORG_REPOS_ENDPOINT
from augur.application.db.session import DatabaseSession
-from augur.application.db.models import Config
+from augur.application.db.models import Config, User
from augur.tasks.github.util.github_paginator import hit_api
from augur.application.db.util import execute_session_query
-from werkzeug.security import generate_password_hash
logger = logging.getLogger(__name__)
@@ -105,7 +104,7 @@ def get_repo_group_insert_statement(rg_id):
def get_user_insert_statement(user_id, username="bil", email="default@gmail.com", password="pass"):
-    return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, generate_password_hash(password), email)
+    return """INSERT INTO "augur_operations"."users" ("user_id", "login_name", "login_hashword", "email", "first_name", "last_name", "admin") VALUES ({}, '{}', '{}', '{}', 'bill', 'bob', false);""".format(user_id, username, User.compute_hashsed_password(password), email)
def get_user_group_insert_statement(user_id, group_name, group_id=None):