Merge pull request #178 from jimchamp/comment-digest-workflow

Comment digest workflow
jimchamp · Jan 17, 2024 · 50cf50f · 50cf50f
2 parents 3677dd2 + 9e76a7b
commit 50cf50f
Show file tree

Hide file tree

Showing 4 changed files with 334 additions and 0 deletions.
diff --git a/.github/workflows/new_comment_digest.yml b/.github/workflows/new_comment_digest.yml
@@ -0,0 +1,21 @@
+name: new_comment_digest
+on:
+  schedule:  # 08:30 UTC daily
+    - cron: '30 8 * * *'
+  workflow_dispatch:  # This job can also be run on-demand (is this needed?)
+permissions:
+  contents: read  # Is this needed?
+
+jobs:
+  new_comment_digest:
+    runs-on: ubuntu-latest
+    steps:
+      # Action refs must name an existing tag, branch, or SHA; the release tags are prefixed with "v"
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.x
+      - run: pip install requests
+      # Arguments: hours to look back, Slack channel, Slack token
+      - run: scripts/gh_scripts/issue_comment_bot.py 24 "$SLACK_CHANNEL" "$SLACK_TOKEN"
+        env:
+          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL_ABC_TEAM_PLUS }}
diff --git a/scripts/gh_scripts/README.md b/scripts/gh_scripts/README.md
@@ -0,0 +1,27 @@
+# GitHub Project Management Scripts
+
+This directory contains scripts that the Open Library team uses to interact with this GitHub repository.
+
+To quickly see a script's purpose and arguments, run the script with the `-h` or `--help` flag.
+
+## `issue_comment_bot.py`
+
+This script finds open issues whose most recent comment was left by a non-staff contributor within the given number of hours, then posts a digest of those comments to the team in our Slack channel.
+
+### Usage:
+This script has three positional arguments:
+```
+  hours        Fetch issues that have been updated since this many hours ago
+  channel      Issues will be published to this Slack channel
+  slack-token  Slack authentication token
+```
+
+__Running the script locally:__
+```
+docker compose exec -e PYTHONPATH=. web bash
+
+# Publish digest of new comments from the past day to #openlibrary-g:
+./scripts/gh_scripts/issue_comment_bot.py 24 "#openlibrary-g" "replace-with-slack-token"
+```
+
+__Note:__ When adding arguments, be sure to place any hyphenated values within double quotes.
diff --git a/scripts/gh_scripts/issue_comment_bot.py b/scripts/gh_scripts/issue_comment_bot.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python
+"""
+Fetches Open Library GitHub issues that have been commented on
+within some amount of time, in hours.
+
+Writes links to each issue to given Slack channel.
+"""
+import argparse
+import errno
+import sys
+import time
+
+from datetime import datetime, timedelta
+from typing import Any
+
+import requests
+
+# Maps the name of each "Lead: @<username>" GitHub label to that lead's GitHub username.
+# `publish_digest` uses this to work out which lead to mention for an issue.
+lead_label_to_username = {
+    'Lead: @mekarpeles': 'mekarpeles',
+    'Lead: @cdrini': 'cdrini',
+    'Lead: @scottbarnes': 'scottbarnes',
+    'Lead: @seabelis': 'seabelis',
+    'Lead: @jimchamp': 'jimchamp',
+}
+
+# Maps GitHub username to the Slack mention markup (`<@...>`) for that person.
+# The keys double as the list of staff members: `filter_issues` leaves out any issue
+# whose last commenter appears in this mapping.
+# NOTE(review): the first two values look like handles rather than Slack member IDs --
+# confirm that they render as mentions.
+username_to_slack_id = {
+    'mekarpeles': '<@mek>',
+    'cdrini': '<@cdrini>',
+    'scottbarnes': '<@U03MNR6T7FH>',
+    'seabelis': '<@UAHQ39ACT>',
+    'jimchamp': '<@U01ARTHG9EV>',
+    'hornc': '<@U0EUS8DV0>',
+}
+
+
+def fetch_issues(updated_since: str):
+    """
+    Fetches all open GitHub issues that have been updated since the given date string and have at least one comment.
+
+    GitHub results are paginated.  This function follows the `next` link of every response until no further
+    page is advertised, and collects the issues of all pages in a single list.
+    To keep API calls to a minimum, we request the maximum number of results per request (100 per page, as of writing).
+
+    Important: Updated issues need not have a recent comment. Update events include many other things, such as adding a
+    label to an issue, or moving an issue to a milestone.  Issues returned by this function will require additional
+    processing in order to determine if they have recent comments.
+
+    `updated_since` is placed after `updated:>` in the search query.
+
+    Returns a list of issue dicts, taken from the `items` key of each search response.
+
+    Raises `requests.HTTPError` if GitHub answers any page with an error status.
+    """
+    query = f'repo:internetarchive/openlibrary is:open is:issue comments:>0 updated:>{updated_since}'
+    # Only the first request needs explicit parameters; each `next` link already carries them
+    params: dict[str, str | int] | None = {
+        'q': query,
+        'per_page': 100,
+    }
+    url = 'https://api.github.com/search/issues'
+    results = []
+
+    while url:
+        response = requests.get(url, params=params)
+        # Fail with a clear HTTP error instead of an opaque KeyError on 'items' below
+        response.raise_for_status()
+        results.extend(response.json()['items'])
+
+        # An absent `next` link means that this was the last page
+        url = response.links.get('next', {}).get('url', '')
+        params = None
+
+    return results
+
+
+def filter_issues(issues: list, since: datetime):
+    """
+    Returns list of issues that were not last responded to by staff.
+    Requires fetching the most recent comments for the given issues.
+
+    An issue is kept only if its last comment was created after `since` and the
+    commenter is not a key of `username_to_slack_id`.  `since` must be a timezone-naive
+    datetime, as it is compared with the comment's timestamp after the latter's
+    timezone info has been removed.
+
+    Each returned entry is a dict with the keys `comment_url`, `commenter`,
+    `issue_title`, and `lead_label`.
+
+    Raises `requests.HTTPError` if GitHub answers a comment request with an error status.
+    """
+    results = []
+
+    for i in issues:
+        # Fetch comments using URL from previous GitHub search results
+        comments_url = i.get('comments_url')
+        if not comments_url:
+            # Nothing to fetch for this entry
+            continue
+
+        resp = requests.get(
+            comments_url,
+            params={
+                'per_page': 100
+            }
+        )
+        resp.raise_for_status()
+
+        # Ensure that we have the last page of comments
+        last_url = resp.links.get('last', {}).get('url', '')
+        if last_url:
+            resp = requests.get(last_url)
+            resp.raise_for_status()
+
+        # Get last comment
+        comments = resp.json()
+        if not comments:
+            # No comments to inspect, so there is nothing to report for this issue
+            continue
+        last_comment = comments[-1]
+
+        # Determine if last comment meets our criteria for Slack notifications
+        # First step: Ensure that the last comment was left after the given `since` datetime
+        # A trailing "Z" is only understood by `fromisoformat` from Python 3.11 onwards,
+        # so it is rewritten as an explicit UTC offset
+        created = datetime.fromisoformat(last_comment['created_at'].replace('Z', '+00:00'))
+        # Removing timezone info to avoid TypeErrors, which occur when
+        # comparing a timezone-aware datetime with a timezone-naive datetime
+        created = created.replace(tzinfo=None)
+        if created <= since:
+            continue
+
+        # Next step: Determine if the last commenter is a staff member
+        last_commenter = last_comment['user']['login']
+        if last_commenter in username_to_slack_id:
+            continue
+
+        results.append({
+            'comment_url': last_comment['html_url'],
+            'commenter': last_commenter,
+            'issue_title': i['title'],
+            'lead_label': find_lead_label(i.get('labels', [])),
+        })
+
+    return results
+
+
+def find_lead_label(labels: list[dict[str, Any]]) -> str:
+    """
+    Looks through the given GitHub labels, in order, for one whose name begins with `Lead:`.
+
+    Returns the name of the first such label, or an empty string if none of the labels match.
+    """
+    # Lazily evaluated, so the search stops at the first match
+    lead_names = (entry['name'] for entry in labels if entry['name'].startswith('Lead:'))
+    return next(lead_names, '')
+
+
+def publish_digest(issues: list[dict[str, str]], slack_channel: str, slack_token: str, hours_passed: int):
+    """
+    Creates a threaded Slack message containing a digest of recently commented GitHub issues.
+
+    Parent Slack message will say how many comments were left, and the timeframe. Each reply
+    will include a link to the comment, as well as additional information.
+
+    Each entry of `issues` must have the keys `comment_url`, `issue_title`, `commenter`,
+    and `lead_label`.
+
+    Exits the process with `errno.ECOMM` if the parent message cannot be posted.  A reply
+    that cannot be posted is reported on stdout and skipped.
+    """
+    def post_message(text: str, thread_ts: str = ''):
+        """
+        Posts the given text to the Slack channel, as a reply if `thread_ts` is given.
+
+        Returns the response and the decoded response body (empty unless the status code is 200).
+        """
+        payload = {
+            'channel': slack_channel,
+            'text': text,
+        }
+        if thread_ts:
+            payload['thread_ts'] = thread_ts
+
+        response = requests.post(
+            'https://slack.com/api/chat.postMessage',
+            headers={
+                'Authorization': f"Bearer {slack_token}",
+                'Content-Type': 'application/json;  charset=utf-8',
+            },
+            json=payload,
+        )
+        body = response.json() if response.status_code == 200 else {}
+        return response, body
+
+    # Create the parent message
+    parent_thread_msg = f'{len(issues)} new GitHub comment(s) since {hours_passed} hour(s) ago'
+    response, d = post_message(parent_thread_msg)
+
+    # Slack reports most failures with status code 200 and `"ok": false`,
+    # so the status code alone does not tell us whether the message was posted
+    if not d.get('ok'):
+        # XXX : Log this
+        print(
+            f'Failed to send message to Slack.  Status code: {response.status_code}'
+            f'  Error: {d.get("error", "unknown")}'
+        )
+        # XXX : Add retry logic?
+        sys.exit(errno.ECOMM)
+
+    # Store timestamp, which, along with the channel, uniquely identifies the parent thread
+    ts = d.get('ts')
+
+    for i in issues:
+        # Slack rate limit is roughly 1 request per second
+        time.sleep(1)
+
+        comment_url = i['comment_url']
+        issue_title = i['issue_title']
+        commenter = i['commenter']
+        message = f'<{comment_url}|Latest comment for: *{issue_title}*>\n'
+
+        # Mention the lead if we know their Slack ID; otherwise fall back to the raw label
+        username = lead_label_to_username.get(i['lead_label'], '')
+        slack_id = username_to_slack_id.get(username, '')
+        if slack_id:
+            message += f'Lead: {slack_id}\n'
+        elif i['lead_label']:
+            message += f'{i["lead_label"]}\n'
+        else:
+            message += 'Lead: N/A\n'
+
+        message += f'Commenter: *{commenter}*'
+
+        response, reply = post_message(message, ts)
+        if not reply.get('ok'):
+            # XXX : Log this
+            print(f'Failed to POST slack message\n  Status code: {response.status_code}\n  Message: {message}')
+            # XXX : Retry logic?
+
+
+def time_since(hours):
+    """
+    Returns datetime and string representations of the current UTC time, minus the given number of hours.
+
+    The datetime is timezone-naive.  UTC is used because `filter_issues` compares this
+    value with GitHub comment timestamps after removing their timezone info, and those
+    timestamps are in UTC.  Using local time would shift the cutoff by the machine's UTC offset.
+
+    The string has the format `YYYY-MM-DDTHH:MM:SS`.
+    """
+    # Imported here because only `datetime` and `timedelta` are imported at module level
+    from datetime import timezone
+
+    now = datetime.now(timezone.utc).replace(tzinfo=None)
+    # XXX : Add a minute or two to the delta (to avoid dropping issues)?
+    since = now - timedelta(hours=hours)
+    return since, since.strftime('%Y-%m-%dT%H:%M:%S')
+
+
+def start_job(args: argparse.Namespace):
+    """
+    Runs the new comment digest job from start to finish.
+
+    Reads `hours`, `channel`, and `slack_token` from the given arguments, fetches the issues
+    updated within that many hours, narrows them down to the ones needing attention, and
+    publishes the digest to Slack if any remain.
+    """
+    cutoff, cutoff_string = time_since(args.hours)
+    needs_attention = filter_issues(fetch_issues(cutoff_string), cutoff)
+
+    # XXX : If we are only running this script daily, we can remove this condition to
+    # always post a message to Slack. If the digest is ever not published, we'll know
+    # that something is wrong with our script runner.
+    if not needs_attention:
+        # XXX : Log this
+        print('No issues needing attention found.')
+        return
+
+    publish_digest(needs_attention, args.channel, args.slack_token, args.hours)
+    # XXX : Log this
+    print('Digest posted to Slack.')
+
+
+def _get_parser() -> argparse.ArgumentParser:
+    """
+    Builds and returns the command-line parser for this script.
+
+    The parser takes three required positional arguments and defines no default values:
+    `hours` (parsed as an int), `channel`, and `slack_token` (displayed as `slack-token`
+    in the help output).
+    """
+    # Positional arguments, in the order that they must be given on the command line
+    positionals = (
+        ('hours', {
+            'help': 'Fetch issues that have been updated since this many hours ago',
+            'type': int,
+        }),
+        ('channel', {
+            'help': "Issues will be published to this Slack channel",
+            'type': str,
+        }),
+        ('slack_token', {
+            'metavar': 'slack-token',
+            'help': 'Slack auth token',
+            'type': str,
+        }),
+    )
+
+    cli = argparse.ArgumentParser(description=__doc__)
+    for dest, options in positionals:
+        cli.add_argument(dest, **options)
+
+    return cli
+
+
+if __name__ == '__main__':
+    # Parse the command-line arguments, then start the notification job with them
+    parser = _get_parser()
+    args = parser.parse_args()
+    start_job(args)
diff --git a/scripts/gh_scripts/requirements.txt b/scripts/gh_scripts/requirements.txt
@@ -0,0 +1 @@
+requests==2.31.0