openedx · rayzhou-bit · Oct 23, 2024 · Nov 8, 2024 · Nov 19, 2024 · Nov 20, 2024
diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
@@ -7,6 +7,8 @@
 import os
 import shutil
 import tarfile
+import re
+import requests
 from datetime import datetime
 from tempfile import NamedTemporaryFile, mkdtemp
 
@@ -53,8 +55,10 @@
     translation_language,
     delete_course
 )
+from cms.djangoapps.contentstore.xblock_storage_handlers.view_handlers import get_block_info
 from cms.djangoapps.models.settings.course_metadata import CourseMetadata
 from common.djangoapps.course_action_state.models import CourseRerunState
+from common.djangoapps.static_replace import replace_static_urls
 from common.djangoapps.student.auth import has_course_author_access
 from common.djangoapps.student.roles import CourseInstructorRole, CourseStaffRole, LibraryUserRole
 from common.djangoapps.util.monitoring import monitor_import_failure
@@ -1066,3 +1070,135 @@ def undo_all_library_source_blocks_ids_for_course(course_key_string, v1_to_v2_li
             store.update_item(draft_library_source_block, None)
     # return success
     return
+
+
+class CourseLinkCheckTask(UserTask):  # pylint: disable=abstract-method
+    """
+    UserTask base class for the task that checks a course for broken links.
+    """
+
+    @staticmethod
+    def calculate_total_steps(arguments_dict):
+        """
+        Return the number of in-progress steps in the link check process.
+
+        The task function moves through these states while it runs:
+        1. Scanning
+        2. Saving
+        """
+        return 2
+
+    @classmethod
+    def generate_name(cls, arguments_dict):
+        """
+        Build the name for one particular task instance.
+
+        Arguments:
+            arguments_dict (dict): The arguments given to the task function;
+                its 'course_key_string' entry is embedded in the name.
+
+        Returns:
+            str: The generated name
+        """
+        return f"Broken link check of {arguments_dict['course_key_string']}"
+
+
+@shared_task(base=CourseLinkCheckTask, bind=True)
+def check_broken_links(self, user_id, course_key_string, language):
+    """
+    Check a course for broken links and store the results as a task artifact.
+
+    The children of every vertical in the course are scanned for ``href`` and
+    ``src`` attribute values. Each URL found is requested, and the ones that do
+    not answer with HTTP 200 are saved, as a JSON list of
+    ``[<block_id>, <broken_link>]`` pairs, in a ``BrokenLinks`` UserTaskArtifact.
+
+    Arguments:
+        user_id (int): Primary key of the user who started the check.
+        course_key_string (str): Serialized key of the course to scan.
+        language (str): Language activated while reporting an unknown user.
+    """
+    def validate_user():
+        """Return the requesting user; fail the task and return None if the user does not exist."""
+        try:
+            return User.objects.get(pk=user_id)
+        except User.DoesNotExist:
+            with translation_language(language):
+                self.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id))
+            return None
+
+    def get_urls(content):
+        """Return the value of every href and src attribute found in content."""
+        return re.findall(r'\s+(?:href|src)=["\']([^"\']*)["\']', content)
+
+    def convert_to_standard_url(url, course_key):
+        """
+        Return an absolute url that can be requested for the given url.
+
+        Absolute http(s) urls are returned unchanged. Studio urls are prefixed
+        with the CMS base, after resolving /static/ paths and treating bare
+        values as container ids.
+        Example studio urls:
+          /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
+          /static/getting-started_x250.png
+          /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
+        """
+        if url.startswith(('http://', 'https://')):
+            # Already absolute: request it as-is instead of handing None to the verifier.
+            return url
+
+        cms_root = 'http://' + settings.CMS_BASE
+        if url.startswith('/static/'):
+            # replace_static_urls works on quoted text, so wrap the url and strip the quotes again.
+            processed_url = replace_static_urls(f'"{url}"', course_id=course_key)[1:-1]
+            return cms_root + processed_url
+        if url.startswith('/'):
+            return cms_root + url
+        return cms_root + '/container/' + url
+
+    def verify_url(url):
+        """Return True if requesting url answers with status 200, False otherwise."""
+        try:
+            response = requests.get(url, timeout=5)
+            return response.status_code == 200
+        except requests.exceptions.RequestException:
+            # Any connection/timeout/invalid-url problem counts as a broken link.
+            return False
+
+    def scan_course(course_key):
+        """
+        Scan the course and return a list of broken link pairs.
+          [<block_id>, <broken_link>]
+        """
+        broken_links = []
+        verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'})
+
+        for vertical in verticals:
+            for block in vertical.get_children():
+                usage_key = block.usage_key
+                block_data = get_block_info(block)['data']
+
+                for url in get_urls(block_data):
+                    if url == '#':
+                        # Skip this placeholder only; the block's remaining urls still need checking.
+                        continue
+                    standardized_url = convert_to_standard_url(url, course_key)
+                    if not verify_url(standardized_url):
+                        broken_links.append([str(usage_key), url])
+
+        return broken_links
+
+    if validate_user() is None:
+        # The task status has already been marked as failed.
+        return
+
+    self.status.set_state('Scanning')
+    courselike_key = CourseKey.from_string(course_key_string)
+
+    try:
+        data = scan_course(courselike_key)
+
+        self.status.set_state('Saving')
+        self.status.increment_completed_steps()
+
+        file_name = str(courselike_key)
+        # The context manager closes (and thereby removes) the temp file once the artifact is stored.
+        with NamedTemporaryFile(prefix=file_name + '.', suffix='.json') as links_file:
+            LOGGER.debug('json file being generated at %s', links_file.name)
+
+            with open(links_file.name, 'w') as file:
+                json.dump(data, file, indent=4)
+
+            artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
+            artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
+            artifact.save()
+
+    # catch all exceptions so we can record useful error messages
+    except Exception as exception:  # pylint: disable=broad-except
+        LOGGER.exception('Error checking links for course %s', courselike_key)
+        if self.status.state != UserTaskStatus.FAILED:
+            self.status.fail({'raw_error_msg': str(exception)})
diff --git a/cms/djangoapps/contentstore/views/__init__.py b/cms/djangoapps/contentstore/views/__init__.py
@@ -4,6 +4,7 @@
 from .checklists import *
 from .component import *
 from .course import *  # lint-amnesty, pylint: disable=redefined-builtin
+from .course_optimizer import *
 from .entrance_exam import *
 from .error import *
 from .export_git import *

diff --git a/cms/djangoapps/contentstore/views/course_optimizer.py b/cms/djangoapps/contentstore/views/course_optimizer.py
@@ -0,0 +1,212 @@
+"""
+These views handle all actions in Studio related to link checking of
+courses
+"""
+
+
+import json
+import logging
+import os
+from wsgiref.util import FileWrapper
+
+from django.conf import settings
+from django.contrib.auth.decorators import login_required
+from django.core.exceptions import PermissionDenied
+from django.db import transaction
+from django.http import Http404, HttpResponse, StreamingHttpResponse
+from django.utils.translation import gettext as _
+from django.views.decorators.csrf import ensure_csrf_cookie
+from django.views.decorators.http import require_GET, require_http_methods
+from opaque_keys.edx.keys import CourseKey
+from user_tasks.conf import settings as user_tasks_settings
+from user_tasks.models import UserTaskArtifact, UserTaskStatus
+
+from common.djangoapps.student.auth import has_course_author_access
+from common.djangoapps.util.json_request import JsonResponse
+from common.djangoapps.util.views import ensure_valid_course_key
+from cms.djangoapps.contentstore.xblock_storage_handlers.view_handlers import get_xblock
+from cms.djangoapps.contentstore.xblock_storage_handlers.xblock_helpers import usage_key_with_run
+from xmodule.modulestore.django import modulestore  # lint-amnesty, pylint: disable=wrong-import-order
+
+from ..tasks import CourseLinkCheckTask, check_broken_links
+from ..utils import reverse_course_url
+
+# Names exported by a star-import of this module.
+__all__ = [
+    'link_check_handler',
+    'link_check_status_handler',
+]
+
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+
+# Tuple containing zero or more filters for UserTaskStatus listing REST API calls.
+# Each entry is instantiated and applied to the status queryset in _latest_task_status.
+STATUS_FILTERS = user_tasks_settings.USER_TASKS_STATUS_FILTERS
+
+
+@transaction.non_atomic_requests
+@ensure_csrf_cookie
+@login_required
+@require_http_methods(['POST'])
+@ensure_valid_course_key
+def link_check_handler(request, course_key_string):
+    """
+    The restful handler for checking broken links in a course.
+
+    POST
+        Start a Celery task to check broken links in the course
+
+    The link check runs asynchronously; this view only queues the task and
+    answers with ``{'LinkCheckStatus': 1}``.
+
+    Raises:
+        PermissionDenied: if the requesting user lacks course author access.
+        Http404: if the modulestore has no course for the given key.
+    """
+    course_key = CourseKey.from_string(course_key_string)
+    if not has_course_author_access(request.user, course_key):
+        raise PermissionDenied()
+    # Refuse to queue a task for a course that does not exist.
+    if modulestore().get_course(course_key) is None:
+        raise Http404
+
+    check_broken_links.delay(request.user.id, course_key_string, request.LANGUAGE_CODE)
+    return JsonResponse({'LinkCheckStatus': 1})
+
+
+@transaction.non_atomic_requests
+@require_GET
+@ensure_csrf_cookie
+@login_required
+@ensure_valid_course_key
+def link_check_status_handler(request, course_key_string):
+    """
+    Returns a JSON response describing the status of the latest link check.
+
+    ``LinkCheckStatus`` is an integer:
+
+        -1 : Link check failed or was canceled before any step completed
+        -2 : Link check failed or was canceled after at least one step completed
+        0 : No status info found (task not yet created)
+        1 : Scanning
+        2 : Saving
+        3 : Success
+
+    ``LinkCheckOutput`` is added when the check succeeded and found broken links.
+    ``LinkCheckError`` is added when a failed task recorded an error artifact.
+
+    Raises:
+        PermissionDenied: if the requesting user lacks course author access.
+    """
+    course_key = CourseKey.from_string(course_key_string)
+    if not has_course_author_access(request.user, course_key):
+        raise PermissionDenied()
+
+    # The task status record is authoritative once it's been created
+    task_status = _latest_task_status(request, course_key_string, link_check_status_handler)
+    error = None
+    broken_links_dto = None
+    if task_status is None:
+        # The task hasn't been initialized yet; did we store info in the session already?
+        try:
+            session_status = request.session["link_check_status"]
+            status = session_status[course_key_string]
+        except KeyError:
+            status = 0
+    elif task_status.state == UserTaskStatus.SUCCEEDED:
+        status = 3
+        artifact = UserTaskArtifact.objects.get(status=task_status, name='BrokenLinks')
+        with artifact.file as file:
+            json_content = json.loads(file.read())
+            broken_links_dto = _create_dto(json_content, request.user)
+    elif task_status.state in (UserTaskStatus.FAILED, UserTaskStatus.CANCELED):
+        # Negate the stage that was in progress, clamped to the last in-progress stage.
+        status = max(-(task_status.completed_steps + 1), -2)
+        errors = UserTaskArtifact.objects.filter(status=task_status, name='Error')
+        if errors:
+            error = errors[0].text
+            try:
+                error = json.loads(error)
+            except ValueError:
+                # Wasn't JSON, just use the value as a string
+                pass
+    else:
+        # Still running: report the stage in progress, never beyond 'Saving'.
+        status = min(task_status.completed_steps + 1, 2)
+
+    response = {
+        "LinkCheckStatus": status,
+    }
+    if broken_links_dto:
+        response["LinkCheckOutput"] = broken_links_dto
+    if error:
+        response['LinkCheckError'] = error
+    return JsonResponse(response)
+
+
+def _latest_task_status(request, course_key_string, view_func=None):
+    """
+    Return the most recently created link check UserTaskStatus for the given
+    course key, after applying every configured status filter, or None when
+    no matching status exists.
+    """
+    task_name = CourseLinkCheckTask.generate_name({'course_key_string': course_key_string})
+    queryset = UserTaskStatus.objects.filter(name=task_name)
+    for filter_class in STATUS_FILTERS:
+        queryset = filter_class().filter_queryset(request, queryset, view_func)
+    return queryset.order_by('-created').first()
+
+
+def _create_dto(json_content, request_user):
+    """
+    Build the Data Transfer Object sent to the frontend from a list of broken links.
+
+    json_content is a list of pairs:
+        [block_id, link]
+
+    The returned DTO nests one dict per ancestor of each block, outermost
+    ancestor first, keyed by block id:
+    {
+        <block_id>: {
+            display_name,
+            <child block_id>: {
+                display_name,
+                ...
+                <broken block_id>: {
+                    display_name,
+                    url,
+                    broken_links: [],
+                }
+            }
+        }
+    }
+    """
+    dto = {}
+    for block_id, link in json_content:
+        block = get_xblock(usage_key_with_run(block_id), request_user)
+        _add_broken_link_description(dto, block, link)
+    return dto
+
+
+def _add_broken_link_description(result, block, link):
+    """
+    Record a broken link for the given block inside result, mutating result in place.
+
+    One nested dict is created (or reused) per ancestor of the block, from the
+    outermost ancestor down to the block itself, each keyed by block id and
+    holding a display_name. The block's own dict also gets its editor url and
+    the link appended to its broken_links list.
+
+    Note carried over from the original author: because the celery queue does
+    not have credentials, some broken links will need to be checked client side.
+    """
+    # Walk up the parent chain, keeping the outermost ancestor at the front.
+    lineage = []
+    node = block
+    while node:
+        lineage.insert(0, node)
+        node = node.get_parent()
+
+    level = result
+    for ancestor in lineage:
+        key = str(ancestor.location.block_id)
+        level = level.setdefault(key, {'display_name': ancestor.display_name})
+
+    level['url'] = f'/course/{block.course_id}/editor/{block.category}/{block.location}'
+    level.setdefault('broken_links', []).append(link)
diff --git a/cms/urls.py b/cms/urls.py
@@ -146,6 +146,10 @@
             name='export_output_handler'),
     re_path(fr'^export_status/{COURSELIKE_KEY_PATTERN}$', contentstore_views.export_status_handler,
             name='export_status_handler'),
+    re_path(fr'^link_check/{COURSELIKE_KEY_PATTERN}$', contentstore_views.link_check_handler,
+            name='link_check_handler'),
+    re_path(fr'^link_check_status/{COURSELIKE_KEY_PATTERN}$', contentstore_views.link_check_status_handler,
+            name='link_check_status_handler'),
     re_path(fr'^xblock/outline/{settings.USAGE_KEY_PATTERN}$', contentstore_views.xblock_outline_handler,
             name='xblock_outline_handler'),
     re_path(fr'^xblock/container/{settings.USAGE_KEY_PATTERN}$', contentstore_views.xblock_container_handler,