Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

2u/course optimizer #35887

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import os
import shutil
import tarfile
import re
import requests
from datetime import datetime
from tempfile import NamedTemporaryFile, mkdtemp

Expand Down Expand Up @@ -53,8 +55,10 @@
translation_language,
delete_course
)
from cms.djangoapps.contentstore.xblock_storage_handlers.view_handlers import get_block_info
from cms.djangoapps.models.settings.course_metadata import CourseMetadata
from common.djangoapps.course_action_state.models import CourseRerunState
from common.djangoapps.static_replace import replace_static_urls
from common.djangoapps.student.auth import has_course_author_access
from common.djangoapps.student.roles import CourseInstructorRole, CourseStaffRole, LibraryUserRole
from common.djangoapps.util.monitoring import monitor_import_failure
Expand Down Expand Up @@ -1066,3 +1070,135 @@ def undo_all_library_source_blocks_ids_for_course(course_key_string, v1_to_v2_li
store.update_item(draft_library_source_block, None)
# return success
return


class CourseLinkCheckTask(UserTask):  # pylint: disable=abstract-method
    """
    Common base for the Celery tasks that check a course for broken links.

    Supplies the step count and the task name that are recorded for each run.
    """

    @staticmethod
    def calculate_total_steps(arguments_dict):
        """
        Return how many in-progress steps the link check reports.

        The count is fixed at two regardless of ``arguments_dict``.
        NOTE(review): the task in this file sets the states 'Scanning' and
        'Saving'; confirm these are the two steps the UI is meant to show.
        """
        return 2

    @classmethod
    def generate_name(cls, arguments_dict):
        """
        Build the name recorded for one run of the task.

        Arguments:
            arguments_dict (dict): The arguments given to the task function;
                must contain ``'course_key_string'``.

        Returns:
            str: The generated name
        """
        return f"Broken link check of {arguments_dict['course_key_string']}"


@shared_task(base=CourseLinkCheckTask, bind=True)
def check_broken_links(self, user_id, course_key_string, language):
"""
Checks for broken links in a course. Store the results in a file.
"""
def validate_user():
    """
    Look up the user identified by ``user_id``.

    Returns the ``User`` on success. If no such user exists, the task status
    is marked as failed (with the message formatted under the requested
    ``language``) and ``None`` is returned.
    """
    try:
        found_user = User.objects.get(pk=user_id)
    except User.DoesNotExist:
        with translation_language(language):
            self.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id))
        return None
    return found_user

def get_urls(content):
    """
    Return every URL found in an ``href`` or ``src`` attribute of ``content``.

    The attribute name must be preceded by whitespace and the value must be
    quoted. The closing quote has to be the same character as the opening
    one, so a value such as ``"it's.html"`` is returned whole instead of being
    cut off at the apostrophe.

    Arguments:
        content (str): Markup to scan.

    Returns:
        list[str]: The attribute values, in order of appearance (empty
        values are included as ``''``).
    """
    # One capture group per quote style; exactly one of them is non-empty
    # for any given match (both are '' for an empty attribute value).
    regex = r'\s+(?:href|src)=(?:"([^"]*)"|\'([^\']*)\')'
    return [double_quoted or single_quoted for double_quoted, single_quoted in re.findall(regex, content)]

def convert_to_standard_url(url, course_key):
    """
    Return an absolute URL that can be requested for the given link.

    Absolute ``http://`` / ``https://`` URLs are returned unchanged.
    (Previously they fell through and ``None`` was returned, which made every
    external link look broken.) Studio-relative URLs are prefixed with
    ``http://`` + ``settings.CMS_BASE``.

    Example studio urls:
        /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
        /static/getting-started_x250.png
        /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7

    Arguments:
        url (str): The link as it appears in the block content.
        course_key: Course key passed to ``replace_static_urls`` as ``course_id``.

    Returns:
        str: The URL to request.
    """
    if url.startswith(('http://', 'https://')):
        return url

    studio_base = 'http://' + settings.CMS_BASE
    if url.startswith('/static/'):
        # The URL is wrapped in double quotes for replace_static_urls and the
        # first/last characters of the result are stripped off again --
        # presumably because that helper rewrites quoted URLs; TODO confirm.
        processed_url = replace_static_urls(f'"{url}"', course_id=course_key)[1:-1]
        return studio_base + processed_url
    if url.startswith('/'):
        return studio_base + url
    # Anything else is treated as a bare identifier under /container/.
    return studio_base + '/container/' + url

def verify_url(url):
    """
    Report whether a GET request to ``url`` answers with HTTP 200.

    The request uses a 5 second timeout. Any ``requests`` exception, or any
    other status code, yields ``False``.
    """
    try:
        status_code = requests.get(url, timeout=5).status_code
    except requests.exceptions.RequestException:
        return False
    return status_code == 200

def scan_course(course_key):
    """
    Scan the course and return its broken links.

    Every child of every vertical in the course is inspected: the URLs in
    the block's ``data`` are extracted, converted to absolute URLs and
    requested.

    Arguments:
        course_key: Key of the course to scan.

    Returns:
        list: One ``[<block_id>, <broken_link>]`` pair per broken link, where
        ``<block_id>`` is the stringified usage key and ``<broken_link>`` is
        the URL exactly as it appears in the block.
    """
    verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'})
    blocks = []
    for vertical in verticals:
        blocks.extend(vertical.get_children())

    broken_links = []
    for block in blocks:
        usage_key = block.usage_key
        block_data = get_block_info(block)['data']

        for url in get_urls(block_data):
            if url == '#':
                # Placeholder anchor: skip it, but keep checking the block's
                # remaining links (a `break` here would silently drop them).
                continue
            standardized_url = convert_to_standard_url(url, course_key)
            if not verify_url(standardized_url):
                broken_links.append([str(usage_key), url])

    return broken_links

user = validate_user()

self.status.set_state('Scanning')
courselike_key = CourseKey.from_string(course_key_string)
data = scan_course(courselike_key)

try:
self.status.set_state('Saving')
self.status.increment_completed_steps()

file_name = str(courselike_key)
links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug('json file being generated at %s', links_file.name)

with open(links_file.name, 'w') as file:
json.dump(data, file, indent=4)

artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
artifact.save()

# catch all exceptions so we can record useful error messages
except Exception as exception: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So... I see this is done this way in other tasks.
Is this how celery works? No actual error should be raised in the task? Because this swallows all errors.
I understand it does a logger exception. But I'm a bit confused because I don't know if this is actually the correct way it should be handled with celery, or whether the error should not be caught so it can cancel the running celery task. Need to look into it.

I don't exactly know how this logger is configured but this way I don't see datadog showing an exception, for example. Maybe if we do swallow this error, we should add some logic to make it show in datadog celery errors? Or do you think the logger exception somehow does this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
Out of scope. Create issue to look into this.
Quite important but better done separately.

if self.status.state != UserTaskStatus.FAILED:
self.status.fail({'raw_error_msg': str(exception)})
return
1 change: 1 addition & 0 deletions cms/djangoapps/contentstore/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .checklists import *
from .component import *
from .course import * # lint-amnesty, pylint: disable=redefined-builtin
from .course_optimizer import *
from .entrance_exam import *
from .error import *
from .export_git import *
Expand Down
212 changes: 212 additions & 0 deletions cms/djangoapps/contentstore/views/course_optimizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
"""
These views handle all actions in Studio related to link checking of
courses
"""


import json
import logging
import os
from wsgiref.util import FileWrapper

from django.conf import settings
from django.contrib.auth.decorators import login_required
from django.core.exceptions import PermissionDenied
from django.db import transaction
from django.http import Http404, HttpResponse, StreamingHttpResponse
from django.utils.translation import gettext as _
from django.views.decorators.csrf import ensure_csrf_cookie
from django.views.decorators.http import require_GET, require_http_methods
from opaque_keys.edx.keys import CourseKey
from user_tasks.conf import settings as user_tasks_settings
from user_tasks.models import UserTaskArtifact, UserTaskStatus

from common.djangoapps.student.auth import has_course_author_access
from common.djangoapps.util.json_request import JsonResponse
from common.djangoapps.util.views import ensure_valid_course_key
from cms.djangoapps.contentstore.xblock_storage_handlers.view_handlers import get_xblock
from cms.djangoapps.contentstore.xblock_storage_handlers.xblock_helpers import usage_key_with_run
from xmodule.modulestore.django import modulestore # lint-amnesty, pylint: disable=wrong-import-order

from ..tasks import CourseLinkCheckTask, check_broken_links
from ..utils import reverse_course_url

__all__ = [
'link_check_handler',
'link_check_status_handler',
]

log = logging.getLogger(__name__)

# Tuple containing zero or more filters for UserTaskStatus listing REST API calls.
# In this module each filter class is instantiated and its `filter_queryset` is applied
# to the UserTaskStatus queryset in `_latest_task_status`, so the filters decide which
# task statuses the requesting user gets to see.
# The value is read from the user_tasks settings (USER_TASKS_STATUS_FILTERS).
# NOTE(review): when the setting is not overridden, the package default reportedly lets
# superusers see everything and other users only the tasks they triggered themselves --
# confirm against user_tasks/filters.py in django-user-tasks.
STATUS_FILTERS = user_tasks_settings.USER_TASKS_STATUS_FILTERS
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A one line comment as to how these will be used would be helpful. The module filters user tasks on the basis of these, but I'm unfamiliar with this package's use. The one line saves me the trouble of having to go read up on the package if all I want is some notion of what it's used for.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment above "# Tuple containing zero or more filters for UserTaskStatus listing REST API calls." does not help my understanding at all. I understand we apply filters. What do we apply them for?

I went through code quite a bit to figure that out, which was not straightforward.

The result:
This is an adjustable django setting. It specifies general filters that are applied to all tasks before they are returned to the user. Currently this setting is not specified in edx-app settings, so it falls back to a default filter. It is quite hard to figure out where that default is defined (it lives in another repo called django-user-tasks).
This is the code place:
https://github.com/openedx/django-user-tasks/blob/20e4e6eb81f5e981d5bbdba390ffcf8e6c3e0d0a/user_tasks/filters.py#L9

So it will be quite hard for a user to understand what the filter actually does or what its purpose is.

The explaining comments for the default filter are:

    Default filter for UserTaskArtifact listings in the REST API.

    Ensures that superusers can see all artifacts, but other users
    can only see artifacts for tasks they personally triggered.

And then more specific:

Filter out any artifacts which the requesting user does not have permission to view.

On top of that, there are two sets of these filters - one is for task artifacts, one for statuses. They function the exact same way.

This is important to capture on edx-platform in some way. Maybe something like: "These filters can be overridden in django settings of edx-platform. If they are not, the default behavior is the following: ..."

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
Above the line - it's just a comment to add and it's super helpful I feel



@transaction.non_atomic_requests
@ensure_csrf_cookie
@login_required
@require_http_methods(('POST'))
@ensure_valid_course_key
def link_check_handler(request, course_key_string):
"""
The restful handler for checking broken links in a course.

POST
Start a Celery task to check broken links in the course

The Studio UI uses a POST request to start the link check asynchronously, with
a link appearing on the page once it's ready.
"""
course_key = CourseKey.from_string(course_key_string)
if not has_course_author_access(request.user, course_key):
raise PermissionDenied()
courselike_block = modulestore().get_course(course_key)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's a courselike_block? Naming isn't that clear to me

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
nit. Not important.

Copy link
Contributor

@bradenmacdonald bradenmacdonald Nov 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We prefer the term "learning context" over "courselike" now. But in this case, I doubt this works with libraries at all, so why not just call it course_root_block and be super clear?

if courselike_block is None:
raise Http404
context = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this context variable used anywhere? I may just be blind

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
Below the line. Nice to have but not important.

'context_course': courselike_block,
'courselike_home_url': reverse_course_url("course_handler", course_key),
}
context['status_url'] = reverse_course_url('link_check_status_handler', course_key)

# an _accept URL parameter will be preferred over HTTP_ACCEPT in the header.
requested_format = request.GET.get('_accept', request.META.get('HTTP_ACCEPT', 'text/html'))

check_broken_links.delay(request.user.id, course_key_string, request.LANGUAGE_CODE)
return JsonResponse({'LinkCheckStatus': 1})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some thoughts for readability:
I have the impression that 1 here seems like a magic number. If I look at the code later, it would be hard to figure out what it means, and it would be easy for me when editing the code to return something that is somehow wrong.

I'll need to look in another function to figure out what link check statuses there are according to a comment. And if someone changes the logic but forgets to update that comment, that can lead to bugs.

A pattern I like to use to avoid this problem is to define something similar to an enum.

It could be:

LINK_CHECK_STATUSES = {
    "IN_PROGRESS": 1,
    "SUCCESS": 3,
    ...
}

And then here and in other places you can just use it like
return JsonResponse({'LinkCheckStatus': LINK_CHECK_STATUSES["IN_PROGRESS"]})

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
Above the line, important
You're already overhauling link check statuses so when you've implemented that you can just resolve this discussion



@transaction.non_atomic_requests
@require_GET
@ensure_csrf_cookie
@login_required
@ensure_valid_course_key
def link_check_status_handler(request, course_key_string):
"""
Returns an integer corresponding to the status of a link check. These are:
rayzhou-bit marked this conversation as resolved.
Show resolved Hide resolved

-X : Link check unsuccessful due to some error with X as stage [0-3]
0 : No status info found (task not yet created)
1 : Scanning
2 : Saving
3 : Success

If the link check was successful, an output result is also returned.
"""
course_key = CourseKey.from_string(course_key_string)
if not has_course_author_access(request.user, course_key):
raise PermissionDenied()

# The task status record is authoritative once it's been created
task_status = _latest_task_status(request, course_key_string, link_check_status_handler)
json_content = None
test = None
response = None
error = None
broken_links_dto = None
if task_status is None:
# The task hasn't been initialized yet; did we store info in the session already?
try:
session_status = request.session["link_check_status"]
status = session_status[course_key_string]
except KeyError:
status = 0
elif task_status.state == UserTaskStatus.SUCCEEDED:
status = 3
artifact = UserTaskArtifact.objects.get(status=task_status, name='BrokenLinks')
with artifact.file as file:
content = file.read()
json_content = json.loads(content)
broken_links_dto = _create_dto(json_content, request.user)
elif task_status.state in (UserTaskStatus.FAILED, UserTaskStatus.CANCELED):
status = max(-(task_status.completed_steps + 1), -2)
Copy link
Contributor

@bszabo bszabo Nov 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using a max() function to compute status is highly suspicious! (Ditto the min function below). It seems like you're combining the number of completed steps with information as to whether the task failed or was canceled. Have you considered using independent variables or fields to capture these rather disparate concepts?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is code I copied over from import_export.py. I believe the api will return a negative number on a fail related to the step in the process.
I agree this looks bad... but I would want to update code in both places at the same time. Maybe in a separate PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Separate PR but part of the same task? General stewardship rule is "leave the code a little better than you found it"

Copy link
Member

@jesperhodge jesperhodge Nov 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
below the line. Not important.
You can just go with whatever works for you.

I see people often just copying over code from older places since it just works, and thus creating new not-great code. I don't generally think that's a good idea. In my opinion we can extract code that's reused into helper functions if the code is good, but if not, I'd prefer writing new code that is better. It also means the code author needs to think and understand the code a little bit more in-depth than when they copy it.

If you want to change the code but it's too much trouble to extract the function from the other place, I'd say just change it in the new place and then just extract the parts that are still the same

Copy link
Member

@jesperhodge jesperhodge Nov 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Focus:
Part of improving task statuses

errors = UserTaskArtifact.objects.filter(status=task_status, name='Error')
if errors:
error = errors[0].text
try:
error = json.loads(error)
except ValueError:
# Wasn't JSON, just use the value as a string
pass
else:
status = min(task_status.completed_steps + 1, 2)

response = {
"LinkCheckStatus": status,
}
if broken_links_dto:
response["LinkCheckOutput"] = broken_links_dto
if error:
response['LinkCheckError'] = error
return JsonResponse(response)


def _latest_task_status(request, course_key_string, view_func=None):
    """
    Return the newest link check ``UserTaskStatus`` for the given course key.

    Statuses are looked up by the name ``CourseLinkCheckTask`` generates for
    the course, passed through every filter in ``STATUS_FILTERS``, and the
    most recently created one is returned (``None`` if nothing is left).
    """
    task_name = CourseLinkCheckTask.generate_name({'course_key_string': course_key_string})
    queryset = UserTaskStatus.objects.filter(name=task_name)
    for filter_class in STATUS_FILTERS:
        queryset = filter_class().filter_queryset(request, queryset, view_func)
    return queryset.order_by('-created').first()


def _create_dto(json_content, request_user):
    """
    Build the Data Transfer Object the frontend receives from a list of broken links.

    json_content is a list of pairs:
        [block_id, link]

    Each block is loaded on behalf of ``request_user`` and merged into the
    result by ``_add_broken_link_description``.

    Returned DTO structure:
    {
        section: {
            display_name,
            subsection: {
                display_name,
                unit: {
                    display_name,
                    block: {
                        display_name,
                        url,
                        broken_links: [],
                    }
                }
            }
        }
    }
    """
    dto = {}
    for block_id, link in json_content:
        block = get_xblock(usage_key_with_run(block_id), request_user)
        _add_broken_link_description(dto, block, link)
    return dto


def _add_broken_link_description(result, block, link):
    """
    Record ``link`` as a broken link of ``block`` inside ``result``.

    ``result`` is updated in place. One nested dict is created (or reused)
    per ancestor of the block, outermost ancestor first, keyed by the
    ancestor's ``location.block_id`` and seeded with its ``display_name``.
    The innermost dict -- the block's own -- also gets the block's editor
    ``url`` and the link appended to its ``broken_links`` list.

    Note that because the celery queue does not have credentials, some broken links will
    need to be checked client side.
    """
    # Walk up from the block to the root, collecting the chain root-first.
    ancestors = []
    node = block
    while node:
        ancestors.insert(0, node)
        node = node.get_parent()

    # Descend through (or create) one dict level per ancestor.
    level = result
    for ancestor in ancestors:
        key = str(ancestor.location.block_id)
        level = level.setdefault(key, {'display_name': ancestor.display_name})

    level['url'] = f'/course/{block.course_id}/editor/{block.category}/{block.location}'
    level.setdefault('broken_links', []).append(link)
4 changes: 4 additions & 0 deletions cms/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@
name='export_output_handler'),
re_path(fr'^export_status/{COURSELIKE_KEY_PATTERN}$', contentstore_views.export_status_handler,
name='export_status_handler'),
re_path(fr'^link_check/{COURSELIKE_KEY_PATTERN}$', contentstore_views.link_check_handler,
name='link_check_handler'),
re_path(fr'^link_check_status/{COURSELIKE_KEY_PATTERN}$', contentstore_views.link_check_status_handler,
name='link_check_status_handler'),
re_path(fr'^xblock/outline/{settings.USAGE_KEY_PATTERN}$', contentstore_views.xblock_outline_handler,
name='xblock_outline_handler'),
re_path(fr'^xblock/container/{settings.USAGE_KEY_PATTERN}$', contentstore_views.xblock_container_handler,
Expand Down
Loading