Skip to content

Commit

Permalink
Merge pull request #268 from open-craft/nizar/overriding_existing_non…
Browse files Browse the repository at this point in the history
…_duplicate_transcripts

[SE-3520] Adds waffle flag to enable overriding existing transcripts on import
  • Loading branch information
DawoudSheraz authored Jan 4, 2021
2 parents 55a05e5 + 923097b commit 84cedb3
Show file tree
Hide file tree
Showing 14 changed files with 484 additions and 156 deletions.
116 changes: 70 additions & 46 deletions edxval/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from lxml.etree import Element, SubElement
from pysrt.srtexc import Error

from edxval.config.waffle import OVERRIDE_EXISTING_IMPORTED_TRANSCRIPTS
from edxval.exceptions import (
InvalidTranscriptFormat,
InvalidTranscriptProvider,
Expand All @@ -42,7 +43,13 @@
)
from edxval.serializers import TranscriptPreferenceSerializer, TranscriptSerializer, VideoSerializer
from edxval.transcript_utils import Transcript
from edxval.utils import THIRD_PARTY_TRANSCRIPTION_PLANS, TranscriptFormat, create_file_in_fs, get_transcript_format
from edxval.utils import (
THIRD_PARTY_TRANSCRIPTION_PLANS,
TranscriptFormat,
create_file_in_fs,
get_transcript_format,
is_duplicate_file,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -1155,54 +1162,71 @@ def import_transcript_from_fs(edx_video_id, language_code, file_name, provider,
static_dir (str): The Directory to retrieve transcript file.
"""
file_format = None
transcript_data = get_video_transcript_data(edx_video_id, language_code)
existing_transcript = VideoTranscript.get_or_none(edx_video_id, language_code)

# First check if transcript record does not exist.
if not transcript_data:
# Read file from import file system and attach it to transcript record in DS.
try:
with resource_fs.open(combine(static_dir, file_name), 'r', encoding='utf-8-sig') as f:
file_content = f.read()
except ResourceNotFound:
# Don't raise exception in case transcript file is not found in course OLX.
logger.warning(
'[edx-val] "%s" transcript "%s" for video "%s" is not found.',
language_code,
file_name,
edx_video_id
)
return
except UnicodeDecodeError:
# Don't raise exception in case transcript contains non-utf8 content.
logger.warning(
'[edx-val] "%s" transcript "%s" for video "%s" contains a non-utf8 file content.',
language_code,
file_name,
edx_video_id
)
return
# check if the transcript exists and if it does, make sure that overriding
# existing transcripts is enabled before proceeding to import it
if (existing_transcript and
not OVERRIDE_EXISTING_IMPORTED_TRANSCRIPTS.is_enabled()):
return

# Get file format from transcript content.
try:
file_format = get_transcript_format(file_content)
except Error:
# Don't raise exception, just don't create transcript record.
logger.warning(
'[edx-val] Error while getting transcript format for video=%s -- language_code=%s --file_name=%s',
edx_video_id,
language_code,
file_name
)
return

# Create transcript record.
create_video_transcript(
video_id=edx_video_id,
language_code=language_code,
file_format=file_format,
content=ContentFile(file_content.encode('utf-8')),
provider=provider
# Read file from import file system and attach it to transcript record in DS.
try:
with resource_fs.open(combine(static_dir, file_name), 'r', encoding='utf-8-sig') as f:
file_content = f.read()
except ResourceNotFound:
# Don't raise exception in case transcript file is not found in course OLX.
logger.warning(
'[edx-val] "%s" transcript "%s" for video "%s" is not found.',
language_code,
file_name,
edx_video_id
)
return
except UnicodeDecodeError:
# Don't raise exception in case transcript contains non-utf8 content.
logger.warning(
'[edx-val] "%s" transcript "%s" for video "%s" contains a non-utf8 file content.',
language_code,
file_name,
edx_video_id
)
return

# change file content to utf8
utf8_encoded_file_content = file_content.encode('utf-8')
new_transcript_content_file = ContentFile(utf8_encoded_file_content)

# check if transcript content already exists, and if it does, make sure
# the transcript isn't a duplicate transcript to the already existing one
if (existing_transcript and
is_duplicate_file(new_transcript_content_file, existing_transcript.transcript.file)):
return

# Get file format from transcript content.
try:
file_format = get_transcript_format(file_content)
except Error:
# Don't raise exception, just don't create transcript record.
logger.warning(
'[edx-val] Error while getting transcript format for video=%s -- language_code=%s --file_name=%s',
edx_video_id,
language_code,
file_name
)
return

# Create transcript record.
create_or_update_video_transcript(
video_id=edx_video_id,
language_code=language_code,
metadata={
'provider': provider,
'file_format': file_format,
'language_code': language_code,
},
file_data=new_transcript_content_file,
)


def create_transcript_objects(xml, edx_video_id, resource_fs, static_dir, external_transcripts):
Expand Down
Empty file added edxval/config/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions edxval/config/waffle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
This module contains configuration toggles, implemented as
waffle flags, for edX's video abstraction layer (VAL).
"""


from edx_toggles.toggles.__future__ import WaffleFlag

WAFFLE_NAMESPACE = 'edxval'


def waffle_name(toggle_name):
    """
    Return the namespaced waffle name for ``toggle_name``.

    The result is ``WAFFLE_NAMESPACE``, a dot, and then the toggle name, so
    ``'my_toggle'`` becomes ``'edxval.my_toggle'``.

    ``str.format`` is used instead of an f-string for backwards
    compatibility. Since the formatting lives in this single helper, it
    should be easy to change later on.

    Arguments:
        toggle_name (str): Toggle name without the namespace prefix.

    Returns:
        str: The toggle name prefixed with the waffle namespace.
    """
    return "{namespace}.{toggle_name}".format(
        namespace=WAFFLE_NAMESPACE,
        toggle_name=toggle_name,
    )


# .. toggle_name: OVERRIDE_EXISTING_IMPORTED_TRANSCRIPTS
# .. toggle_implementation: WaffleFlag
# .. toggle_default: False
# .. toggle_description: Enables overriding existing transcripts when importing courses with already
# existing transcripts. The transcripts are compared using content hashing, and if the transcript
# being imported isn't a duplicate, but different in content, it overrides the existing one.
# Otherwise, if the transcript is a duplicate, with same content, it doesn't get uploaded.
# .. toggle_use_cases: opt_in
# .. toggle_creation_date: 2021-01-01
# .. toggle_tickets: https://openedx.atlassian.net/browse/OSPR-5117
OVERRIDE_EXISTING_IMPORTED_TRANSCRIPTS = WaffleFlag(
waffle_name('override_existing_imported_transcripts'),  # -> "edxval.override_existing_imported_transcripts"
module_name=__name__,
)
52 changes: 32 additions & 20 deletions edxval/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,27 @@ def filename(self):

return file_name

def save_transcript(self, file_data, file_format, file_name=None):
    """
    Save transcript content and/or file name on this video transcript.

    Behaviour depends on which inputs are supplied:

    * ``file_data`` given: the content is written through
      ``self.transcript.save`` under ``file_name``, or under a generated
      ``<uuid hex>.<file_format>`` name when no ``file_name`` is given.
    * only ``file_name`` given: the transcript field is pointed at that
      name without writing any content.
    * neither given: the transcript file reference is left untouched, so
      a metadata-only update cannot replace a valid reference with a
      freshly generated name that has no file behind it.

    The model instance itself is saved in every case.

    Arguments:
        file_data: Transcript content as a file-like object accepted by
            ``self.transcript.save`` (e.g. an uploaded file), or a falsy
            value when there is no new content.
        file_format(unicode): Transcript file format, used as the
            extension of a generated file name.
        file_name(unicode): Optional transcript file name.
    """
    if file_data:
        # generate transcript file name if not already given
        if not file_name:
            file_name = '{uuid}.{ext}'.format(uuid=uuid4().hex, ext=file_format)

        # save the transcript file
        self.transcript.save(file_name, file_data)
    elif file_name:
        # no new content: only re-point the field at the given file name
        self.transcript.name = file_name

    # save the object
    self.save()

@classmethod
def get_or_none(cls, video_id, language_code):
"""
Expand Down Expand Up @@ -476,18 +497,16 @@ def create(cls, video, language_code, file_format, content, provider):
provider(unicode): Transcript provider.
"""
video_transcript = cls(video=video, language_code=language_code, file_format=file_format, provider=provider)
with closing(content) as transcript_content:
try:
file_name = '{uuid}.{ext}'.format(uuid=uuid4().hex, ext=video_transcript.file_format)
video_transcript.transcript.save(file_name, transcript_content)
video_transcript.save()
except Exception:
logger.exception(
'[VAL] Transcript save failed to storage for video_id "%s" language code "%s"',
video.edx_video_id,
language_code
)
raise

try:
video_transcript.save_transcript(content, file_format)
except Exception:
logger.exception(
'[VAL] Transcript save failed to storage for video_id "%s" language code "%s"',
video.edx_video_id,
language_code
)
raise

return video_transcript

Expand Down Expand Up @@ -519,14 +538,7 @@ def create_or_update(cls, video, language_code, metadata, file_data=None):
transcript_name = metadata.get('file_name')

try:
if transcript_name:
video_transcript.transcript.name = transcript_name
elif file_data:
with closing(file_data) as transcript_file_data:
file_name = '{uuid}.{ext}'.format(uuid=uuid4().hex, ext=video_transcript.file_format)
video_transcript.transcript.save(file_name, transcript_file_data)

video_transcript.save()
video_transcript.save_transcript(file_data, video_transcript.file_format, file_name=transcript_name)
except Exception:
logger.exception(
'[VAL] Transcript save failed to storage for video_id "%s" language code "%s"',
Expand Down
1 change: 1 addition & 0 deletions edxval/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@
# Third Party
'rest_framework',
'storages',
'waffle',

# Our App
'edxval',
Expand Down
Loading

0 comments on commit 84cedb3

Please sign in to comment.