rstudio · gadenbuie · Jan 28, 2022 · Jan 20, 2022 · Jan 20, 2022 · Jan 20, 2022
diff --git a/NEWS.md b/NEWS.md
@@ -50,6 +50,8 @@
 
 -   Users are now warned if their submission contains blanks they are expected to fill in. The default blank pattern is three or more underscores, e.g. `____`. The pattern for blanks can be set with the `exercise.blanks` chunk or tutorial option (@rossellhayes #547).
 
+-   Users who submit unparsable code containing non-ASCII characters are now presented with more informative feedback. Non-ASCII characters are a common source of code problems and often appear in code when students copy and paste text from a source that applies automatic Unicode formatting. If the submission contains Unicode-formatted quotation marks (e.g. curly quotes) or dashes, the student is given a suggested replacement with ASCII characters. In other cases, the student is simply prompted to delete the non-ASCII characters and retype them manually (@rossellhayes #642).
+
 -   Authors can choose to reveal (default) or hide the solution to an exercise. Set `exercise.reveal_solution` in the chunk options of a `*-solution` chunk to choose whether or not the solution is revealed to the user. The option can also be set globally with `tutorial_options()`. In a future version of learnr, the default will likely be changed to hide solutions (#402).
 
 -   Feedback messages can now be an `htmltools::tag()`, `htmltools::tagList()`, or a character message (#458) (#458)

diff --git a/R/exercise.R b/R/exercise.R
@@ -918,21 +918,107 @@ exercise_check_code_is_parsable <- function(exercise) {
     }
   }
 
-  exercise_result(
-    list(
-      message = HTML(
-        i18n_span(
-          "text.unparsable",
-          HTML(i18n_translations()$en$translation$text$unparsable)
-        )
-      ),
-      correct = FALSE,
-      location = "append",
-      type = "error"
-    ),
-    html_output = error_message_html(error$message),
-    error_message = error$message
+  default_message <- i18n_span(
+    "text.unparsable",
+    HTML(i18n_translations()$en$translation$text$unparsable)
   )
+  unicode_message <- exercise_check_unparsable_unicode(exercise, error$message)
+
+  feedback <- list(
+    message = HTML(unicode_message %||% default_message),
+    correct = FALSE,
+    location = "append",
+    type = "error"
+  )
+
+  exercise_result_error(error$message, feedback)
+}
+
+exercise_check_unparsable_unicode <- function(exercise, error_message) {
+  code <- exercise[["code"]]
+
+  # Early exit if code is made up of all ASCII characters ----------------------
+  if (!grepl("[^\\x00-\\x7F]", code, perl = TRUE)) {
+    return(NULL)
+  }
+
+  # Determine line with offending character based on error message -------------
+  line <- as.integer(str_replace(error_message, "<text>:(\\d+):.+", "\\1"))
+
+  # Check if code contains Unicode quotation marks -----------------------------
+  single_quote_pattern <- "[\u2018\u2019\u201A\u201B\u275B\u275C\uFF07]"
+  double_quote_pattern <- "[\u201C-\u201F\u275D\u275E\u301D-\u301F\uFF02]"
+  quote_pattern <- paste(single_quote_pattern, double_quote_pattern, sep = "|")
+
+  if (grepl(quote_pattern, code)) {
+    # Replace curly single quotes with straight single quotes and
+    #   all other Unicode quotes with straight double quotes
+    replacement_pattern <- c("'", '"')
+    names(replacement_pattern) <- c(single_quote_pattern, double_quote_pattern)
+
+    return(
+      unparsable_unicode_message("unparsablequotes", code, line, quote_pattern, replacement_pattern)
+    )
+  }
+
+  # Check if code contains Unicode dashes --------------------------------------
+  # Regex searches for all characters in Unicode's general category
+  #   "Dash_Punctuation", except for the ASCII hyphen-minus
+  #   (https://www.unicode.org/reports/tr44/#General_Category_Values)
+  dash_pattern <- paste0(
+    "[\u00af\u05be\u06d4\u1400\u1428\u1806\u1b78\u2010-\u2015\u203e\u2043",
+    "\u2212\u23af\u23e4\u2500\u2796\u2e3a\u2e3b\u30fc\ufe58\ufe63\uff0d]"
+  )
+
+  if (grepl(dash_pattern, code)) {
+    # Replace Unicode dashes with ASCII hyphen-minus
+    replacement_pattern <- "-"
+    names(replacement_pattern) <- dash_pattern
+
+    return(
+      unparsable_unicode_message("unparsableunicodesuggestion", code, line, dash_pattern, replacement_pattern)
+    )
+  }
+
+  # Check if code contains any other non-ASCII characters ----------------------
+  # Regex searches for any codepoints not in the ASCII range (00-7F)
+  non_ascii_pattern <- "[^\u01-\u7f]"
+  return(
+    unparsable_unicode_message("unparsableunicode", code, line, non_ascii_pattern)
+  )
+}
+
+unparsable_unicode_message <- function(i18n_key, code, line, pattern, replacement_pattern = NULL) {
+  code <- unlist(strsplit(code, "\n"))[[line]]
+
+  character <- str_extract(code, pattern)
+  highlighted_code <- exercise_highlight_unparsable_unicode(code, pattern, line)
+
+  suggestion <- NULL
+  if (!is.null(replacement_pattern)) {
+    suggestion <- html_code_block(str_replace_all(code, replacement_pattern))
+  }
+
+  i18n_div(
+    paste0("text.", i18n_key),
+    HTML(i18n_translations()$en$translation$text[[i18n_key]]),
+    opts = list(
+      character = character,
+      code = highlighted_code,
+      suggestion = suggestion,
+      interpolation = list(escapeValue = FALSE)
+    )
+  )
+}
+
+exercise_highlight_unparsable_unicode <- function(code, pattern, line) {
+  highlighted_code <- gsub(
+    pattern = paste0("(", pattern, ")"),
+    replacement = "<mark>\\1</mark>",
+    x = code
+  )
+
+  html_code_block(paste0(line, ": ", highlighted_code), escape = FALSE)
 }
 
 exercise_result_timeout <- function() {

diff --git a/R/i18n.R b/R/i18n.R
@@ -119,15 +119,23 @@ i18n_validate_customization <- function(lng) {
   lng
 }
 
-i18n_span <- function(key, ..., opts = NULL) {
+i18n_tag <- function(key, ..., opts = NULL, tag = htmltools::span) {
   if (!is.null(opts)) {
     opts <- jsonlite::toJSON(opts, auto_unbox = TRUE, pretty = FALSE)
   }
-  x <- htmltools::span(..., `data-i18n` = key, `data-i18n-opts` = opts)
+  x <- tag(..., `data-i18n` = key, `data-i18n-opts` = opts)
   # return an html character object instead of a shiny.tag
   htmltools::HTML(format(x))
 }
 
+i18n_span <- function(key, ..., opts = NULL) {
+  i18n_tag(key, ..., opts = opts, tag = htmltools::span)
+}
+
+i18n_div <- function(key, ..., opts = NULL) {
+  i18n_tag(key, ..., opts = opts, tag = htmltools::div)
+}
+
 i18n_combine_words <- function(
   words, and = c("and", "or"), before = "", after = before, oxford_comma = TRUE
 ) {

diff --git a/R/utils.R b/R/utils.R
@@ -92,9 +92,26 @@ str_replace <- function(x, pattern, replacement) {
   if (is.null(x)) return(NULL)
   sub(pattern, replacement, x)
 }
+str_replace_all <- function(x, pattern, replacement) {
+  if (is.null(x)) return(NULL)
+
+  if (!is.null(names(pattern))) {
+    for (i in seq_along(pattern)) {
+      x <- str_replace_all(x, names(pattern)[[i]], pattern[[i]])
+    }
+
+    return(x)
+  }
+
+  gsub(pattern, replacement, x)
+}
+
 str_remove <- function(x, pattern) {
   str_replace(x, pattern, "")
 }
+str_extract <- function(x, pattern, ...) {
+  unlist(regmatches(x, regexpr(pattern, x, ...)))
+}
 
 is_tags <- function(x) {
   inherits(x, "shiny.tag") ||
@@ -118,3 +135,11 @@ is_installed <- function(package, version = NULL) {
 timestamp_utc <- function() {
   strftime(Sys.time(), "%F %H:%M:%OS3 %Z", tz = "UTC")
 }
+
+html_code_block <- function(x, escape = TRUE) {
+  if (escape) {
+    x <- htmltools::htmlEscape(x)
+  }
+
+  sprintf("<pre><code>%s</code></pre>", x)
+}
diff --git a/data-raw/i18n_translations.yml b/data-raw/i18n_translations.yml
@@ -408,6 +408,64 @@ text:
       또는 <code>&quot;</code>, <code>'</code>, <code>(</code>
       , <code>{</code>로 시작하는 구문을 닫는 <code>&quot;</code>, <code>'</code>,
       <code>)</code>, <code>}</code>을 잊었을 수도 있습니다.
+  unparsablequotes:
+    en: >
+      <p>It looks like your R code contains specially formatted quotation marks
+      or &quot;curly&quot; quotes (<code>{{character}}</code>)
+      around character strings, making your code invalid.
+      R requires character values to be contained in straight quotation
+      marks (<code>&quot;</code> or <code>'</code>).</p>
+      {{code}}
+      <p>Don't worry, this is a common source of errors when you copy code from
+      another app that applies its own formatting to text.
+      You can try replacing the code on that line with the following.
+      There may be other places that need to be fixed, too.</p>
+      {{suggestion}}
+    fr: ~
+    es: ~
+    pt: ~
+    tr: ~
+    emo: ~
+    eu: ~
+    de: ~
+    ko: ~
+  unparsableunicode:
+    en: >
+      <p>It looks like your R code contains an unexpected special character
+      (<code>{{character}}</code>) that makes your code invalid.</p>
+      {{code}}
+      <p>Sometimes your code may contain a special character that looks like a
+      regular character, especially if you copy and paste the code from
+      another app.
+      Try deleting the special character from your code and retyping
+      it manually.</p>
+    fr: ~
+    es: ~
+    pt: ~
+    tr: ~
+    emo: ~
+    eu: ~
+    de: ~
+    ko: ~
+  unparsableunicodesuggestion:
+    en: >
+      <p>It looks like your R code contains an unexpected special character
+      (<code>{{character}}</code>) that makes your code invalid.</p>
+      {{code}}
+      <p>Sometimes your code may contain a special character that looks like a
+      regular character, especially if you copy and paste the code from
+      another app.
+      You can try replacing the code on that line with the following.
+      There may be other places that need to be fixed, too.</p>
+      {{suggestion}}
+    fr: ~
+    es: ~
+    pt: ~
+    tr: ~
+    emo: ~
+    eu: ~
+    de: ~
+    ko: ~
   and:
     en: "and"
     fr: "et"

diff --git a/inst/internals/i18n_random_phrases.rds b/inst/internals/i18n_random_phrases.rds
diff --git a/inst/internals/i18n_translations.rds b/inst/internals/i18n_translations.rds
diff --git a/tests/testthat/test-exercise.R b/tests/testthat/test-exercise.R
@@ -1037,6 +1037,68 @@ test_that("Errors with global setup code result in an internal error", {
   expect_match(conditionMessage(res$feedback$error), "boom")
 })
 
+# Unparsable Unicode ------------------------------------------------------
+
+test_that("evaluate_exercise() returns message for unparsable non-ASCII code", {
+  # Curly double quotes
+  ex <- mock_exercise(
+    user_code = "str_detect(\u201ctest\u201d, \u201ct.+t\u201d)"
+  )
+  result <- evaluate_exercise(ex, new.env())
+  expect_equal(result$feedback, exercise_check_code_is_parsable(ex)$feedback)
+  expect_match(result$feedback$message, "text.unparsablequotes")
+  expect_match(
+    result$feedback$message,
+    i18n_translations()$en$translation$text$unparsablequotes,
+    fixed = TRUE
+  )
+
+  # Curly single quotes
+  ex <- mock_exercise(
+    user_code = "str_detect(\u2018test\u2019, \u2018t.+t\u2019)"
+  )
+  result <- evaluate_exercise(ex, new.env())
+  expect_equal(result$feedback, exercise_check_code_is_parsable(ex)$feedback)
+  expect_match(result$feedback$message, "text.unparsablequotes")
+  expect_match(
+    result$feedback$message,
+    i18n_translations()$en$translation$text$unparsablequotes,
+    fixed = TRUE
+  )
+
+  # En dash
+  ex     <- mock_exercise(user_code = "63 \u2013 21")
+  result <- evaluate_exercise(ex, new.env())
+  expect_equal(result$feedback, exercise_check_code_is_parsable(ex)$feedback)
+  expect_match(result$feedback$message, "text.unparsableunicodesuggestion")
+  expect_match(
+    result$feedback$message,
+    i18n_translations()$en$translation$text$unparsableunicodesuggestion,
+    fixed = TRUE
+  )
+
+  # Plus-minus sign
+  ex     <- mock_exercise(user_code = "63 \u00b1 21")
+  result <- evaluate_exercise(ex, new.env())
+  expect_equal(result$feedback, exercise_check_code_is_parsable(ex)$feedback)
+  expect_match(result$feedback$message, "text.unparsableunicode")
+  expect_match(
+    result$feedback$message,
+    i18n_translations()$en$translation$text$unparsableunicode,
+    fixed = TRUE
+  )
+})
+
+test_that("evaluate_exercise() does not return a message for parsable non-ASCII code", {
+  skip_on_os("windows")
+  # Greek variable name and interrobang in character string
+  ex <- mock_exercise(
+    user_code =
+      '\u03bc\u03b5\u03c4\u03b1\u03b2\u03bb\u03b7\u03c4\u03ae <- "What\u203d"'
+  )
+  result <- evaluate_exercise(ex, new.env())
+  expect_null(result$feedback)
+})
 
 # Timelimit ---------------------------------------------------------------