-
Notifications
You must be signed in to change notification settings - Fork 71
/
Copy pathutf8.R
153 lines (138 loc) · 4.51 KB
/
utf8.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#' Whether cli is emitting UTF-8 characters
#'
#' The `cli.unicode` option takes precedence when it is set: UTF-8
#' characters are used if it is `TRUE`, and they are not used for any
#' other value, e.g. `FALSE`. When the option is not set, UTF-8 support
#' is detected with [base::l10n_info()], and UTF-8 characters are only
#' used if `is_latex_output()` is `FALSE`.
#'
#' @return Flag, whether cli uses UTF-8 characters.
#'
#' @export
is_utf8_output <- function() {
  unicode_opt <- getOption("cli.unicode")

  # No explicit user choice: fall back to locale detection.
  if (is.null(unicode_opt)) {
    return(l10n_info()$`UTF-8` && !is_latex_output())
  }

  # Anything other than a single `TRUE` counts as "off".
  isTRUE(unicode_opt)
}
#' Count the number of characters in a character vector
#'
#' By default it counts Unicode grapheme clusters, instead of code points.
#'
#' @param x Character vector, it is converted to UTF-8. Other inputs are
#'   first coerced with [base::as.character()].
#' @param type Whether to count graphemes (characters), code points,
#'   bytes, or calculate the display width of the string. `"chars"` is
#'   an alias of `"graphemes"`.
#' @return Numeric vector, the length of the strings in the character
#'   vector. For `"bytes"` and `"codepoints"`, `NA` elements of `x`
#'   yield `NA`.
#'
#' @family UTF-8 string manipulation
#' @export
#' @examples
#' # Grapheme example, emoji with combining characters. This is a single
#' # grapheme, consisting of five Unicode code points:
#' # * `\U0001f477` is the construction worker emoji
#' # * `\U0001f3fb` is emoji modifier that changes the skin color
#' # * `\u200d` is the zero width joiner
#' # * `\u2640` is the female sign
#' # * `\ufe0f` is variation selector 16, requesting an emoji style glyph
#' emo <- "\U0001f477\U0001f3fb\u200d\u2640\ufe0f"
#' cat(emo)
#'
#' utf8_nchar(emo, "chars") # = graphemes
#' utf8_nchar(emo, "bytes")
#' utf8_nchar(emo, "width")
#' utf8_nchar(emo, "codepoints")
#'
#' # For comparison, the output for width depends on the R version used:
#' nchar(emo, "chars")
#' nchar(emo, "bytes")
#' nchar(emo, "width")
utf8_nchar <- function(x, type = c("chars", "bytes", "width", "graphemes",
                                   "codepoints")) {
  type <- match.arg(type)

  # enc2utf8() rejects non-character input, so coerce first, the same way
  # utf8_substr() and utf8_graphemes() do.
  if (!is.character(x)) x <- as.character(x)
  x <- enc2utf8(x)

  switch(
    type,
    # "chars" falls through to the grapheme counter.
    chars = ,
    graphemes = .Call(clic_utf8_nchar_graphemes, x),
    width = .Call(clic_utf8_display_width, x),
    # After enc2utf8(), base R "chars" are Unicode code points.
    codepoints = base::nchar(
      x, allowNA = FALSE, keepNA = TRUE, type = "chars"
    ),
    bytes = base::nchar(
      x, allowNA = FALSE, keepNA = TRUE, type = "bytes"
    )
  )
}
#' Substring of a UTF-8 string
#'
#' This function uses grapheme clusters instead of Unicode code points in
#' UTF-8 strings.
#'
#' @param x Character vector. Other inputs are coerced with
#'   [base::as.character()]. It is converted to UTF-8.
#' @param start Starting index or indices, a numeric vector of length at
#'   least one. It is converted to integer and recycled to match the
#'   length of `x`.
#' @param stop Ending index or indices, a numeric vector of length at
#'   least one. It is converted to integer and recycled to match the
#'   length of `x`.
#' @return Character vector of the same length as `x`, containing
#'   the requested substrings.
#'
#' @family UTF-8 string manipulation
#' @export
#' @examples
#' # Five grapheme clusters, select the middle three
#' str <- paste0(
#'   "\U0001f477\U0001f3ff\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3ff",
#'   "\U0001f477\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3fb",
#'   "\U0001f477\U0001f3ff")
#' cat(str)
#' str24 <- utf8_substr(str, 2, 4)
#' cat(str24)
utf8_substr <- function(x, start, stop) {
  if (!is.character(x)) {
    x <- as.character(x)
  }

  # Both bounds must be numeric before they can be turned into indices.
  if (!(is.numeric(start) && is.numeric(stop))) {
    throw(cli_error(
      "{.arg start} and {.arg stop} must be numeric vectors",
      "i" = if (!is.numeric(start)) "{.arg start} is {.typeof {start}}",
      "i" = if (!is.numeric(stop)) "{.arg stop} is {.typeof {stop}}"
    ))
  }

  # Integer coercion warnings are deliberately silenced here.
  from <- suppressWarnings(as.integer(start))
  to <- suppressWarnings(as.integer(stop))

  if (length(from) == 0L || length(to) == 0L) {
    throw(cli_error(
      "{.arg start} and {.arg stop} must have at least length 1",
      "i" = if (length(from) == 0L) "{.arg start} has length 0",
      "i" = if (length(to) == 0L) "{.arg stop} has length 0"
    ))
  }

  x <- enc2utf8(x)

  # TODO: better recycling
  n <- length(x)
  .Call(clic_utf8_substr, x, rep_len(from, n), rep_len(to, n))
}
#' Break a UTF-8 character vector into grapheme clusters
#'
#' @param x Character vector. Other inputs are coerced with
#'   [base::as.character()]. It is converted to UTF-8.
#' @return List of character vectors, the grapheme clusters of the input
#'   string.
#'
#' @family UTF-8 string manipulation
#' @export
#' @examples
#' # Five grapheme clusters
#' str <- paste0(
#'   "\U0001f477\U0001f3ff\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3ff",
#'   "\U0001f477\u200d\u2640\ufe0f",
#'   "\U0001f477\U0001f3fb",
#'   "\U0001f477\U0001f3ff")
#' cat(str, "\n")
#' chrs <- utf8_graphemes(str)
utf8_graphemes <- function(x) {
  # Coerce to character if needed, then normalise the encoding to UTF-8
  # before handing the vector to the native routine.
  utf8 <- enc2utf8(if (is.character(x)) x else as.character(x))
  .Call(clic_utf8_graphemes, utf8)
}