Skip to content

Commit

Permalink
patch 9.0.1485: no functions for converting from/to UTF-16 index
Browse files Browse the repository at this point in the history
Problem:    no functions for converting from/to UTF-16 index.
Solution:   Add UTF-16 flag to existing functions and add strutf16len() and
            utf16idx(). (Yegappan Lakshmanan, closes #12216)
  • Loading branch information
chrisbra authored and brammool committed Apr 24, 2023
1 parent e1b4822 commit 67672ef
Show file tree
Hide file tree
Showing 8 changed files with 676 additions and 55 deletions.
98 changes: 88 additions & 10 deletions runtime/doc/builtin.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ bufnr([{buf} [, {create}]]) Number Number of the buffer {buf}
bufwinid({buf}) Number window ID of buffer {buf}
bufwinnr({buf}) Number window number of buffer {buf}
byte2line({byte}) Number line number at byte count {byte}
byteidx({expr}, {nr}) Number byte index of {nr}'th char in {expr}
byteidxcomp({expr}, {nr}) Number byte index of {nr}'th char in {expr}
byteidx({expr}, {nr} [, {utf16}])
Number byte index of {nr}'th char in {expr}
byteidxcomp({expr}, {nr} [, {utf16}])
Number byte index of {nr}'th char in {expr}
call({func}, {arglist} [, {dict}])
any call {func} with arguments {arglist}
ceil({expr}) Float round {expr} up
Expand Down Expand Up @@ -117,7 +119,7 @@ changenr() Number current change number
char2nr({expr} [, {utf8}]) Number ASCII/UTF-8 value of first char in {expr}
charclass({string}) Number character class of {string}
charcol({expr} [, {winid}]) Number column number of cursor or mark
charidx({string}, {idx} [, {countcc}])
charidx({string}, {idx} [, {countcc} [, {utf16}]])
Number char index of byte {idx} in {string}
chdir({dir}) String change current working directory
cindent({lnum}) Number C indent for line {lnum}
Expand Down Expand Up @@ -604,6 +606,8 @@ strptime({format}, {timestring})
strridx({haystack}, {needle} [, {start}])
Number last index of {needle} in {haystack}
strtrans({expr}) String translate string to make it printable
strutf16len({string} [, {countcc}])
Number number of UTF-16 code units in {string}
strwidth({expr}) Number display cell length of the String {expr}
submatch({nr} [, {list}]) String or List
specific match in ":s" or substitute()
Expand Down Expand Up @@ -704,6 +708,8 @@ undofile({name}) String undo file name for {name}
undotree() List undo file tree
uniq({list} [, {func} [, {dict}]])
List remove adjacent duplicates from a list
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
Number UTF-16 index of byte {idx} in {string}
values({dict}) List values in {dict}
virtcol({expr} [, {list}]) Number or List
screen column of cursor or mark
Expand Down Expand Up @@ -1363,7 +1369,7 @@ byte2line({byte}) *byte2line()*
< {not available when compiled without the |+byte_offset|
feature}

byteidx({expr}, {nr}) *byteidx()*
byteidx({expr}, {nr} [, {utf16}]) *byteidx()*
Return byte index of the {nr}'th character in the String
{expr}. Use zero for the first character, it then returns
zero.
Expand All @@ -1373,6 +1379,13 @@ byteidx({expr}, {nr}) *byteidx()*
length is added to the preceding base character. See
|byteidxcomp()| below for counting composing characters
separately.
When {utf16} is present and TRUE, {nr} is used as the UTF-16
index in the String {expr} instead of as the character index.
The UTF-16 index is the index in the string when it is encoded
with 16-bit words. If the specified UTF-16 index is in the
middle of a character (e.g. in a 4-byte character), then the
byte index of the first byte in the character is returned.
Refer to |string-offset-encoding| for more information.
Example: >
echo matchstr(str, ".", byteidx(str, 3))
< will display the fourth character. Another way to do the
Expand All @@ -1384,11 +1397,17 @@ byteidx({expr}, {nr}) *byteidx()*
If there are fewer than {nr} characters, -1 is returned.
If there are exactly {nr} characters the length of the string
in bytes is returned.

See |charidx()| and |utf16idx()| for getting the character and
UTF-16 index respectively from the byte index.
Examples: >
echo byteidx('a😊😊', 2) returns 5
echo byteidx('a😊😊', 2, 1) returns 1
echo byteidx('a😊😊', 3, 1) returns 5
<
Can also be used as a |method|: >
GetName()->byteidx(idx)
byteidxcomp({expr}, {nr}) *byteidxcomp()*
byteidxcomp({expr}, {nr} [, {utf16}]) *byteidxcomp()*
Like byteidx(), except that a composing character is counted
as a separate character. Example: >
let s = 'e' .. nr2char(0x301)
Expand Down Expand Up @@ -1493,27 +1512,36 @@ charcol({expr} [, {winid}]) *charcol()*
GetPos()->col()
<
*charidx()*
charidx({string}, {idx} [, {countcc}])
charidx({string}, {idx} [, {countcc} [, {utf16}]])
Return the character index of the byte at {idx} in {string}.
The index of the first character is zero.
If there are no multibyte characters the returned value is
equal to {idx}.

When {countcc} is omitted or |FALSE|, then composing characters
are not counted separately, their byte length is
added to the preceding base character.
are not counted separately, their byte length is added to the
preceding base character.
When {countcc} is |TRUE|, then composing characters are
counted as separate characters.

When {utf16} is present and TRUE, {idx} is used as the UTF-16
index in the String {string} instead of as the byte index.

Returns -1 if the arguments are invalid or if {idx} is greater
than the index of the last byte in {string}. An error is
given if the first argument is not a string, the second
argument is not a number or when the third argument is present
and is not zero or one.

See |byteidx()| and |byteidxcomp()| for getting the byte index
from the character index.
from the character index and |utf16idx()| for getting the
UTF-16 index from the character index.
Refer to |string-offset-encoding| for more information.
Examples: >
echo charidx('áb́ć', 3) returns 1
echo charidx('áb́ć', 6, 1) returns 4
echo charidx('áb́ć', 16) returns -1
echo charidx('a😊😊', 4, 0, 1) returns 2
<
Can also be used as a |method|: >
GetName()->charidx(idx)
Expand Down Expand Up @@ -9244,6 +9272,28 @@ strtrans({string}) *strtrans()*
Can also be used as a |method|: >
GetString()->strtrans()
strutf16len({string} [, {countcc}]) *strutf16len()*
The result is a Number, which is the number of UTF-16 code
units in String {string} (after converting it to UTF-16).

When {countcc} is TRUE, composing characters are counted
separately.
When {countcc} is omitted or FALSE, composing characters are
ignored.

Returns zero on error.

Also see |strlen()| and |strcharlen()|.
Examples: >
echo strutf16len('a') returns 1
echo strutf16len('©') returns 1
echo strutf16len('😊') returns 2
echo strutf16len('ą́') returns 1
echo strutf16len('ą́', v:true) returns 3
<
Can also be used as a |method|: >
GetText()->strutf16len()
<
strwidth({string}) *strwidth()*
The result is a Number, which is the number of display cells
String {string} occupies. A Tab character is counted as one
Expand Down Expand Up @@ -10059,6 +10109,34 @@ uniq({list} [, {func} [, {dict}]]) *uniq()* *E882*

Can also be used as a |method|: >
mylist->uniq()
<
*utf16idx()*
utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
Same as |charidx()| but returns the UTF-16 index of the byte
at {idx} in {string} (after converting it to UTF-16).

When {charidx} is present and TRUE, {idx} is used as the
character index in the String {string} instead of as the byte
index.
An {idx} in the middle of a UTF-8 sequence is rounded upwards
to the end of that sequence.

See |byteidx()| and |byteidxcomp()| for getting the byte index
from the UTF-16 index and |charidx()| for getting the
character index from the UTF-16 index.
Refer to |string-offset-encoding| for more information.
Examples: >
echo utf16idx('a😊😊', 3) returns 2
echo utf16idx('a😊😊', 7) returns 4
echo utf16idx('a😊😊', 1, 0, 1) returns 2
echo utf16idx('a😊😊', 2, 0, 1) returns 4
echo utf16idx('aą́c', 6) returns 2
echo utf16idx('aą́c', 6, 1) returns 4
echo utf16idx('a😊😊', 9) returns -1
<
Can also be used as a |method|: >
GetName()->utf16idx(idx)
values({dict}) *values()*
Return a |List| with all the values of {dict}. The |List| is
Expand Down
27 changes: 27 additions & 0 deletions runtime/doc/eval.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1580,6 +1580,33 @@ Examples: >
echo $"The square root of {{9}} is {sqrt(9)}"
< The square root of {9} is 3.0 ~

*string-offset-encoding*
A string consists of multiple characters. How the characters are stored
depends on 'encoding'. Most common is UTF-8, which uses one byte for ASCII
characters, two bytes for other Latin characters and more bytes for other
characters.

A string offset can count characters or bytes. Other programs may use
UTF-16 encoding (16-bit words) and an offset of UTF-16 words. Some functions
use byte offsets, usually for UTF-8 encoding. Other functions use character
offsets, in which case the encoding doesn't matter.

The different offsets for the string "a©😊" are below:

UTF-8 offsets:
[0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
UTF-16 offsets:
[0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
UTF-32 (character) offsets:
[0]: 00000061, [1]: 000000A9, [2]: 0001F60A

You can use the "g8" and "ga" commands on a character to see the
decimal/hex/octal values.

The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
between these indices. The functions |strlen()|, |strutf16len()| and
|strcharlen()| return the number of bytes, UTF-16 code units and characters in
a string respectively.

option *expr-option* *E112* *E113*
------
Expand Down
2 changes: 2 additions & 0 deletions runtime/doc/usr_41.txt
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,7 @@ String manipulation: *string-functions*
strlen() length of a string in bytes
strcharlen() length of a string in characters
strchars() number of characters in a string
strutf16len() number of UTF-16 code units in a string
strwidth() size of string when displayed
strdisplaywidth() size of string when displayed, deals with tabs
setcellwidths() set character cell width overrides
Expand All @@ -771,6 +772,7 @@ String manipulation: *string-functions*
byteidx() byte index of a character in a string
byteidxcomp() like byteidx() but count composing characters
charidx() character index of a byte in a string
utf16idx() UTF-16 index of a byte in a string
repeat() repeat a string multiple times
eval() evaluate a string expression
execute() execute an Ex command and get the output
Expand Down
10 changes: 7 additions & 3 deletions src/evalfunc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1751,9 +1751,9 @@ static funcentry_T global_functions[] =
ret_number, f_bufwinnr},
{"byte2line", 1, 1, FEARG_1, arg1_number,
ret_number, f_byte2line},
{"byteidx", 2, 2, FEARG_1, arg2_string_number,
{"byteidx", 2, 3, FEARG_1, arg3_string_number_bool,
ret_number, f_byteidx},
{"byteidxcomp", 2, 2, FEARG_1, arg2_string_number,
{"byteidxcomp", 2, 3, FEARG_1, arg3_string_number_bool,
ret_number, f_byteidxcomp},
{"call", 2, 3, FEARG_1, arg3_any_list_dict,
ret_any, f_call},
Expand Down Expand Up @@ -1803,7 +1803,7 @@ static funcentry_T global_functions[] =
ret_number, f_charclass},
{"charcol", 1, 2, FEARG_1, arg2_string_or_list_number,
ret_number, f_charcol},
{"charidx", 2, 3, FEARG_1, arg3_string_number_bool,
{"charidx", 2, 4, FEARG_1, arg3_string_number_bool,
ret_number, f_charidx},
{"chdir", 1, 1, FEARG_1, arg1_string,
ret_string, f_chdir},
Expand Down Expand Up @@ -2601,6 +2601,8 @@ static funcentry_T global_functions[] =
ret_number, f_strridx},
{"strtrans", 1, 1, FEARG_1, arg1_string,
ret_string, f_strtrans},
{"strutf16len", 1, 2, FEARG_1, arg2_string_bool,
ret_number, f_strutf16len},
{"strwidth", 1, 1, FEARG_1, arg1_string,
ret_number, f_strwidth},
{"submatch", 1, 2, FEARG_1, arg2_number_bool,
Expand Down Expand Up @@ -2785,6 +2787,8 @@ static funcentry_T global_functions[] =
ret_dict_any, f_undotree},
{"uniq", 1, 3, FEARG_1, arg13_sortuniq,
ret_first_arg, f_uniq},
{"utf16idx", 2, 4, FEARG_1, arg3_string_number_bool,
ret_number, f_utf16idx},
{"values", 1, 1, FEARG_1, arg1_dict_any,
ret_list_member, f_values},
{"virtcol", 1, 2, FEARG_1, arg2_string_or_list_bool,
Expand Down
2 changes: 2 additions & 0 deletions src/proto/strings.pro
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,14 @@ void f_string(typval_T *argvars, typval_T *rettv);
void f_strlen(typval_T *argvars, typval_T *rettv);
void f_strcharlen(typval_T *argvars, typval_T *rettv);
void f_strchars(typval_T *argvars, typval_T *rettv);
void f_strutf16len(typval_T *argvars, typval_T *rettv);
void f_strdisplaywidth(typval_T *argvars, typval_T *rettv);
void f_strwidth(typval_T *argvars, typval_T *rettv);
void f_strcharpart(typval_T *argvars, typval_T *rettv);
void f_strpart(typval_T *argvars, typval_T *rettv);
void f_strridx(typval_T *argvars, typval_T *rettv);
void f_strtrans(typval_T *argvars, typval_T *rettv);
void f_utf16idx(typval_T *argvars, typval_T *rettv);
void f_tolower(typval_T *argvars, typval_T *rettv);
void f_toupper(typval_T *argvars, typval_T *rettv);
void f_tr(typval_T *argvars, typval_T *rettv);
Expand Down
Loading

0 comments on commit 67672ef

Please sign in to comment.